diff options
author | Timothy Pearson <tpearson@raptorengineering.com> | 2017-08-23 14:45:25 -0500 |
---|---|---|
committer | Timothy Pearson <tpearson@raptorengineering.com> | 2017-08-23 14:45:25 -0500 |
commit | fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204 (patch) | |
tree | 22962a4387943edc841c72a4e636a068c66d58fd /fs/ext4 | |
download | ast2050-linux-kernel-fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204.zip ast2050-linux-kernel-fcbb27b0ec6dcbc5a5108cb8fb19eae64593d204.tar.gz |
Initial import of modified Linux 2.6.28 tree
Original upstream URL:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git | branch linux-2.6.28.y
Diffstat (limited to 'fs/ext4')
34 files changed, 29535 insertions, 0 deletions
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig new file mode 100644 index 0000000..7505482 --- /dev/null +++ b/fs/ext4/Kconfig @@ -0,0 +1,79 @@ +config EXT4_FS + tristate "The Extended 4 (ext4) filesystem" + select JBD2 + select CRC16 + help + This is the next generation of the ext3 filesystem. + + Unlike the change from ext2 filesystem to ext3 filesystem, + the on-disk format of ext4 is not forwards compatible with + ext3; it is based on extent maps and it supports 48-bit + physical block numbers. The ext4 filesystem also supports delayed + allocation, persistent preallocation, high resolution time stamps, + and a number of other features to improve performance and speed + up fsck time. For more information, please see the web pages at + http://ext4.wiki.kernel.org. + + The ext4 filesystem will support mounting an ext3 + filesystem; while there will be some performance gains from + the delayed allocation and inode table readahead, the best + performance gains will require enabling ext4 features in the + filesystem, or formating a new filesystem as an ext4 + filesystem initially. + + To compile this file system support as a module, choose M here. The + module will be called ext4. + + If unsure, say N. + +config EXT4DEV_COMPAT + bool "Enable ext4dev compatibility" + depends on EXT4_FS + help + Starting with 2.6.28, the name of the ext4 filesystem was + renamed from ext4dev to ext4. Unfortunately there are some + legacy userspace programs (such as klibc's fstype) have + "ext4dev" hardcoded. + + To enable backwards compatibility so that systems that are + still expecting to mount ext4 filesystems using ext4dev, + chose Y here. This feature will go away by 2.6.31, so + please arrange to get your userspace programs fixed! + +config EXT4_FS_XATTR + bool "Ext4 extended attributes" + depends on EXT4_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. + + You need this for POSIX ACL support on ext4. + +config EXT4_FS_POSIX_ACL + bool "Ext4 POSIX Access Control Lists" + depends on EXT4_FS_XATTR + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N + +config EXT4_FS_SECURITY + bool "Ext4 Security Labels" + depends on EXT4_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ext4 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile new file mode 100644 index 0000000..a8ff003 --- /dev/null +++ b/fs/ext4/Makefile @@ -0,0 +1,13 @@ +# +# Makefile for the linux ext4-filesystem routines. +# + +obj-$(CONFIG_EXT4_FS) += ext4.o + +ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ + ext4_jbd2.o migrate.o mballoc.o + +ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o +ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c new file mode 100644 index 0000000..694ed6f --- /dev/null +++ b/fs/ext4/acl.c @@ -0,0 +1,551 @@ +/* + * linux/fs/ext4/acl.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> + */ + +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/capability.h> +#include <linux/fs.h> +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" +#include "acl.h" + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl * +ext4_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(ext4_acl_header)) + return ERR_PTR(-EINVAL); + if (((ext4_acl_header *)value)->a_version != + cpu_to_le32(EXT4_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(ext4_acl_header); + count = ext4_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n = 0; n < count; n++) { + ext4_acl_entry *entry = + (ext4_acl_entry *)value; + if ((char *)value + sizeof(ext4_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + + switch (acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(ext4_acl_entry_short); + acl->a_entries[n].e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + value = (char *)value + sizeof(ext4_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_id = + le32_to_cpu(entry->e_id); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. + */ +static void * +ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + ext4_acl_header *ext_acl; + char *e; + size_t n; + + *size = ext4_acl_size(acl->a_count); + ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count * + sizeof(ext4_acl_entry), GFP_NOFS); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); + e = (char *)ext_acl + sizeof(ext4_acl_header); + for (n = 0; n < acl->a_count; n++) { + ext4_acl_entry *entry = (ext4_acl_entry *)e; + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + switch (acl->a_entries[n].e_tag) { + case ACL_USER: + case ACL_GROUP: + entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + e += sizeof(ext4_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(ext4_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +static inline struct posix_acl * +ext4_iget_acl(struct inode *inode, struct posix_acl **i_acl) +{ + struct posix_acl *acl = EXT4_ACL_NOT_CACHED; + + spin_lock(&inode->i_lock); + if (*i_acl != EXT4_ACL_NOT_CACHED) + acl = posix_acl_dup(*i_acl); + spin_unlock(&inode->i_lock); + + return acl; +} + +static inline void +ext4_iset_acl(struct inode *inode, struct posix_acl **i_acl, + struct posix_acl *acl) +{ + spin_lock(&inode->i_lock); + if (*i_acl != EXT4_ACL_NOT_CACHED) + posix_acl_release(*i_acl); + *i_acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); +} + +/* + * Inode operation get_posix_acl(). + * + * inode->i_mutex: don't care + */ +static struct posix_acl * +ext4_get_acl(struct inode *inode, int type) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + int name_index; + char *value = NULL; + struct posix_acl *acl; + int retval; + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return NULL; + + switch (type) { + case ACL_TYPE_ACCESS: + acl = ext4_iget_acl(inode, &ei->i_acl); + if (acl != EXT4_ACL_NOT_CACHED) + return acl; + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + + case ACL_TYPE_DEFAULT: + acl = ext4_iget_acl(inode, &ei->i_default_acl); + if (acl != EXT4_ACL_NOT_CACHED) + return acl; + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + + default: + return ERR_PTR(-EINVAL); + } + retval = ext4_xattr_get(inode, name_index, "", NULL, 0); + if (retval > 0) { + value = kmalloc(retval, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + retval = ext4_xattr_get(inode, name_index, "", value, retval); + } + if (retval > 0) + acl = ext4_acl_from_disk(value, retval); + else if (retval == -ENODATA || retval == -ENOSYS) + acl = NULL; + else + acl = ERR_PTR(retval); + kfree(value); + + if (!IS_ERR(acl)) { + switch (type) { + case ACL_TYPE_ACCESS: + ext4_iset_acl(inode, &ei->i_acl, acl); + break; + + case ACL_TYPE_DEFAULT: + ext4_iset_acl(inode, &ei->i_default_acl, acl); + break; + } + } + return acl; +} + +/* + * Set the access or default ACL of an inode. + * + * inode->i_mutex: down unless called from ext4_new_inode + */ +static int +ext4_set_acl(handle_t *handle, struct inode *inode, int type, + struct posix_acl *acl) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + int name_index; + void *value = NULL; + size_t size = 0; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + return error; + else { + inode->i_mode = mode; + ext4_mark_inode_dirty(handle, inode); + if (error == 0) + acl = NULL; + } + } + break; + + case ACL_TYPE_DEFAULT: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + + default: + return -EINVAL; + } + if (acl) { + value = ext4_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + error = ext4_xattr_set_handle(handle, inode, name_index, "", + value, size, 0); + + kfree(value); + if (!error) { + switch (type) { + case ACL_TYPE_ACCESS: + ext4_iset_acl(inode, &ei->i_acl, acl); + break; + + case ACL_TYPE_DEFAULT: + ext4_iset_acl(inode, &ei->i_default_acl, acl); + break; + } + } + return error; +} + +static int +ext4_check_acl(struct inode *inode, int mask) +{ + struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); + + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + + return -EAGAIN; +} + +int +ext4_permission(struct inode *inode, int mask) +{ + return generic_permission(inode, mask, ext4_check_acl); +} + +/* + * Initialize the ACLs of a new inode. Called from ext4_new_inode. + * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) + */ +int +ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int error = 0; + + if (!S_ISLNK(inode->i_mode)) { + if (test_opt(dir->i_sb, POSIX_ACL)) { + acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (!acl) + inode->i_mode &= ~current->fs->umask; + } + if (test_opt(inode->i_sb, POSIX_ACL) && acl) { + struct posix_acl *clone; + mode_t mode; + + if (S_ISDIR(inode->i_mode)) { + error = ext4_set_acl(handle, inode, + ACL_TYPE_DEFAULT, acl); + if (error) + goto cleanup; + } + clone = posix_acl_clone(acl, GFP_NOFS); + error = -ENOMEM; + if (!clone) + goto cleanup; + + mode = inode->i_mode; + error = posix_acl_create_masq(clone, &mode); + if (error >= 0) { + inode->i_mode = mode; + if (error > 0) { + /* This is an extended ACL */ + error = ext4_set_acl(handle, inode, + ACL_TYPE_ACCESS, clone); + } + } + posix_acl_release(clone); + } +cleanup: + posix_acl_release(acl); + return error; +} + +/* + * Does chmod for an inode that may have an Access Control List. The + * inode->i_mode field must be updated to the desired value by the caller + * before calling this function. + * Returns 0 on success, or a negative error number. + * + * We change the ACL rather than storing some ACL entries in the file + * mode permission bits (which would be more efficient), because that + * would break once additional permissions (like ACL_APPEND, ACL_DELETE + * for directories) are added. There are no more bits available in the + * file mode. + * + * inode->i_mutex: down + */ +int +ext4_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + if (!test_opt(inode->i_sb, POSIX_ACL)) + return 0; + acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + error = posix_acl_chmod_masq(clone, inode->i_mode); + if (!error) { + handle_t *handle; + int retries = 0; + + retry: + handle = ext4_journal_start(inode, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + ext4_std_error(inode->i_sb, error); + goto out; + } + error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, clone); + ext4_journal_stop(handle); + if (error == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + } +out: + posix_acl_release(clone); + return error; +} + +/* + * Extended attribute handlers + */ +static size_t +ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len, + const char *name, size_t name_len) +{ + const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return 0; + if (list && size <= list_len) + memcpy(list, POSIX_ACL_XATTR_ACCESS, size); + return size; +} + +static size_t +ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, + const char *name, size_t name_len) +{ + const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return 0; + if (list && size <= list_len) + memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); + return size; +} + +static int +ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) +{ + struct posix_acl *acl; + int error; + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return -EOPNOTSUPP; + + acl = ext4_get_acl(inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} + +static int +ext4_xattr_get_acl_access(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return ext4_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); +} + +static int +ext4_xattr_get_acl_default(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return ext4_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); +} + +static int +ext4_xattr_set_acl(struct inode *inode, int type, const void *value, + size_t size) +{ + handle_t *handle; + struct posix_acl *acl; + int error, retries = 0; + + if (!test_opt(inode->i_sb, POSIX_ACL)) + return -EOPNOTSUPP; + if (!is_owner_or_cap(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + error = posix_acl_valid(acl); + if (error) + goto release_and_out; + } + } else + acl = NULL; + +retry: + handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + error = ext4_set_acl(handle, inode, type, acl); + ext4_journal_stop(handle); + if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + +release_and_out: + posix_acl_release(acl); + return error; +} + +static int +ext4_xattr_set_acl_access(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return ext4_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); +} + +static int +ext4_xattr_set_acl_default(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return ext4_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); +} + +struct xattr_handler ext4_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .list = ext4_xattr_list_acl_access, + .get = ext4_xattr_get_acl_access, + .set = ext4_xattr_set_acl_access, +}; + +struct xattr_handler ext4_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .list = ext4_xattr_list_acl_default, + .get = ext4_xattr_get_acl_default, + .set = ext4_xattr_set_acl_default, +}; diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h new file mode 100644 index 0000000..cb45257 --- /dev/null +++ b/fs/ext4/acl.h @@ -0,0 +1,81 @@ +/* + File: fs/ext4/acl.h + + (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> +*/ + +#include <linux/posix_acl_xattr.h> + +#define EXT4_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} ext4_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} ext4_acl_entry_short; + +typedef struct { + __le32 a_version; +} ext4_acl_header; + +static inline size_t ext4_acl_size(int count) +{ + if (count <= 4) { + return sizeof(ext4_acl_header) + + count * sizeof(ext4_acl_entry_short); + } else { + return sizeof(ext4_acl_header) + + 4 * sizeof(ext4_acl_entry_short) + + (count - 4) * sizeof(ext4_acl_entry); + } +} + +static inline int ext4_acl_count(size_t size) +{ + ssize_t s; + size -= sizeof(ext4_acl_header); + s = size - 4 * sizeof(ext4_acl_entry_short); + if (s < 0) { + if (size % sizeof(ext4_acl_entry_short)) + return -1; + return size / sizeof(ext4_acl_entry_short); + } else { + if (s % sizeof(ext4_acl_entry)) + return -1; + return s / sizeof(ext4_acl_entry) + 4; + } +} + +#ifdef CONFIG_EXT4_FS_POSIX_ACL + +/* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl + if the ACL has not been cached */ +#define EXT4_ACL_NOT_CACHED ((void *)-1) + +/* acl.c */ +extern int ext4_permission(struct inode *, int); +extern int ext4_acl_chmod(struct inode *); +extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); + +#else /* CONFIG_EXT4_FS_POSIX_ACL */ +#include <linux/sched.h> +#define ext4_permission NULL + +static inline int +ext4_acl_chmod(struct inode *inode) +{ + return 0; +} + +static inline int +ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +{ + return 0; +} +#endif /* CONFIG_EXT4_FS_POSIX_ACL */ + diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c new file mode 100644 index 0000000..49ae5e4 --- /dev/null +++ b/fs/ext4/balloc.c @@ -0,0 +1,848 @@ +/* + * linux/fs/ext4/balloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include <linux/time.h> +#include <linux/capability.h> +#include <linux/fs.h> +#include <linux/jbd2.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> +#include "ext4.h" +#include "ext4_jbd2.h" +#include "group.h" +#include "mballoc.h" + +/* + * balloc.c contains the blocks allocation and deallocation routines + */ + +/* + * Calculate the block group number and offset, given a block number + */ +void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + ext4_grpblk_t offset; + + blocknr = blocknr - le32_to_cpu(es->s_first_data_block); + offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)); + if (offsetp) + *offsetp = offset; + if (blockgrpp) + *blockgrpp = blocknr; + +} + +static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, + ext4_group_t block_group) +{ + ext4_group_t actual_group; + ext4_get_group_no_and_offset(sb, block, &actual_group, NULL); + if (actual_group == block_group) + return 1; + return 0; +} + +static int ext4_group_used_meta_blocks(struct super_block *sb, + ext4_group_t block_group) +{ + ext4_fsblk_t tmp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + /* block bitmap, inode bitmap, and inode table blocks */ + int used_blocks = sbi->s_itb_per_group + 2; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { + struct ext4_group_desc *gdp; + struct buffer_head *bh; + + gdp = ext4_get_group_desc(sb, block_group, &bh); + if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), + block_group)) + used_blocks--; + + if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), + block_group)) + used_blocks--; + + tmp = ext4_inode_table(sb, gdp); + for (; tmp < ext4_inode_table(sb, gdp) + + sbi->s_itb_per_group; tmp++) { + if (!ext4_block_in_group(sb, tmp, block_group)) + used_blocks -= 1; + } + } + return used_blocks; +} + +/* Initializes an uninitialized block bitmap if given, and returns the + * number of blocks free in the group. */ +unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, + ext4_group_t block_group, struct ext4_group_desc *gdp) +{ + int bit, bit_max; + unsigned free_blocks, group_blocks; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (bh) { + J_ASSERT_BH(bh, buffer_locked(bh)); + + /* If checksum is bad mark all blocks used to prevent allocation + * essentially implementing a per-group read-only flag. */ + if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { + ext4_error(sb, __func__, + "Checksum bad for group %lu\n", block_group); + gdp->bg_free_blocks_count = 0; + gdp->bg_free_inodes_count = 0; + gdp->bg_itable_unused = 0; + memset(bh->b_data, 0xff, sb->s_blocksize); + return 0; + } + memset(bh->b_data, 0, sb->s_blocksize); + } + + /* Check for superblock and gdt backups in this group */ + bit_max = ext4_bg_has_super(sb, block_group); + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || + block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * + sbi->s_desc_per_block) { + if (bit_max) { + bit_max += ext4_bg_num_gdb(sb, block_group); + bit_max += + le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + } + } else { /* For META_BG_BLOCK_GROUPS */ + bit_max += ext4_bg_num_gdb(sb, block_group); + } + + if (block_group == sbi->s_groups_count - 1) { + /* + * Even though mke2fs always initialize first and last group + * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need + * to make sure we calculate the right free blocks + */ + group_blocks = ext4_blocks_count(sbi->s_es) - + le32_to_cpu(sbi->s_es->s_first_data_block) - + (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1)); + } else { + group_blocks = EXT4_BLOCKS_PER_GROUP(sb); + } + + free_blocks = group_blocks - bit_max; + + if (bh) { + ext4_fsblk_t start, tmp; + int flex_bg = 0; + + for (bit = 0; bit < bit_max; bit++) + ext4_set_bit(bit, bh->b_data); + + start = ext4_group_first_block_no(sb, block_group); + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_FLEX_BG)) + flex_bg = 1; + + /* Set bits for block and inode bitmaps, and inode table */ + tmp = ext4_block_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(tmp - start, bh->b_data); + + tmp = ext4_inode_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(tmp - start, bh->b_data); + + tmp = ext4_inode_table(sb, gdp); + for (; tmp < ext4_inode_table(sb, gdp) + + sbi->s_itb_per_group; tmp++) { + if (!flex_bg || + ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(tmp - start, bh->b_data); + } + /* + * Also if the number of blocks within the group is + * less than the blocksize * 8 ( which is the size + * of bitmap ), set rest of the block bitmap to 1 + */ + mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data); + } + return free_blocks - ext4_group_used_meta_blocks(sb, block_group); +} + + +/* + * The free blocks are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. The descriptors are loaded in memory + * when a file system is mounted (see ext4_fill_super). + */ + + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +/** + * ext4_get_group_desc() -- load group descriptor from disk + * @sb: super block + * @block_group: given block group + * @bh: pointer to the buffer head to store the block + * group descriptor + */ +struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head **bh) +{ + unsigned long group_desc; + unsigned long offset; + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (block_group >= sbi->s_groups_count) { + ext4_error(sb, "ext4_get_group_desc", + "block_group >= groups_count - " + "block_group = %lu, groups_count = %lu", + block_group, sbi->s_groups_count); + + return NULL; + } + smp_rmb(); + + group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); + offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); + if (!sbi->s_group_desc[group_desc]) { + ext4_error(sb, "ext4_get_group_desc", + "Group descriptor not loaded - " + "block_group = %lu, group_desc = %lu, desc = %lu", + block_group, group_desc, offset); + return NULL; + } + + desc = (struct ext4_group_desc *)( + (__u8 *)sbi->s_group_desc[group_desc]->b_data + + offset * EXT4_DESC_SIZE(sb)); + if (bh) + *bh = sbi->s_group_desc[group_desc]; + return desc; +} + +static int ext4_valid_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + unsigned int block_group, + struct buffer_head *bh) +{ + ext4_grpblk_t offset; + ext4_grpblk_t next_zero_bit; + ext4_fsblk_t bitmap_blk; + ext4_fsblk_t group_first_block; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { + /* with FLEX_BG, the inode/block bitmaps and itable + * blocks may not be in the group at all + * so the bitmap validation will be skipped for those groups + * or it has to also read the block group where the bitmaps + * are located to verify they are set. + */ + return 1; + } + group_first_block = ext4_group_first_block_no(sb, block_group); + + /* check whether block bitmap block number is set */ + bitmap_blk = ext4_block_bitmap(sb, desc); + offset = bitmap_blk - group_first_block; + if (!ext4_test_bit(offset, bh->b_data)) + /* bad block bitmap */ + goto err_out; + + /* check whether the inode bitmap block number is set */ + bitmap_blk = ext4_inode_bitmap(sb, desc); + offset = bitmap_blk - group_first_block; + if (!ext4_test_bit(offset, bh->b_data)) + /* bad block bitmap */ + goto err_out; + + /* check whether the inode table block number is set */ + bitmap_blk = ext4_inode_table(sb, desc); + offset = bitmap_blk - group_first_block; + next_zero_bit = ext4_find_next_zero_bit(bh->b_data, + offset + EXT4_SB(sb)->s_itb_per_group, + offset); + if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group) + /* good bitmap for inode tables */ + return 1; + +err_out: + ext4_error(sb, __func__, + "Invalid block bitmap - " + "block_group = %d, block = %llu", + block_group, bitmap_blk); + return 0; +} +/** + * ext4_read_block_bitmap() + * @sb: super block + * @block_group: given block group + * + * Read the bitmap for a given block_group,and validate the + * bits for block/inode/inode tables are set in the bitmaps + * + * Return buffer_head on success or NULL in case of failure. + */ +struct buffer_head * +ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) +{ + struct ext4_group_desc *desc; + struct buffer_head *bh = NULL; + ext4_fsblk_t bitmap_blk; + + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return NULL; + bitmap_blk = ext4_block_bitmap(sb, desc); + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext4_error(sb, __func__, + "Cannot read block bitmap - " + "block_group = %lu, block_bitmap = %llu", + block_group, bitmap_blk); + return NULL; + } + + if (bitmap_uptodate(bh)) + return bh; + + lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + return bh; + } + spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + ext4_init_block_bitmap(sb, bh, block_group, desc); + set_bitmap_uptodate(bh); + set_buffer_uptodate(bh); + unlock_buffer(bh); + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + return bh; + } + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + if (buffer_uptodate(bh)) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); + return bh; + } + /* + * submit the buffer_head for read. We can + * safely mark the bitmap as uptodate now. + * We do it here so the bitmap uptodate bit + * get set with buffer lock held. + */ + set_bitmap_uptodate(bh); + if (bh_submit_read(bh) < 0) { + put_bh(bh); + ext4_error(sb, __func__, + "Cannot read block bitmap - " + "block_group = %lu, block_bitmap = %llu", + block_group, bitmap_blk); + return NULL; + } + ext4_valid_block_bitmap(sb, desc, block_group, bh); + /* + * file system mounted not to panic on error, + * continue with corrupt bitmap + */ + return bh; +} + +/** + * ext4_add_groupblocks() -- Add given blocks to an existing group + * @handle: handle to this transaction + * @sb: super block + * @block: start physcial block to add to the block group + * @count: number of blocks to free + * + * This marks the blocks as free in the bitmap. We ask the + * mballoc to reload the buddy after this by setting group + * EXT4_GROUP_INFO_NEED_INIT_BIT flag + */ +void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gd_bh; + ext4_group_t block_group; + ext4_grpblk_t bit; + unsigned long i; + struct ext4_group_desc *desc; + struct ext4_super_block *es; + struct ext4_sb_info *sbi; + int err = 0, ret; + ext4_grpblk_t blocks_freed; + struct ext4_group_info *grp; + + sbi = EXT4_SB(sb); + es = sbi->s_es; + ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); + + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + grp = ext4_get_group_info(sb, block_group); + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { + goto error_return; + } + bitmap_bh = ext4_read_block_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + desc = ext4_get_group_desc(sb, block_group, &gd_bh); + if (!desc) + goto error_return; + + if (in_range(ext4_block_bitmap(sb, desc), block, count) || + in_range(ext4_inode_bitmap(sb, desc), block, count) || + in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || + in_range(block + count - 1, ext4_inode_table(sb, desc), + sbi->s_itb_per_group)) { + ext4_error(sb, __func__, + "Adding blocks in system zones - " + "Block = %llu, count = %lu", + block, count); + goto error_return; + } + + /* + * We are about to add blocks to the bitmap, + * so we need undo access. + */ + BUFFER_TRACE(bitmap_bh, "getting undo access"); + err = ext4_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + /* + * make sure we don't allow a parallel init on other groups in the + * same buddy cache + */ + down_write(&grp->alloc_sem); + for (i = 0, blocks_freed = 0; i < count; i++) { + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), + bit + i, bitmap_bh->b_data)) { + ext4_error(sb, __func__, + "bit already cleared for block %llu", + (ext4_fsblk_t)(block + i)); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + blocks_freed++; + } + } + spin_lock(sb_bgl_lock(sbi, block_group)); + le16_add_cpu(&desc->bg_free_blocks_count, blocks_freed); + desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks += blocks_freed; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + /* + * request to reload the buddy with the + * new bitmap information + */ + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); + ext4_mb_update_group_info(grp, blocks_freed); + up_write(&grp->alloc_sem); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_journal_dirty_metadata(handle, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext4_journal_dirty_metadata(handle, gd_bh); + if (!err) + err = ret; + sb->s_dirt = 1; + +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, err); + return; +} + +/** + * ext4_free_blocks() -- Free given blocks and update quota + * @handle: handle for this transaction + * @inode: inode + * @block: start physical block to free + * @count: number of blocks to count + * @metadata: Are these metadata blocks + */ +void ext4_free_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t block, unsigned long count, + int metadata) +{ + struct super_block *sb; + unsigned long dquot_freed_blocks; + + /* this isn't the right place to decide whether block is metadata + * inode.c/extents.c knows better, but for safety ... */ + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + metadata = 1; + + /* We need to make sure we don't reuse + * block released untill the transaction commit. + * writeback mode have weak data consistency so + * don't force data as metadata when freeing block + * for writeback mode. + */ + if (metadata == 0 && !ext4_should_writeback_data(inode)) + metadata = 1; + + sb = inode->i_sb; + + ext4_mb_free_blocks(handle, inode, block, count, + metadata, &dquot_freed_blocks); + if (dquot_freed_blocks) + DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); + return; +} + +/** + * ext4_has_free_blocks() + * @sbi: in-core super block structure. + * @nblocks: number of needed blocks + * + * Check if filesystem has nblocks free & available for allocation. + * On success return 1, return 0 on failure. + */ +int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) +{ + s64 free_blocks, dirty_blocks, root_blocks; + struct percpu_counter *fbc = &sbi->s_freeblocks_counter; + struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter; + + free_blocks = percpu_counter_read_positive(fbc); + dirty_blocks = percpu_counter_read_positive(dbc); + root_blocks = ext4_r_blocks_count(sbi->s_es); + + if (free_blocks - (nblocks + root_blocks + dirty_blocks) < + EXT4_FREEBLOCKS_WATERMARK) { + free_blocks = percpu_counter_sum_positive(fbc); + dirty_blocks = percpu_counter_sum_positive(dbc); + if (dirty_blocks < 0) { + printk(KERN_CRIT "Dirty block accounting " + "went wrong %lld\n", + dirty_blocks); + } + } + /* Check whether we have space after + * accounting for current dirty blocks & root reserved blocks. + */ + if (free_blocks >= ((root_blocks + nblocks) + dirty_blocks)) + return 1; + + /* Hm, nope. Are (enough) root reserved blocks available? */ + if (sbi->s_resuid == current->fsuid || + ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || + capable(CAP_SYS_RESOURCE)) { + if (free_blocks >= (nblocks + dirty_blocks)) + return 1; + } + + return 0; +} + +int ext4_claim_free_blocks(struct ext4_sb_info *sbi, + s64 nblocks) +{ + if (ext4_has_free_blocks(sbi, nblocks)) { + percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks); + return 0; + } else + return -ENOSPC; +} + +/** + * ext4_should_retry_alloc() + * @sb: super block + * @retries number of attemps has been made + * + * ext4_should_retry_alloc() is called when ENOSPC is returned, and if + * it is profitable to retry the operation, this function will wait + * for the current or commiting transaction to complete, and then + * return TRUE. + * + * if the total number of retries exceed three times, return FALSE. + */ +int ext4_should_retry_alloc(struct super_block *sb, int *retries) +{ + if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3) + return 0; + + jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); + + return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); +} + +#define EXT4_META_BLOCK 0x1 + +static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, ext4_fsblk_t goal, + unsigned long *count, int *errp, int flags) +{ + struct ext4_allocation_request ar; + ext4_fsblk_t ret; + + memset(&ar, 0, sizeof(ar)); + /* Fill with neighbour allocated blocks */ + + ar.inode = inode; + ar.goal = goal; + ar.len = *count; + ar.logical = iblock; + + if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK)) + /* enable in-core preallocation for data block allocation */ + ar.flags = EXT4_MB_HINT_DATA; + else + /* disable in-core preallocation for non-regular files */ + ar.flags = 0; + + ret = ext4_mb_new_blocks(handle, &ar, errp); + *count = ar.len; + return ret; +} + +/* + * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks + * + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @count: total number of blocks need + * @errp: error code + * + * Return 1st allocated block numberon success, *count stores total account + * error stores in errp pointer + */ +ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, unsigned long *count, int *errp) +{ + ext4_fsblk_t ret; + ret = do_blk_alloc(handle, inode, 0, goal, + count, errp, EXT4_META_BLOCK); + /* + * Account for the allocated meta blocks + */ + if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + EXT4_I(inode)->i_allocated_meta_blocks += *count; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + } + return ret; +} + +/* + * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks + * + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @errp: error code + * + * Return allocated block number on success + */ +ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, int *errp) +{ + unsigned long count = 1; + return ext4_new_meta_blocks(handle, inode, goal, &count, errp); +} + +/* + * ext4_new_blocks() -- allocate data blocks + * + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @count: total number of blocks need + * @errp: error code + * + * Return 1st allocated block numberon success, *count stores total account + * error stores in errp pointer + */ + +ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, ext4_fsblk_t goal, + unsigned long *count, int *errp) +{ + return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0); +} + +/** + * ext4_count_free_blocks() -- count filesystem free blocks + * @sb: superblock + * + * Adds up the number of free blocks from each block group. + */ +ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) +{ + ext4_fsblk_t desc_count; + struct ext4_group_desc *gdp; + ext4_group_t i; + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; +#ifdef EXT4FS_DEBUG + struct ext4_super_block *es; + ext4_fsblk_t bitmap_count; + unsigned long x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT4_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + + smp_rmb(); + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + brelse(bitmap_bh); + bitmap_bh = ext4_read_block_bitmap(sb, i); + if (bitmap_bh == NULL) + continue; + + x = ext4_count_free(bitmap_bh, sb->s_blocksize); + printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", + i, le16_to_cpu(gdp->bg_free_blocks_count), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu" + ", computed = %llu, %llu\n", ext4_free_blocks_count(es), + desc_count, bitmap_count); + return bitmap_count; +#else + desc_count = 0; + smp_rmb(); + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + } + + return desc_count; +#endif +} + +static inline int test_root(ext4_group_t a, int b) +{ + int num = b; + + while (a > num) + num *= b; + return num == a; +} + +static int ext4_group_sparse(ext4_group_t group) +{ + if (group <= 1) + return 1; + if (!(group & 1)) + return 0; + return (test_root(group, 7) || test_root(group, 5) || + test_root(group, 3)); +} + +/** + * ext4_bg_has_super - number of blocks used by the superblock in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the superblock (primary or backup) + * in this group. Currently this will be only 0 or 1. + */ +int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) +{ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) && + !ext4_group_sparse(group)) + return 0; + return 1; +} + +static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, + ext4_group_t group) +{ + unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); + ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb); + ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1; + + if (group == first || group == first + 1 || group == last) + return 1; + return 0; +} + +static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, + ext4_group_t group) +{ + return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0; +} + +/** + * ext4_bg_num_gdb - number of blocks used by the group table in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the group descriptor table + * (primary or backup) in this group. In the future there may be a + * different number of descriptor blocks in each group. + */ +unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) +{ + unsigned long first_meta_bg = + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); + unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) || + metagroup < first_meta_bg) + return ext4_bg_num_gdb_nometa(sb, group); + + return ext4_bg_num_gdb_meta(sb,group); + +} + diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c new file mode 100644 index 0000000..0a7a666 --- /dev/null +++ b/fs/ext4/bitmap.c @@ -0,0 +1,32 @@ +/* + * linux/fs/ext4/bitmap.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include <linux/buffer_head.h> +#include <linux/jbd2.h> +#include "ext4.h" + +#ifdef EXT4FS_DEBUG + +static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; + +unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars) +{ + unsigned int i; + unsigned long sum = 0; + + if (!map) + return 0; + for (i = 0; i < numchars; i++) + sum += nibblemap[map->b_data[i] & 0xf] + + nibblemap[(map->b_data[i] >> 4) & 0xf]; + return sum; +} + +#endif /* EXT4FS_DEBUG */ + diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c new file mode 100644 index 0000000..fed5b61 --- /dev/null +++ b/fs/ext4/dir.c @@ -0,0 +1,522 @@ +/* + * linux/fs/ext4/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Hash Tree Directory indexing (c) 2001 Daniel Phillips + * + */ + +#include <linux/fs.h> +#include <linux/jbd2.h> +#include <linux/buffer_head.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include "ext4.h" + +static unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int ext4_readdir(struct file *, void *, filldir_t); +static int ext4_dx_readdir(struct file *filp, + void *dirent, filldir_t filldir); +static int ext4_release_dir(struct inode *inode, + struct file *filp); + +const struct file_operations ext4_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = ext4_readdir, /* we take BKL. needed?*/ + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .fsync = ext4_sync_file, + .release = ext4_release_dir, +}; + + +static unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || + (filetype >= EXT4_FT_MAX)) + return DT_UNKNOWN; + + return (ext4_filetype_table[filetype]); +} + + +int ext4_check_dir_entry(const char *function, struct inode *dir, + struct ext4_dir_entry_2 *de, + struct buffer_head *bh, + unsigned long offset) +{ + const char *error_msg = NULL; + const int rlen = ext4_rec_len_from_disk(de->rec_len); + + if (rlen < EXT4_DIR_REC_LEN(1)) + error_msg = "rec_len is smaller than minimal"; + else if (rlen % 4 != 0) + error_msg = "rec_len % 4 != 0"; + else if (rlen < EXT4_DIR_REC_LEN(de->name_len)) + error_msg = "rec_len is too small for name_len"; + else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) + error_msg = "directory entry across blocks"; + else if (le32_to_cpu(de->inode) > + le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)) + error_msg = "inode out of bounds"; + + if (error_msg != NULL) + ext4_error(dir->i_sb, function, + "bad entry in directory #%lu: %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error_msg, offset, + (unsigned long) le32_to_cpu(de->inode), + rlen, de->name_len); + return error_msg == NULL ? 1 : 0; +} + +static int ext4_readdir(struct file *filp, + void *dirent, filldir_t filldir) +{ + int error = 0; + unsigned long offset; + int i, stored; + struct ext4_dir_entry_2 *de; + struct super_block *sb; + int err; + struct inode *inode = filp->f_path.dentry->d_inode; + int ret = 0; + int dir_has_error = 0; + + sb = inode->i_sb; + + if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX) && + ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) || + ((inode->i_size >> sb->s_blocksize_bits) == 1))) { + err = ext4_dx_readdir(filp, dirent, filldir); + if (err != ERR_BAD_DX_DIR) { + ret = err; + goto out; + } + /* + * We don't set the inode dirty flag since it's not + * critical that it get flushed back to the disk. + */ + EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL; + } + stored = 0; + offset = filp->f_pos & (sb->s_blocksize - 1); + + while (!error && !stored && filp->f_pos < inode->i_size) { + ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); + struct buffer_head map_bh; + struct buffer_head *bh = NULL; + + map_bh.b_state = 0; + err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, + 0, 0, 0); + if (err > 0) { + pgoff_t index = map_bh.b_blocknr >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + if (!ra_has_index(&filp->f_ra, index)) + page_cache_sync_readahead( + sb->s_bdev->bd_inode->i_mapping, + &filp->f_ra, filp, + index, 1); + filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + bh = ext4_bread(NULL, inode, blk, 0, &err); + } + + /* + * We ignore I/O errors on directories so users have a chance + * of recovering data when there's a bad sector + */ + if (!bh) { + if (!dir_has_error) { + ext4_error(sb, __func__, "directory #%lu " + "contains a hole at offset %Lu", + inode->i_ino, + (unsigned long long) filp->f_pos); + dir_has_error = 1; + } + /* corrupt size? Maybe no more blocks to read */ + if (filp->f_pos > inode->i_blocks << 9) + break; + filp->f_pos += sb->s_blocksize - offset; + continue; + } + +revalidate: + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (filp->f_version != inode->i_version) { + for (i = 0; i < sb->s_blocksize && i < offset; ) { + de = (struct ext4_dir_entry_2 *) + (bh->b_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (ext4_rec_len_from_disk(de->rec_len) + < EXT4_DIR_REC_LEN(1)) + break; + i += ext4_rec_len_from_disk(de->rec_len); + } + offset = i; + filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + | offset; + filp->f_version = inode->i_version; + } + + while (!error && filp->f_pos < inode->i_size + && offset < sb->s_blocksize) { + de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); + if (!ext4_check_dir_entry("ext4_readdir", inode, de, + bh, offset)) { + /* + * On error, skip the f_pos to the next block + */ + filp->f_pos = (filp->f_pos | + (sb->s_blocksize - 1)) + 1; + brelse(bh); + ret = stored; + goto out; + } + offset += ext4_rec_len_from_disk(de->rec_len); + if (le32_to_cpu(de->inode)) { + /* We might block in the next section + * if the data destination is + * currently swapped out. So, use a + * version stamp to detect whether or + * not the directory has been modified + * during the copy operation. + */ + u64 version = filp->f_version; + + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type)); + if (error) + break; + if (version != filp->f_version) + goto revalidate; + stored++; + } + filp->f_pos += ext4_rec_len_from_disk(de->rec_len); + } + offset = 0; + brelse(bh); + } +out: + return ret; +} + +/* + * These functions convert from the major/minor hash to an f_pos + * value. + * + * Currently we only use major hash numer. This is unfortunate, but + * on 32-bit machines, the same VFS interface is used for lseek and + * llseek, so if we use the 64 bit offset, then the 32-bit versions of + * lseek/telldir/seekdir will blow out spectacularly, and from within + * the ext2 low-level routine, we don't know if we're being called by + * a 64-bit version of the system call or the 32-bit version of the + * system call. Worse yet, NFSv2 only allows for a 32-bit readdir + * cookie. Sigh. + */ +#define hash2pos(major, minor) (major >> 1) +#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) +#define pos2min_hash(pos) (0) + +/* + * This structure holds the nodes of the red-black tree used to store + * the directory entry in hash order. + */ +struct fname { + __u32 hash; + __u32 minor_hash; + struct rb_node rb_hash; + struct fname *next; + __u32 inode; + __u8 name_len; + __u8 file_type; + char name[0]; +}; + +/* + * This functoin implements a non-recursive way of freeing all of the + * nodes in the red-black tree. + */ +static void free_rb_tree_fname(struct rb_root *root) +{ + struct rb_node *n = root->rb_node; + struct rb_node *parent; + struct fname *fname; + + while (n) { + /* Do the node's children first */ + if (n->rb_left) { + n = n->rb_left; + continue; + } + if (n->rb_right) { + n = n->rb_right; + continue; + } + /* + * The node has no children; free it, and then zero + * out parent's link to it. Finally go to the + * beginning of the loop and try to free the parent + * node. + */ + parent = rb_parent(n); + fname = rb_entry(n, struct fname, rb_hash); + while (fname) { + struct fname *old = fname; + fname = fname->next; + kfree(old); + } + if (!parent) + root->rb_node = NULL; + else if (parent->rb_left == n) + parent->rb_left = NULL; + else if (parent->rb_right == n) + parent->rb_right = NULL; + n = parent; + } +} + + +static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos) +{ + struct dir_private_info *p; + + p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); + if (!p) + return NULL; + p->curr_hash = pos2maj_hash(pos); + p->curr_minor_hash = pos2min_hash(pos); + return p; +} + +void ext4_htree_free_dir_info(struct dir_private_info *p) +{ + free_rb_tree_fname(&p->root); + kfree(p); +} + +/* + * Given a directory entry, enter it into the fname rb tree. + */ +int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent) +{ + struct rb_node **p, *parent = NULL; + struct fname *fname, *new_fn; + struct dir_private_info *info; + int len; + + info = (struct dir_private_info *) dir_file->private_data; + p = &info->root.rb_node; + + /* Create and allocate the fname structure */ + len = sizeof(struct fname) + dirent->name_len + 1; + new_fn = kzalloc(len, GFP_KERNEL); + if (!new_fn) + return -ENOMEM; + new_fn->hash = hash; + new_fn->minor_hash = minor_hash; + new_fn->inode = le32_to_cpu(dirent->inode); + new_fn->name_len = dirent->name_len; + new_fn->file_type = dirent->file_type; + memcpy(new_fn->name, dirent->name, dirent->name_len); + new_fn->name[dirent->name_len] = 0; + + while (*p) { + parent = *p; + fname = rb_entry(parent, struct fname, rb_hash); + + /* + * If the hash and minor hash match up, then we put + * them on a linked list. This rarely happens... + */ + if ((new_fn->hash == fname->hash) && + (new_fn->minor_hash == fname->minor_hash)) { + new_fn->next = fname->next; + fname->next = new_fn; + return 0; + } + + if (new_fn->hash < fname->hash) + p = &(*p)->rb_left; + else if (new_fn->hash > fname->hash) + p = &(*p)->rb_right; + else if (new_fn->minor_hash < fname->minor_hash) + p = &(*p)->rb_left; + else /* if (new_fn->minor_hash > fname->minor_hash) */ + p = &(*p)->rb_right; + } + + rb_link_node(&new_fn->rb_hash, parent, p); + rb_insert_color(&new_fn->rb_hash, &info->root); + return 0; +} + + + +/* + * This is a helper function for ext4_dx_readdir. It calls filldir + * for all entres on the fname linked list. (Normally there is only + * one entry on the linked list, unless there are 62 bit hash collisions.) + */ +static int call_filldir(struct file *filp, void *dirent, + filldir_t filldir, struct fname *fname) +{ + struct dir_private_info *info = filp->private_data; + loff_t curr_pos; + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb; + int error; + + sb = inode->i_sb; + + if (!fname) { + printk(KERN_ERR "ext4: call_filldir: called with " + "null fname?!?\n"); + return 0; + } + curr_pos = hash2pos(fname->hash, fname->minor_hash); + while (fname) { + error = filldir(dirent, fname->name, + fname->name_len, curr_pos, + fname->inode, + get_dtype(sb, fname->file_type)); + if (error) { + filp->f_pos = curr_pos; + info->extra_fname = fname; + return error; + } + fname = fname->next; + } + return 0; +} + +static int ext4_dx_readdir(struct file *filp, + void *dirent, filldir_t filldir) +{ + struct dir_private_info *info = filp->private_data; + struct inode *inode = filp->f_path.dentry->d_inode; + struct fname *fname; + int ret; + + if (!info) { + info = ext4_htree_create_dir_info(filp->f_pos); + if (!info) + return -ENOMEM; + filp->private_data = info; + } + + if (filp->f_pos == EXT4_HTREE_EOF) + return 0; /* EOF */ + + /* Some one has messed with f_pos; reset the world */ + if (info->last_pos != filp->f_pos) { + free_rb_tree_fname(&info->root); + info->curr_node = NULL; + info->extra_fname = NULL; + info->curr_hash = pos2maj_hash(filp->f_pos); + info->curr_minor_hash = pos2min_hash(filp->f_pos); + } + + /* + * If there are any leftover names on the hash collision + * chain, return them first. + */ + if (info->extra_fname) { + if (call_filldir(filp, dirent, filldir, info->extra_fname)) + goto finished; + info->extra_fname = NULL; + goto next_node; + } else if (!info->curr_node) + info->curr_node = rb_first(&info->root); + + while (1) { + /* + * Fill the rbtree if we have no more entries, + * or the inode has changed since we last read in the + * cached entries. + */ + if ((!info->curr_node) || + (filp->f_version != inode->i_version)) { + info->curr_node = NULL; + free_rb_tree_fname(&info->root); + filp->f_version = inode->i_version; + ret = ext4_htree_fill_tree(filp, info->curr_hash, + info->curr_minor_hash, + &info->next_hash); + if (ret < 0) + return ret; + if (ret == 0) { + filp->f_pos = EXT4_HTREE_EOF; + break; + } + info->curr_node = rb_first(&info->root); + } + + fname = rb_entry(info->curr_node, struct fname, rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + if (call_filldir(filp, dirent, filldir, fname)) + break; + next_node: + info->curr_node = rb_next(info->curr_node); + if (info->curr_node) { + fname = rb_entry(info->curr_node, struct fname, + rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + } else { + if (info->next_hash == ~0) { + filp->f_pos = EXT4_HTREE_EOF; + break; + } + info->curr_hash = info->next_hash; + info->curr_minor_hash = 0; + } + } +finished: + info->last_pos = filp->f_pos; + return 0; +} + +static int ext4_release_dir(struct inode *inode, struct file *filp) +{ + if (filp->private_data) + ext4_htree_free_dir_info(filp->private_data); + + return 0; +} diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h new file mode 100644 index 0000000..0b0c0fa --- /dev/null +++ b/fs/ext4/ext4.h @@ -0,0 +1,1313 @@ +/* + * ext4.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_H +#define _EXT4_H + +#include <linux/types.h> +#include <linux/blkdev.h> +#include <linux/magic.h> +#include <linux/jbd2.h> +#include "ext4_i.h" + +/* + * The fourth extended filesystem constants/structures + */ + +/* + * Define EXT4FS_DEBUG to produce debug messages + */ +#undef EXT4FS_DEBUG + +/* + * Define EXT4_RESERVATION to reserve data blocks for expanding files + */ +#define EXT4_DEFAULT_RESERVE_BLOCKS 8 +/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */ +#define EXT4_MAX_RESERVE_BLOCKS 1027 +#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0 + +/* + * Debug code + */ +#ifdef EXT4FS_DEBUG +#define ext4_debug(f, a...) \ + do { \ + printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk(KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext4_debug(f, a...) do {} while (0) +#endif + +#define EXT4_MULTIBLOCK_ALLOCATOR 1 + +/* prefer goal again. length */ +#define EXT4_MB_HINT_MERGE 1 +/* blocks already reserved */ +#define EXT4_MB_HINT_RESERVED 2 +/* metadata is being allocated */ +#define EXT4_MB_HINT_METADATA 4 +/* first blocks in the file */ +#define EXT4_MB_HINT_FIRST 8 +/* search for the best chunk */ +#define EXT4_MB_HINT_BEST 16 +/* data is being allocated */ +#define EXT4_MB_HINT_DATA 32 +/* don't preallocate (for tails) */ +#define EXT4_MB_HINT_NOPREALLOC 64 +/* allocate for locality group */ +#define EXT4_MB_HINT_GROUP_ALLOC 128 +/* allocate goal blocks or none */ +#define EXT4_MB_HINT_GOAL_ONLY 256 +/* goal is meaningful */ +#define EXT4_MB_HINT_TRY_GOAL 512 +/* blocks already pre-reserved by delayed allocation */ +#define EXT4_MB_DELALLOC_RESERVED 1024 + + +struct ext4_allocation_request { + /* target inode for block we're allocating */ + struct inode *inode; + /* logical block in target inode */ + ext4_lblk_t logical; + /* phys. target (a hint) */ + ext4_fsblk_t goal; + /* the closest logical allocated block to the left */ + ext4_lblk_t lleft; + /* phys. block for ^^^ */ + ext4_fsblk_t pleft; + /* the closest logical allocated block to the right */ + ext4_lblk_t lright; + /* phys. block for ^^^ */ + ext4_fsblk_t pright; + /* how many blocks we want to allocate */ + unsigned long len; + /* flags. see above EXT4_MB_HINT_* */ + unsigned long flags; +}; + +/* + * Special inodes numbers + */ +#define EXT4_BAD_INO 1 /* Bad blocks inode */ +#define EXT4_ROOT_INO 2 /* Root inode */ +#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT4_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext4 filesystems */ +#define EXT4_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT4_LINK_MAX 65000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT4_MIN_BLOCK_SIZE 1024 +#define EXT4_MAX_BLOCK_SIZE 65536 +#define EXT4_MIN_BLOCK_LOG_SIZE 10 +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) +#else +# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) +#endif +#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +#else +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +#endif +#ifdef __KERNEL__ +#define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) +#define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) +#define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) +#else +#define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ + EXT4_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +#define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ + EXT4_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) +#endif +#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) + +/* + * Structure of a blocks group descriptor + */ +struct ext4_group_desc +{ + __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ + __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ + __le32 bg_inode_table_lo; /* Inodes table block */ + __le16 bg_free_blocks_count; /* Free blocks count */ + __le16 bg_free_inodes_count; /* Free inodes count */ + __le16 bg_used_dirs_count; /* Directories count */ + __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ + __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ + __le16 bg_itable_unused; /* Unused inodes count */ + __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ + __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ + __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ + __le32 bg_inode_table_hi; /* Inodes table block MSB */ + __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ + __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ + __le16 bg_used_dirs_count_hi; /* Directories count MSB */ + __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ + __u32 bg_reserved2[3]; +}; + +/* + * Structure of a flex block group info + */ + +struct flex_groups { + __u32 free_inodes; + __u32 free_blocks; +}; + +#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ +#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ +#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ + +#ifdef __KERNEL__ +#include "ext4_sb.h" +#endif +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT4_MIN_DESC_SIZE 32 +#define EXT4_MIN_DESC_SIZE_64BIT 64 +#define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE +#define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) +#ifdef __KERNEL__ +# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) +# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) +# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) +#else +# define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) +# define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) +#endif + +/* + * Constants relative to the data blocks + */ +#define EXT4_NDIR_BLOCKS 12 +#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS +#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) +#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) +#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT4_UNRM_FL 0x00000002 /* Undelete */ +#define EXT4_COMPR_FL 0x00000004 /* Compress file */ +#define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT4_DIRTY_FL 0x00000100 +#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ +#define EXT4_ECOMPR_FL 0x00000800 /* Compression error */ +/* End compression flags --- maybe not all used */ +#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */ +#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */ + +/* + * Inode dynamic state flags + */ +#define EXT4_STATE_JDATA 0x00000001 /* journaled data exists */ +#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */ +#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ +#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ + +/* Used to pass group descriptor data when online resize is done */ +struct ext4_new_group_input { + __u32 group; /* Group number for this data */ + __u64 block_bitmap; /* Absolute block number of block bitmap */ + __u64 inode_bitmap; /* Absolute block number of inode bitmap */ + __u64 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ +struct ext4_new_group_data { + __u32 group; + __u64 block_bitmap; + __u64 inode_bitmap; + __u64 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 unused; + __u32 free_blocks_count; +}; + +/* + * Following is used by preallocation code to tell get_blocks() that we + * want uninitialzed extents. + */ +#define EXT4_CREATE_UNINITIALIZED_EXT 2 + +/* + * ioctl commands + */ +#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS +#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS +#define EXT4_IOC_GETVERSION _IOR('f', 3, long) +#define EXT4_IOC_SETVERSION _IOW('f', 4, long) +#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION +#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION +#ifdef CONFIG_JBD2_DEBUG +#define EXT4_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) +#endif +#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) +#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) +#define EXT4_IOC_MIGRATE _IO('f', 9) + /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ + +/* + * ioctl commands in 32 bit emulation + */ +#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define EXT4_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT4_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#ifdef CONFIG_JBD2_DEBUG +#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) +#endif +#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION + + +/* + * Mount options + */ +struct ext4_mount_options { + unsigned long s_mount_opt; + uid_t s_resuid; + gid_t s_resgid; + unsigned long s_commit_interval; +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; +#endif +}; + +/* + * Structure of an inode on the disk + */ +struct ext4_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size_lo; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Inode Change time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks_lo; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __le32 l_i_version; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl_lo; /* File ACL */ + __le32 i_size_high; + __le32 i_obso_faddr; /* Obsoleted fragment address */ + union { + struct { + __le16 l_i_blocks_high; /* were l_i_reserved1 */ + __le16 l_i_file_acl_high; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __u32 l_i_reserved2; + } linux2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __le16 m_i_file_acl_high; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_pad1; + __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ + __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ + __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ + __le32 i_crtime; /* File Creation time */ + __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ + __le32 i_version_hi; /* high 32 bits for 64-bit version */ +}; + + +#define EXT4_EPOCH_BITS 2 +#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) +#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) + +/* + * Extended fields will fit into an inode if the filesystem was formatted + * with large inodes (-I 256 or larger) and there are not currently any EAs + * consuming all of the available space. For new inodes we always reserve + * enough space for the kernel's known extended fields, but for inodes + * created with an old kernel this might not have been the case. None of + * the extended inode fields is critical for correct filesystem operation. + * This macro checks if a certain field fits in the inode. Note that + * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize + */ +#define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ + ((offsetof(typeof(*ext4_inode), field) + \ + sizeof((ext4_inode)->field)) \ + <= (EXT4_GOOD_OLD_INODE_SIZE + \ + (einode)->i_extra_isize)) \ + +static inline __le32 ext4_encode_extra_time(struct timespec *time) +{ + return cpu_to_le32((sizeof(time->tv_sec) > 4 ? + time->tv_sec >> 32 : 0) | + ((time->tv_nsec << 2) & EXT4_NSEC_MASK)); +} + +static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) +{ + if (sizeof(time->tv_sec) > 4) + time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) + << 32; + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2; +} + +#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ +do { \ + (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(inode)->xtime); \ +} while (0) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(einode)->xtime); \ +} while (0) + +#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ +do { \ + (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ + ext4_decode_extra_time(&(inode)->xtime, \ + raw_inode->xtime ## _extra); \ +} while (0) + +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime.tv_sec = \ + (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + ext4_decode_extra_time(&(einode)->xtime, \ + raw_inode->xtime ## _extra); \ +} while (0) + +#define i_disk_version osd1.linux1.l_i_version + +#if defined(__KERNEL__) || defined(__linux__) +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_file_acl_high osd2.linux2.l_i_file_acl_high +#define i_blocks_high osd2.linux2.l_i_blocks_high +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_reserved2 osd2.linux2.l_i_reserved2 + +#elif defined(__GNU__) + +#define i_translator osd1.hurd1.h_i_translator +#define i_uid_high osd2.hurd2.h_i_uid_high +#define i_gid_high osd2.hurd2.h_i_gid_high +#define i_author osd2.hurd2.h_i_author + +#elif defined(__masix__) + +#define i_reserved1 osd1.masix1.m_i_reserved1 +#define i_file_acl_high osd2.masix2.m_i_file_acl_high +#define i_reserved2 osd2.masix2.m_i_reserved2 + +#endif /* defined(__KERNEL__) || defined(__linux__) */ + +/* + * File system states + */ +#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT4_ERROR_FS 0x0002 /* Errors detected */ +#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ + +/* + * Misc. filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags + */ +#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ +#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#define EXT4_MOUNT_ABORT 0x00200 /* Fatal error detected */ +#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT4_MOUNT_RESERVATION 0x10000 /* Preallocation */ +#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */ +#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ +#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */ +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ +#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ +#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ +#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ + +/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ +#ifndef _LINUX_EXT2_FS_H +#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt +#define set_opt(o, opt) o |= EXT4_MOUNT_##opt +#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ + EXT4_MOUNT_##opt) +#else +#define EXT2_MOUNT_NOLOAD EXT4_MOUNT_NOLOAD +#define EXT2_MOUNT_ABORT EXT4_MOUNT_ABORT +#define EXT2_MOUNT_DATA_FLAGS EXT4_MOUNT_DATA_FLAGS +#endif + +#define ext4_set_bit ext2_set_bit +#define ext4_set_bit_atomic ext2_set_bit_atomic +#define ext4_clear_bit ext2_clear_bit +#define ext4_clear_bit_atomic ext2_clear_bit_atomic +#define ext4_test_bit ext2_test_bit +#define ext4_find_first_zero_bit ext2_find_first_zero_bit +#define ext4_find_next_zero_bit ext2_find_next_zero_bit +#define ext4_find_next_bit ext2_find_next_bit + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT4_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT4_ERRORS_PANIC 3 /* Panic */ +#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE + +/* + * Structure of the super block + */ +struct ext4_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count_lo; /* Blocks count */ + __le32 s_r_blocks_count_lo; /* Reserved blocks count */ + __le32 s_free_blocks_count_lo; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_obso_log_frag_size; /* Obsoleted fragment size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_obso_frags_per_group; /* Obsoleted fragments per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT4_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[16]; /* volume name */ +/*88*/ char s_last_mounted[64]; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. + */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_reserved_char_pad; + __le16 s_desc_size; /* size of group descriptor */ +/*100*/ __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_reserved_char_pad2; + __le16 s_reserved_pad; + __u32 s_reserved[162]; /* Padding to the end of the block */ +}; + +#ifdef __KERNEL__ +static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext4_inode_info *EXT4_I(struct inode *inode) +{ + return container_of(inode, struct ext4_inode_info, vfs_inode); +} + +static inline struct timespec ext4_current_time(struct inode *inode) +{ + return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? + current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; +} + + +static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT4_ROOT_INO || + ino == EXT4_JOURNAL_INO || + ino == EXT4_RESIZE_INO || + (ino >= EXT4_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); +} +#else +/* Assume that user mode programs are passing in an ext4fs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test + * macros from user land. */ +#define EXT4_SB(sb) (sb) +#endif + +#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT4_OS_LINUX 0 +#define EXT4_OS_HURD 1 +#define EXT4_OS_MASIX 2 +#define EXT4_OS_FREEBSD 3 +#define EXT4_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV +#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV + +#define EXT4_GOOD_OLD_INODE_SIZE 128 + +/* + * Feature set definitions + */ + +#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ + (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) +#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ + (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) +#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ + (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) +#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) +#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) +#define EXT4_SET_INCOMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) +#define EXT4_CLEAR_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) +#define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) +#define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) + +#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 + +#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 +#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 +#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 +#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 + +#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ +#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 +#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 +#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 + +#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR +#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG) +#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ + EXT4_FEATURE_RO_COMPAT_HUGE_FILE) + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT4_DEF_RESUID 0 +#define EXT4_DEF_RESGID 0 + +#define EXT4_DEF_INODE_READAHEAD_BLKS 32 + +/* + * Default mount options + */ +#define EXT4_DEFM_DEBUG 0x0001 +#define EXT4_DEFM_BSDGROUPS 0x0002 +#define EXT4_DEFM_XATTR_USER 0x0004 +#define EXT4_DEFM_ACL 0x0008 +#define EXT4_DEFM_UID16 0x0010 +#define EXT4_DEFM_JMODE 0x0060 +#define EXT4_DEFM_JMODE_DATA 0x0020 +#define EXT4_DEFM_JMODE_ORDERED 0x0040 +#define EXT4_DEFM_JMODE_WBACK 0x0060 + +/* + * Structure of a directory entry + */ +#define EXT4_NAME_LEN 255 + +struct ext4_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT4 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext4_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * Ext4 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +#define EXT4_FT_UNKNOWN 0 +#define EXT4_FT_REG_FILE 1 +#define EXT4_FT_DIR 2 +#define EXT4_FT_CHRDEV 3 +#define EXT4_FT_BLKDEV 4 +#define EXT4_FT_FIFO 5 +#define EXT4_FT_SOCK 6 +#define EXT4_FT_SYMLINK 7 + +#define EXT4_FT_MAX 8 + +/* + * EXT4_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT4_DIR_PAD 4 +#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ + ~EXT4_DIR_ROUND) +#define EXT4_MAX_REC_LEN ((1<<16)-1) + +static inline unsigned ext4_rec_len_from_disk(__le16 dlen) +{ + unsigned len = le16_to_cpu(dlen); + + if (len == EXT4_MAX_REC_LEN || len == 0) + return 1 << 16; + return len; +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len) +{ + if (len == (1 << 16)) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else if (len > (1 << 16)) + BUG(); + return cpu_to_le16(len); +} + +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ + EXT4_FEATURE_COMPAT_DIR_INDEX) && \ + (EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) +#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) +#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 + +#ifdef __KERNEL__ + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + +#define EXT4_HTREE_EOF 0x7fffffff + +/* + * Control parameters used by ext4_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext4_iloc +{ + struct buffer_head *bh; + unsigned long offset; + ext4_group_t block_group; +}; + +static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) +{ + return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); +} + +/* + * This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. + */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; +}; + +/* calculate the first block number of the group */ +static inline ext4_fsblk_t +ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) +{ + return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR -75000 + +void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, + unsigned long *blockgrpp, ext4_grpblk_t *offsetp); + +extern struct proc_dir_entry *ext4_proc_root; + +#ifdef CONFIG_PROC_FS +extern const struct file_operations ext4_ui_proc_fops; + +#define EXT4_PROC_HANDLER(name, var) \ +do { \ + proc = proc_create_data(name, mode, sbi->s_proc, \ + &ext4_ui_proc_fops, &sbi->s_##var); \ + if (proc == NULL) { \ + printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \ + goto err_out; \ + } \ +} while (0) +#else +#define EXT4_PROC_HANDLER(name, var) +#endif + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in <linux/kernel.h> but none of the + * ext4 source programs needs to include it so they are duplicated here. + */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* balloc.c */ +extern unsigned int ext4_block_group(struct super_block *sb, + ext4_fsblk_t blocknr); +extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, + ext4_fsblk_t blocknr); +extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); +extern unsigned long ext4_bg_num_gdb(struct super_block *sb, + ext4_group_t group); +extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, int *errp); +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, unsigned long *count, int *errp); +extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, ext4_fsblk_t goal, + unsigned long *count, int *errp); +extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); +extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); +extern void ext4_free_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t block, unsigned long count, int metadata); +extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); +extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); +extern void ext4_check_blocks_bitmap(struct super_block *); +extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, + ext4_group_t block_group, + struct buffer_head ** bh); +extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); + +/* dir.c */ +extern int ext4_check_dir_entry(const char *, struct inode *, + struct ext4_dir_entry_2 *, + struct buffer_head *, unsigned long); +extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent); +extern void ext4_htree_free_dir_info(struct dir_private_info *p); + +/* fsync.c */ +extern int ext4_sync_file(struct file *, struct dentry *, int); + +/* hash.c */ +extern int ext4fs_dirhash(const char *name, int len, struct + dx_hash_info *hinfo); + +/* ialloc.c */ +extern struct inode * ext4_new_inode(handle_t *, struct inode *, int); +extern void ext4_free_inode(handle_t *, struct inode *); +extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); +extern unsigned long ext4_count_free_inodes(struct super_block *); +extern unsigned long ext4_count_dirs(struct super_block *); +extern void ext4_check_inodes_bitmap(struct super_block *); +extern unsigned long ext4_count_free(struct buffer_head *, unsigned); + +/* mballoc.c */ +extern long ext4_mb_stats; +extern long ext4_mb_max_to_scan; +extern int ext4_mb_init(struct super_block *, int); +extern int ext4_mb_release(struct super_block *); +extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, + struct ext4_allocation_request *, int *); +extern int ext4_mb_reserve_blocks(struct super_block *, int); +extern void ext4_discard_preallocations(struct inode *); +extern int __init init_ext4_mballoc(void); +extern void exit_ext4_mballoc(void); +extern void ext4_mb_free_blocks(handle_t *, struct inode *, + unsigned long, unsigned long, int, unsigned long *); +extern int ext4_mb_add_groupinfo(struct super_block *sb, + ext4_group_t i, struct ext4_group_desc *desc); +extern void ext4_mb_update_group_info(struct ext4_group_info *grp, + ext4_grpblk_t add); +extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); +extern void ext4_mb_put_buddy_cache_lock(struct super_block *, + ext4_group_t, int); +/* inode.c */ +int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr); +struct buffer_head *ext4_getblk(handle_t *, struct inode *, + ext4_lblk_t, int, int *); +struct buffer_head *ext4_bread(handle_t *, struct inode *, + ext4_lblk_t, int, int *); +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, unsigned long maxblocks, + struct buffer_head *bh_result, + int create, int extend_disksize); + +extern struct inode *ext4_iget(struct super_block *, unsigned long); +extern int ext4_write_inode(struct inode *, int); +extern int ext4_setattr(struct dentry *, struct iattr *); +extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +extern void ext4_delete_inode(struct inode *); +extern int ext4_sync_inode(handle_t *, struct inode *); +extern void ext4_dirty_inode(struct inode *); +extern int ext4_change_inode_journal_flag(struct inode *, int); +extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_can_truncate(struct inode *inode); +extern void ext4_truncate(struct inode *); +extern void ext4_set_inode_flags(struct inode *); +extern void ext4_get_inode_flags(struct ext4_inode_info *); +extern void ext4_set_aops(struct inode *inode); +extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from); +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); + +/* ioctl.c */ +extern long ext4_ioctl(struct file *, unsigned int, unsigned long); +extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* migrate.c */ +extern int ext4_ext_migrate(struct inode *); +/* namei.c */ +extern int ext4_orphan_add(handle_t *, struct inode *); +extern int ext4_orphan_del(handle_t *, struct inode *); +extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); + +/* resize.c */ +extern int ext4_group_add(struct super_block *sb, + struct ext4_new_group_data *input); +extern int ext4_group_extend(struct super_block *sb, + struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count); + +/* super.c */ +extern void ext4_error(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void __ext4_std_error(struct super_block *, const char *, int); +extern void ext4_abort(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void ext4_warning(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void ext4_update_dynamic_rev(struct super_block *sb); +extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, + __u32 compat); +extern int ext4_update_rocompat_feature(handle_t *handle, + struct super_block *sb, __u32 rocompat); +extern int ext4_update_incompat_feature(handle_t *handle, + struct super_block *sb, __u32 incompat); +extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg); +extern void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); + +static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | + le32_to_cpu(es->s_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | + le32_to_cpu(es->s_r_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) | + le32_to_cpu(es->s_free_blocks_count_lo); +} + +static inline void ext4_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline loff_t ext4_isize(struct ext4_inode *raw_inode) +{ + if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); + else + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); +} + +static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +{ + raw_inode->i_size_lo = cpu_to_le32(i_size); + raw_inode->i_size_high = cpu_to_le32(i_size >> 32); +} + +static inline +struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info ***grp_info; + long indexv, indexh; + grp_info = EXT4_SB(sb)->s_group_info; + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); + indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); + return grp_info[indexv][indexh]; +} + + +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, + ext4_group_t block_group) +{ + return block_group >> sbi->s_log_groups_per_flex; +} + +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) +{ + return 1 << sbi->s_log_groups_per_flex; +} + +#define ext4_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext4_std_error((sb), __func__, (errno)); \ +} while (0) + +#ifdef CONFIG_SMP +/* Each CPU can accumulate FBC_BATCH blocks in their local + * counters. So we need to make sure we have free blocks more + * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times. + */ +#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids)) +#else +#define EXT4_FREEBLOCKS_WATERMARK 0 +#endif + +static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) +{ + /* + * XXX: replace with spinlock if seen contended -bzzz + */ + down_write(&EXT4_I(inode)->i_data_sem); + if (newsize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = newsize; + up_write(&EXT4_I(inode)->i_data_sem); + return ; +} + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext4_dir_operations; + +/* file.c */ +extern const struct inode_operations ext4_file_inode_operations; +extern const struct file_operations ext4_file_operations; + +/* namei.c */ +extern const struct inode_operations ext4_dir_inode_operations; +extern const struct inode_operations ext4_special_inode_operations; + +/* symlink.c */ +extern const struct inode_operations ext4_symlink_inode_operations; +extern const struct inode_operations ext4_fast_symlink_inode_operations; + +/* extents.c */ +extern int ext4_ext_tree_init(handle_t *handle, struct inode *); +extern int ext4_ext_writepage_trans_blocks(struct inode *, int); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, + int chunk); +extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, + unsigned long max_blocks, struct buffer_head *bh_result, + int create, int extend_disksize); +extern void ext4_ext_truncate(struct inode *); +extern void ext4_ext_init(struct super_block *); +extern void ext4_ext_release(struct super_block *); +extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, + loff_t len); +extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, + sector_t block, unsigned long max_blocks, + struct buffer_head *bh, int create, + int extend_disksize, int flag); + +/* + * Add new method to test wether block and inode bitmaps are properly + * initialized. With uninit_bg reading the block from disk is not enough + * to mark the bitmap uptodate. We need to also zero-out the bitmap + */ +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart + +static inline int bitmap_uptodate(struct buffer_head *bh) +{ + return (buffer_uptodate(bh) && + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); +} +static inline void set_bitmap_uptodate(struct buffer_head *bh) +{ + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); +} + +#endif /* __KERNEL__ */ + +#endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h new file mode 100644 index 0000000..bec7ce5 --- /dev/null +++ b/fs/ext4/ext4_extents.h @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas <alex@clusterfs.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + */ + +#ifndef _EXT4_EXTENTS +#define _EXT4_EXTENTS + +#include "ext4.h" + +/* + * With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks + * becomes very small, so index split, in-depth growing and + * other hard changes happen much more often. + * This is for debug purposes only. + */ +#define AGGRESSIVE_TEST_ + +/* + * With EXTENTS_STATS defined, the number of blocks and extents + * are collected in the truncate path. They'll be shown at + * umount time. + */ +#define EXTENTS_STATS__ + +/* + * If CHECK_BINSEARCH is defined, then the results of the binary search + * will also be checked by linear search. + */ +#define CHECK_BINSEARCH__ + +/* + * If EXT_DEBUG is defined you can use the 'extdebug' mount option + * to get lots of info about what's going on. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(a...) printk(a) +#else +#define ext_debug(a...) +#endif + +/* + * If EXT_STATS is defined then stats numbers are collected. + * These number will be displayed at umount time. + */ +#define EXT_STATS_ + + +/* + * ext4_inode has i_block array (60 bytes total). + * The first 12 bytes store ext4_extent_header; + * the remainder stores an array of ext4_extent. + */ + +/* + * This is the extent on-disk structure. + * It's used at the bottom of the tree. + */ +struct ext4_extent { + __le32 ee_block; /* first logical block extent covers */ + __le16 ee_len; /* number of blocks covered by extent */ + __le16 ee_start_hi; /* high 16 bits of physical block */ + __le32 ee_start_lo; /* low 32 bits of physical block */ +}; + +/* + * This is index on-disk structure. + * It's used at all the levels except the bottom. + */ +struct ext4_extent_idx { + __le32 ei_block; /* index covers logical blocks from 'block' */ + __le32 ei_leaf_lo; /* pointer to the physical block of the next * + * level. leaf or next index could be there */ + __le16 ei_leaf_hi; /* high 16 bits of physical block */ + __u16 ei_unused; +}; + +/* + * Each block (leaves and indexes), even inode-stored has header. + */ +struct ext4_extent_header { + __le16 eh_magic; /* probably will support different formats */ + __le16 eh_entries; /* number of valid entries */ + __le16 eh_max; /* capacity of store in entries */ + __le16 eh_depth; /* has tree real underlying blocks? */ + __le32 eh_generation; /* generation of the tree */ +}; + +#define EXT4_EXT_MAGIC cpu_to_le16(0xf30a) + +/* + * Array of ext4_ext_path contains path to some extent. + * Creation/lookup routines use it for traversal/splitting/etc. + * Truncate uses it to simulate recursive walking. + */ +struct ext4_ext_path { + ext4_fsblk_t p_block; + __u16 p_depth; + struct ext4_extent *p_ext; + struct ext4_extent_idx *p_idx; + struct ext4_extent_header *p_hdr; + struct buffer_head *p_bh; +}; + +/* + * structure for external API + */ + +#define EXT4_EXT_CACHE_NO 0 +#define EXT4_EXT_CACHE_GAP 1 +#define EXT4_EXT_CACHE_EXTENT 2 + +/* + * to be called by ext4_ext_walk_space() + * negative retcode - error + * positive retcode - signal for ext4_ext_walk_space(), see below + * callback must return valid extent (passed or newly created) + */ +typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *, + struct ext4_ext_cache *, + struct ext4_extent *, void *); + +#define EXT_CONTINUE 0 +#define EXT_BREAK 1 +#define EXT_REPEAT 2 + +#define EXT_MAX_BLOCK 0xffffffff + +/* + * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an + * initialized extent. This is 2^15 and not (2^16 - 1), since we use the + * MSB of ee_len field in the extent datastructure to signify if this + * particular extent is an initialized extent or an uninitialized (i.e. + * preallocated). + * EXT_UNINIT_MAX_LEN is the maximum number of blocks we can have in an + * uninitialized extent. + * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an + * uninitialized one. In other words, if MSB of ee_len is set, it is an + * uninitialized extent with only one special scenario when ee_len = 0x8000. + * In this case we can not have an uninitialized extent of zero length and + * thus we make it as a special case of initialized extent with 0x8000 length. + * This way we get better extent-to-group alignment for initialized extents. + * Hence, the maximum number of blocks we can have in an *initialized* + * extent is 2^15 (32768) and in an *uninitialized* extent is 2^15-1 (32767). + */ +#define EXT_INIT_MAX_LEN (1UL << 15) +#define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1) + + +#define EXT_FIRST_EXTENT(__hdr__) \ + ((struct ext4_extent *) (((char *) (__hdr__)) + \ + sizeof(struct ext4_extent_header))) +#define EXT_FIRST_INDEX(__hdr__) \ + ((struct ext4_extent_idx *) (((char *) (__hdr__)) + \ + sizeof(struct ext4_extent_header))) +#define EXT_HAS_FREE_INDEX(__path__) \ + (le16_to_cpu((__path__)->p_hdr->eh_entries) \ + < le16_to_cpu((__path__)->p_hdr->eh_max)) +#define EXT_LAST_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_LAST_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_MAX_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) + +static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) +{ + return (struct ext4_extent_header *) EXT4_I(inode)->i_data; +} + +static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh) +{ + return (struct ext4_extent_header *) bh->b_data; +} + +static inline unsigned short ext_depth(struct inode *inode) +{ + return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); +} + +static inline void ext4_ext_tree_changed(struct inode *inode) +{ + EXT4_I(inode)->i_ext_generation++; +} + +static inline void +ext4_ext_invalidate_cache(struct inode *inode) +{ + EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO; +} + +static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) +{ + /* We can not have an uninitialized extent of zero length! */ + BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0); + ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN); +} + +static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext) +{ + /* Extent with ee_len of 0x8000 is treated as an initialized extent */ + return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN); +} + +static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) +{ + return (le16_to_cpu(ext->ee_len) <= EXT_INIT_MAX_LEN ? + le16_to_cpu(ext->ee_len) : + (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); +} + +extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); +extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); +extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); +extern int ext4_extent_tree_init(handle_t *, struct inode *); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_ext_try_to_merge(struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *); +extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); +extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, + ext_prepare_callback, void *); +extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path *); +extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, + ext4_lblk_t *, ext4_fsblk_t *); +extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *, + ext4_lblk_t *, ext4_fsblk_t *); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +#endif /* _EXT4_EXTENTS */ + diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h new file mode 100644 index 0000000..5c124c0 --- /dev/null +++ b/fs/ext4/ext4_i.h @@ -0,0 +1,140 @@ +/* + * ext4_i.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs_i.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_I +#define _EXT4_I + +#include <linux/rwsem.h> +#include <linux/rbtree.h> +#include <linux/seqlock.h> +#include <linux/mutex.h> + +/* data type for block offset of block group */ +typedef int ext4_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long long ext4_fsblk_t; + +/* data type for file logical block number */ +typedef __u32 ext4_lblk_t; + +/* data type for block group number */ +typedef unsigned long ext4_group_t; + +#define rsv_start rsv_window._rsv_start +#define rsv_end rsv_window._rsv_end + +/* + * storage for cached extent + */ +struct ext4_ext_cache { + ext4_fsblk_t ec_start; + ext4_lblk_t ec_block; + __u32 ec_len; /* must be 32bit to return holes */ + __u32 ec_type; +}; + +/* + * fourth extended file system inode data in memory + */ +struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_flags; + ext4_fsblk_t i_file_acl; + __u32 i_dtime; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is ued for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + ext4_group_t i_block_group; + __u32 i_state; /* Dynamic state flags for ext4 */ + + ext4_lblk_t i_dir_start_lookup; +#ifdef CONFIG_EXT4_FS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif +#ifdef CONFIG_EXT4_FS_POSIX_ACL + struct posix_acl *i_acl; + struct posix_acl *i_default_acl; +#endif + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext4_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext4_get_block (growth) and ext4_truncate (shrinkth). + */ + loff_t i_disksize; + + /* on-disk additional length */ + __u16 i_extra_isize; + + /* + * i_data_sem is for serialising ext4_truncate() against + * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext4 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; + struct inode vfs_inode; + struct jbd2_inode jinode; + + unsigned long i_ext_generation; + struct ext4_ext_cache i_cached_extent; + /* + * File creation time. Its function is same as that of + * struct timespec i_{a,c,m}time in the generic inode. + */ + struct timespec i_crtime; + + /* mballoc */ + struct list_head i_prealloc_list; + spinlock_t i_prealloc_lock; + + /* allocation reservation info for delalloc */ + unsigned long i_reserved_data_blocks; + unsigned long i_reserved_meta_blocks; + unsigned long i_allocated_meta_blocks; + unsigned short i_delalloc_reserved_flag; + spinlock_t i_block_reservation_lock; +}; + +#endif /* _EXT4_I */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c new file mode 100644 index 0000000..c75384b --- /dev/null +++ b/fs/ext4/ext4_jbd2.c @@ -0,0 +1,59 @@ +/* + * Interface between ext4 and JBD + */ + +#include "ext4_jbd2.h" + +int __ext4_journal_get_undo_access(const char *where, handle_t *handle, + struct buffer_head *bh) +{ + int err = jbd2_journal_get_undo_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, handle, err); + return err; +} + +int __ext4_journal_get_write_access(const char *where, handle_t *handle, + struct buffer_head *bh) +{ + int err = jbd2_journal_get_write_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, handle, err); + return err; +} + +int __ext4_journal_forget(const char *where, handle_t *handle, + struct buffer_head *bh) +{ + int err = jbd2_journal_forget(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, handle, err); + return err; +} + +int __ext4_journal_revoke(const char *where, handle_t *handle, + ext4_fsblk_t blocknr, struct buffer_head *bh) +{ + int err = jbd2_journal_revoke(handle, blocknr, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, handle, err); + return err; +} + +int __ext4_journal_get_create_access(const char *where, + handle_t *handle, struct buffer_head *bh) +{ + int err = jbd2_journal_get_create_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, handle, err); + return err; +} + +int __ext4_journal_dirty_metadata(const char *where, + handle_t *handle, struct buffer_head *bh) +{ + int err = jbd2_journal_dirty_metadata(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, handle, err); + return err; +} diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h new file mode 100644 index 0000000..b455c68 --- /dev/null +++ b/fs/ext4/ext4_jbd2.h @@ -0,0 +1,242 @@ +/* + * ext4_jbd2.h + * + * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 + * + * Copyright 1998--1999 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Ext4-specific journaling extensions. + */ + +#ifndef _EXT4_JBD2_H +#define _EXT4_JBD2_H + +#include <linux/fs.h> +#include <linux/jbd2.h> +#include "ext4.h" + +#define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal) + +/* Define the number of blocks we need to account to a transaction to + * modify one block of data. + * + * We may have to touch one inode, one bitmap buffer, up to three + * indirection blocks, the group and superblock summaries, and the data + * block to complete the transaction. + * + * For extents-enabled fs we may have to allocate and modify up to + * 5 levels of tree + root which are stored in the inode. */ + +#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ + (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ + || test_opt(sb, EXTENTS) ? 27U : 8U) + +/* Extended attribute operations touch at most two data buffers, + * two bitmap buffers, and two group summaries, in addition to the inode + * and the superblock, which are already accounted for. */ + +#define EXT4_XATTR_TRANS_BLOCKS 6U + +/* Define the minimum size for a transaction which modifies data. This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates. */ + +#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ + EXT4_XATTR_TRANS_BLOCKS - 2 + \ + 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + +/* + * Define the number of metadata blocks we need to account to modify data. + * + * This include super block, inode block, quota blocks and xattr blocks + */ +#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ + 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + +/* Delete operations potentially hit one directory's namespace plus an + * entire inode, plus arbitrary amounts of bitmap/indirection data. Be + * generous. We can grow the delete transaction later if necessary. */ + +#define EXT4_DELETE_TRANS_BLOCKS(sb) (2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64) + +/* Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction. For unbounded transactions such as + * write(2) and truncate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. */ + +#define EXT4_MAX_TRANS_DATA 64U + +/* We break up a large truncate or write transaction once the handle's + * buffer credits gets this low, we need either to extend the + * transaction or to start a new one. Reserve enough space here for + * inode, bitmap, superblock, group and indirection updates for at least + * one block, plus two quota updates. Quota allocations are not + * needed. */ + +#define EXT4_RESERVE_TRANS_BLOCKS 12U + +#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8 + +#ifdef CONFIG_QUOTA +/* Amount of blocks needed for quota update - we know that the structure was + * allocated so we need to update only inode+data */ +#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0) +/* Amount of blocks needed for quota insert/delete - we do some block writes + * but inode, sb and group updates are done only once */ +#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ + (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) +#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ + (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) +#else +#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0 +#define EXT4_QUOTA_INIT_BLOCKS(sb) 0 +#define EXT4_QUOTA_DEL_BLOCKS(sb) 0 +#endif + +int +ext4_mark_iloc_dirty(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc); + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc); + +int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); + +/* + * Wrapper functions with which ext4 calls into JBD. The intent here is + * to allow these to be turned into appropriate stubs so ext4 can control + * ext2 filesystems, so ext2+ext4 systems only nee one fs. This work hasn't + * been done yet. + */ + +static inline void ext4_journal_release_buffer(handle_t *handle, + struct buffer_head *bh) +{ + jbd2_journal_release_buffer(handle, bh); +} + +void ext4_journal_abort_handle(const char *caller, const char *err_fn, + struct buffer_head *bh, handle_t *handle, int err); + +int __ext4_journal_get_undo_access(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext4_journal_get_write_access(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext4_journal_forget(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext4_journal_revoke(const char *where, handle_t *handle, + ext4_fsblk_t blocknr, struct buffer_head *bh); + +int __ext4_journal_get_create_access(const char *where, + handle_t *handle, struct buffer_head *bh); + +int __ext4_journal_dirty_metadata(const char *where, + handle_t *handle, struct buffer_head *bh); + +#define ext4_journal_get_undo_access(handle, bh) \ + __ext4_journal_get_undo_access(__func__, (handle), (bh)) +#define ext4_journal_get_write_access(handle, bh) \ + __ext4_journal_get_write_access(__func__, (handle), (bh)) +#define ext4_journal_revoke(handle, blocknr, bh) \ + __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) +#define ext4_journal_get_create_access(handle, bh) \ + __ext4_journal_get_create_access(__func__, (handle), (bh)) +#define ext4_journal_dirty_metadata(handle, bh) \ + __ext4_journal_dirty_metadata(__func__, (handle), (bh)) +#define ext4_journal_forget(handle, bh) \ + __ext4_journal_forget(__func__, (handle), (bh)) + +handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); +int __ext4_journal_stop(const char *where, handle_t *handle); + +static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) +{ + return ext4_journal_start_sb(inode->i_sb, nblocks); +} + +#define ext4_journal_stop(handle) \ + __ext4_journal_stop(__func__, (handle)) + +static inline handle_t *ext4_journal_current_handle(void) +{ + return journal_current_handle(); +} + +static inline int ext4_journal_extend(handle_t *handle, int nblocks) +{ + return jbd2_journal_extend(handle, nblocks); +} + +static inline int ext4_journal_restart(handle_t *handle, int nblocks) +{ + return jbd2_journal_restart(handle, nblocks); +} + +static inline int ext4_journal_blocks_per_page(struct inode *inode) +{ + return jbd2_journal_blocks_per_page(inode); +} + +static inline int ext4_journal_force_commit(journal_t *journal) +{ + return jbd2_journal_force_commit(journal); +} + +static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) +{ + return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); +} + +/* super.c */ +int ext4_force_commit(struct super_block *sb); + +static inline int ext4_should_journal_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 1; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return 1; + if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) + return 1; + return 0; +} + +static inline int ext4_should_order_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) + return 0; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return 1; + return 0; +} + +static inline int ext4_should_writeback_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) + return 0; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return 1; + return 0; +} + +#endif /* _EXT4_JBD2_H */ diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h new file mode 100644 index 0000000..f00f112 --- /dev/null +++ b/fs/ext4/ext4_sb.h @@ -0,0 +1,151 @@ +/* + * ext4_sb.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs_sb.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_SB +#define _EXT4_SB + +#ifdef __KERNEL__ +#include <linux/timer.h> +#include <linux/wait.h> +#include <linux/blockgroup_lock.h> +#include <linux/percpu_counter.h> +#endif +#include <linux/rbtree.h> + +/* + * fourth extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + unsigned long s_overhead_last; /* Last calculated overhead */ + unsigned long s_blocks_last; /* Last seen block count */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + struct buffer_head **s_group_desc; + unsigned long s_mount_opt; + ext4_fsblk_t s_sb_block; + uid_t s_resuid; + gid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + unsigned int s_inode_readahead_blks; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyblocks_counter; + struct blockgroup_lock s_blockgroup_lock; + struct proc_dir_entry *s_proc; + + /* root of the per fs reservation window tree */ + spinlock_t s_rsv_window_lock; + struct rb_root s_rsv_window_root; + + /* Journaling */ + struct inode *s_journal_inode; + struct journal_s *s_journal; + struct list_head s_orphan; + unsigned long s_commit_interval; + struct block_device *journal_bdev; +#ifdef CONFIG_JBD2_DEBUG + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ +#endif +#ifdef CONFIG_QUOTA + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + +#ifdef EXTENTS_STATS + /* ext4 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + + /* for buddy allocator */ + struct ext4_group_info ***s_group_info; + struct inode *s_buddy_cache; + long s_blocks_reserved; + spinlock_t s_reserve_lock; + spinlock_t s_md_lock; + tid_t s_last_transaction; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; + + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; + + /* history to debug policy */ + struct ext4_mb_history *s_mb_history; + int s_mb_history_cur; + int s_mb_history_max; + int s_mb_history_num; + spinlock_t s_mb_history_lock; + int s_mb_history_filter; + + /* stats for buddy allocator */ + spinlock_t s_mb_pa_lock; + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + + /* locality groups */ + struct ext4_locality_group *s_locality_groups; + + unsigned int s_log_groups_per_flex; + struct flex_groups *s_flex_groups; +}; + +#endif /* _EXT4_SB */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c new file mode 100644 index 0000000..ea2ce3c --- /dev/null +++ b/fs/ext4/extents.c @@ -0,0 +1,3221 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas <alex@clusterfs.com> + * + * Architecture independence: + * Copyright (c) 2005, Bull S.A. + * Written by Pierre Peiffer <pierre.peiffer@bull.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + */ + +/* + * Extents support for EXT4 + * + * TODO: + * - ext4*_error() should be used in some situations + * - analyze all BUG()/BUG_ON(), use -EIO where appropriate + * - smart tree reduction + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/jbd2.h> +#include <linux/highuid.h> +#include <linux/pagemap.h> +#include <linux/quotaops.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/falloc.h> +#include <asm/uaccess.h> +#include <linux/fiemap.h> +#include "ext4_jbd2.h" +#include "ext4_extents.h" + + +/* + * ext_pblock: + * combine low and high parts of physical block number into ext4_fsblk_t + */ +static ext4_fsblk_t ext_pblock(struct ext4_extent *ex) +{ + ext4_fsblk_t block; + + block = le32_to_cpu(ex->ee_start_lo); + block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1; + return block; +} + +/* + * idx_pblock: + * combine low and high parts of a leaf physical block number into ext4_fsblk_t + */ +ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix) +{ + ext4_fsblk_t block; + + block = le32_to_cpu(ix->ei_leaf_lo); + block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1; + return block; +} + +/* + * ext4_ext_store_pblock: + * stores a large physical block number into an extent struct, + * breaking it into parts + */ +void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb) +{ + ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); + ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); +} + +/* + * ext4_idx_store_pblock: + * stores a large physical block number into an index struct, + * breaking it into parts + */ +static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) +{ + ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); + ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); +} + +static int ext4_ext_journal_restart(handle_t *handle, int needed) +{ + int err; + + if (handle->h_buffer_credits > needed) + return 0; + err = ext4_journal_extend(handle, needed); + if (err <= 0) + return err; + return ext4_journal_restart(handle, needed); +} + +/* + * could return: + * - EROFS + * - ENOMEM + */ +static int ext4_ext_get_access(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + if (path->p_bh) { + /* path points to block */ + return ext4_journal_get_write_access(handle, path->p_bh); + } + /* path points to leaf/index in inode body */ + /* we use in-core data, no need to protect them */ + return 0; +} + +/* + * could return: + * - EROFS + * - ENOMEM + * - EIO + */ +static int ext4_ext_dirty(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + int err; + if (path->p_bh) { + /* path points to block */ + err = ext4_journal_dirty_metadata(handle, path->p_bh); + } else { + /* path points to leaf/index in inode body */ + err = ext4_mark_inode_dirty(handle, inode); + } + return err; +} + +static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t block) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; + ext4_grpblk_t colour; + int depth; + + if (path) { + struct ext4_extent *ex; + depth = path->p_depth; + + /* try to predict block placement */ + ex = path[depth].p_ext; + if (ex) + return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block)); + + /* it looks like index is empty; + * try to find starting block from index itself */ + if (path[depth].p_bh) + return path[depth].p_bh->b_blocknr; + } + + /* OK. use inode's group */ + bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + + le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (current->pid % 16) * + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (current->pid % 16) * ((last_block - bg_start) / 16); + return bg_start + colour + block; +} + +/* + * Allocation for a meta data block + */ +static ext4_fsblk_t +ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex, int *err) +{ + ext4_fsblk_t goal, newblock; + + goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); + newblock = ext4_new_meta_block(handle, inode, goal, err); + return newblock; +} + +static int ext4_ext_space_block(struct inode *inode) +{ + int size; + + size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent); +#ifdef AGGRESSIVE_TEST + if (size > 6) + size = 6; +#endif + return size; +} + +static int ext4_ext_space_block_idx(struct inode *inode) +{ + int size; + + size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent_idx); +#ifdef AGGRESSIVE_TEST + if (size > 5) + size = 5; +#endif + return size; +} + +static int ext4_ext_space_root(struct inode *inode) +{ + int size; + + size = sizeof(EXT4_I(inode)->i_data); + size -= sizeof(struct ext4_extent_header); + size /= sizeof(struct ext4_extent); +#ifdef AGGRESSIVE_TEST + if (size > 3) + size = 3; +#endif + return size; +} + +static int ext4_ext_space_root_idx(struct inode *inode) +{ + int size; + + size = sizeof(EXT4_I(inode)->i_data); + size -= sizeof(struct ext4_extent_header); + size /= sizeof(struct ext4_extent_idx); +#ifdef AGGRESSIVE_TEST + if (size > 4) + size = 4; +#endif + return size; +} + +/* + * Calculate the number of metadata blocks needed + * to allocate @blocks + * Worse case is one block per extent + */ +int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) +{ + int lcap, icap, rcap, leafs, idxs, num; + int newextents = blocks; + + rcap = ext4_ext_space_root_idx(inode); + lcap = ext4_ext_space_block(inode); + icap = ext4_ext_space_block_idx(inode); + + /* number of new leaf blocks needed */ + num = leafs = (newextents + lcap - 1) / lcap; + + /* + * Worse case, we need separate index block(s) + * to link all new leaf blocks + */ + idxs = (leafs + icap - 1) / icap; + do { + num += idxs; + idxs = (idxs + icap - 1) / icap; + } while (idxs > rcap); + + return num; +} + +static int +ext4_ext_max_entries(struct inode *inode, int depth) +{ + int max; + + if (depth == ext_depth(inode)) { + if (depth == 0) + max = ext4_ext_space_root(inode); + else + max = ext4_ext_space_root_idx(inode); + } else { + if (depth == 0) + max = ext4_ext_space_block(inode); + else + max = ext4_ext_space_block_idx(inode); + } + + return max; +} + +static int __ext4_ext_check_header(const char *function, struct inode *inode, + struct ext4_extent_header *eh, + int depth) +{ + const char *error_msg; + int max = 0; + + if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { + error_msg = "invalid magic"; + goto corrupted; + } + if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) { + error_msg = "unexpected eh_depth"; + goto corrupted; + } + if (unlikely(eh->eh_max == 0)) { + error_msg = "invalid eh_max"; + goto corrupted; + } + max = ext4_ext_max_entries(inode, depth); + if (unlikely(le16_to_cpu(eh->eh_max) > max)) { + error_msg = "too large eh_max"; + goto corrupted; + } + if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { + error_msg = "invalid eh_entries"; + goto corrupted; + } + return 0; + +corrupted: + ext4_error(inode->i_sb, function, + "bad header in inode #%lu: %s - magic %x, " + "entries %u, max %u(%u), depth %u(%u)", + inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), + max, le16_to_cpu(eh->eh_depth), depth); + + return -EIO; +} + +#define ext4_ext_check_header(inode, eh, depth) \ + __ext4_ext_check_header(__func__, inode, eh, depth) + +#ifdef EXT_DEBUG +static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) +{ + int k, l = path->p_depth; + + ext_debug("path:"); + for (k = 0; k <= l; k++, path++) { + if (path->p_idx) { + ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), + idx_pblock(path->p_idx)); + } else if (path->p_ext) { + ext_debug(" %d:%d:%llu ", + le32_to_cpu(path->p_ext->ee_block), + ext4_ext_get_actual_len(path->p_ext), + ext_pblock(path->p_ext)); + } else + ext_debug(" []"); + } + ext_debug("\n"); +} + +static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) +{ + int depth = ext_depth(inode); + struct ext4_extent_header *eh; + struct ext4_extent *ex; + int i; + + if (!path) + return; + + eh = path[depth].p_hdr; + ex = EXT_FIRST_EXTENT(eh); + + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { + ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex), ext_pblock(ex)); + } + ext_debug("\n"); +} +#else +#define ext4_ext_show_path(inode, path) +#define ext4_ext_show_leaf(inode, path) +#endif + +void ext4_ext_drop_refs(struct ext4_ext_path *path) +{ + int depth = path->p_depth; + int i; + + for (i = 0; i <= depth; i++, path++) + if (path->p_bh) { + brelse(path->p_bh); + path->p_bh = NULL; + } +} + +/* + * ext4_ext_binsearch_idx: + * binary search for the closest index of the given block + * the header must be checked before calling this + */ +static void +ext4_ext_binsearch_idx(struct inode *inode, + struct ext4_ext_path *path, ext4_lblk_t block) +{ + struct ext4_extent_header *eh = path->p_hdr; + struct ext4_extent_idx *r, *l, *m; + + + ext_debug("binsearch for %u(idx): ", block); + + l = EXT_FIRST_INDEX(eh) + 1; + r = EXT_LAST_INDEX(eh); + while (l <= r) { + m = l + (r - l) / 2; + if (block < le32_to_cpu(m->ei_block)) + r = m - 1; + else + l = m + 1; + ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block), + m, le32_to_cpu(m->ei_block), + r, le32_to_cpu(r->ei_block)); + } + + path->p_idx = l - 1; + ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), + idx_pblock(path->p_idx)); + +#ifdef CHECK_BINSEARCH + { + struct ext4_extent_idx *chix, *ix; + int k; + + chix = ix = EXT_FIRST_INDEX(eh); + for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { + if (k != 0 && + le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { + printk(KERN_DEBUG "k=%d, ix=0x%p, " + "first=0x%p\n", k, + ix, EXT_FIRST_INDEX(eh)); + printk(KERN_DEBUG "%u <= %u\n", + le32_to_cpu(ix->ei_block), + le32_to_cpu(ix[-1].ei_block)); + } + BUG_ON(k && le32_to_cpu(ix->ei_block) + <= le32_to_cpu(ix[-1].ei_block)); + if (block < le32_to_cpu(ix->ei_block)) + break; + chix = ix; + } + BUG_ON(chix != path->p_idx); + } +#endif + +} + +/* + * ext4_ext_binsearch: + * binary search for closest extent of the given block + * the header must be checked before calling this + */ +static void +ext4_ext_binsearch(struct inode *inode, + struct ext4_ext_path *path, ext4_lblk_t block) +{ + struct ext4_extent_header *eh = path->p_hdr; + struct ext4_extent *r, *l, *m; + + if (eh->eh_entries == 0) { + /* + * this leaf is empty: + * we get such a leaf in split/add case + */ + return; + } + + ext_debug("binsearch for %u: ", block); + + l = EXT_FIRST_EXTENT(eh) + 1; + r = EXT_LAST_EXTENT(eh); + + while (l <= r) { + m = l + (r - l) / 2; + if (block < le32_to_cpu(m->ee_block)) + r = m - 1; + else + l = m + 1; + ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block), + m, le32_to_cpu(m->ee_block), + r, le32_to_cpu(r->ee_block)); + } + + path->p_ext = l - 1; + ext_debug(" -> %d:%llu:%d ", + le32_to_cpu(path->p_ext->ee_block), + ext_pblock(path->p_ext), + ext4_ext_get_actual_len(path->p_ext)); + +#ifdef CHECK_BINSEARCH + { + struct ext4_extent *chex, *ex; + int k; + + chex = ex = EXT_FIRST_EXTENT(eh); + for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) { + BUG_ON(k && le32_to_cpu(ex->ee_block) + <= le32_to_cpu(ex[-1].ee_block)); + if (block < le32_to_cpu(ex->ee_block)) + break; + chex = ex; + } + BUG_ON(chex != path->p_ext); + } +#endif + +} + +int ext4_ext_tree_init(handle_t *handle, struct inode *inode) +{ + struct ext4_extent_header *eh; + + eh = ext_inode_hdr(inode); + eh->eh_depth = 0; + eh->eh_entries = 0; + eh->eh_magic = EXT4_EXT_MAGIC; + eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode)); + ext4_mark_inode_dirty(handle, inode); + ext4_ext_invalidate_cache(inode); + return 0; +} + +struct ext4_ext_path * +ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, + struct ext4_ext_path *path) +{ + struct ext4_extent_header *eh; + struct buffer_head *bh; + short int depth, i, ppos = 0, alloc = 0; + + eh = ext_inode_hdr(inode); + depth = ext_depth(inode); + if (ext4_ext_check_header(inode, eh, depth)) + return ERR_PTR(-EIO); + + + /* account possible depth increase */ + if (!path) { + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), + GFP_NOFS); + if (!path) + return ERR_PTR(-ENOMEM); + alloc = 1; + } + path[0].p_hdr = eh; + path[0].p_bh = NULL; + + i = depth; + /* walk through the tree */ + while (i) { + ext_debug("depth %d: num %d, max %d\n", + ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); + + ext4_ext_binsearch_idx(inode, path + ppos, block); + path[ppos].p_block = idx_pblock(path[ppos].p_idx); + path[ppos].p_depth = i; + path[ppos].p_ext = NULL; + + bh = sb_bread(inode->i_sb, path[ppos].p_block); + if (!bh) + goto err; + + eh = ext_block_hdr(bh); + ppos++; + BUG_ON(ppos > depth); + path[ppos].p_bh = bh; + path[ppos].p_hdr = eh; + i--; + + if (ext4_ext_check_header(inode, eh, i)) + goto err; + } + + path[ppos].p_depth = i; + path[ppos].p_ext = NULL; + path[ppos].p_idx = NULL; + + /* find extent */ + ext4_ext_binsearch(inode, path + ppos, block); + /* if not an empty leaf */ + if (path[ppos].p_ext) + path[ppos].p_block = ext_pblock(path[ppos].p_ext); + + ext4_ext_show_path(inode, path); + + return path; + +err: + ext4_ext_drop_refs(path); + if (alloc) + kfree(path); + return ERR_PTR(-EIO); +} + +/* + * ext4_ext_insert_index: + * insert new index [@logical;@ptr] into the block at @curp; + * check where to insert: before @curp or after @curp + */ +static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, + struct ext4_ext_path *curp, + int logical, ext4_fsblk_t ptr) +{ + struct ext4_extent_idx *ix; + int len, err; + + err = ext4_ext_get_access(handle, inode, curp); + if (err) + return err; + + BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block)); + len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; + if (logical > le32_to_cpu(curp->p_idx->ei_block)) { + /* insert after */ + if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { + len = (len - 1) * sizeof(struct ext4_extent_idx); + len = len < 0 ? 0 : len; + ext_debug("insert new index %d after: %llu. " + "move %d from 0x%p to 0x%p\n", + logical, ptr, len, + (curp->p_idx + 1), (curp->p_idx + 2)); + memmove(curp->p_idx + 2, curp->p_idx + 1, len); + } + ix = curp->p_idx + 1; + } else { + /* insert before */ + len = len * sizeof(struct ext4_extent_idx); + len = len < 0 ? 0 : len; + ext_debug("insert new index %d before: %llu. " + "move %d from 0x%p to 0x%p\n", + logical, ptr, len, + curp->p_idx, (curp->p_idx + 1)); + memmove(curp->p_idx + 1, curp->p_idx, len); + ix = curp->p_idx; + } + + ix->ei_block = cpu_to_le32(logical); + ext4_idx_store_pblock(ix, ptr); + le16_add_cpu(&curp->p_hdr->eh_entries, 1); + + BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries) + > le16_to_cpu(curp->p_hdr->eh_max)); + BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr)); + + err = ext4_ext_dirty(handle, inode, curp); + ext4_std_error(inode->i_sb, err); + + return err; +} + +/* + * ext4_ext_split: + * inserts new subtree into the path, using free index entry + * at depth @at: + * - allocates all needed blocks (new leaf and all intermediate index blocks) + * - makes decision where to split + * - moves remaining extents and index entries (right to the split point) + * into the newly allocated blocks + * - initializes subtree + */ +static int ext4_ext_split(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext, int at) +{ + struct buffer_head *bh = NULL; + int depth = ext_depth(inode); + struct ext4_extent_header *neh; + struct ext4_extent_idx *fidx; + struct ext4_extent *ex; + int i = at, k, m, a; + ext4_fsblk_t newblock, oldblock; + __le32 border; + ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ + int err = 0; + + /* make decision: where to split? */ + /* FIXME: now decision is simplest: at current extent */ + + /* if current leaf will be split, then we should use + * border from split point */ + BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr)); + if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { + border = path[depth].p_ext[1].ee_block; + ext_debug("leaf will be split." + " next leaf starts at %d\n", + le32_to_cpu(border)); + } else { + border = newext->ee_block; + ext_debug("leaf will be added." + " next leaf starts at %d\n", + le32_to_cpu(border)); + } + + /* + * If error occurs, then we break processing + * and mark filesystem read-only. index won't + * be inserted and tree will be in consistent + * state. Next mount will repair buffers too. + */ + + /* + * Get array to track all allocated blocks. + * We need this to handle errors and free blocks + * upon them. + */ + ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS); + if (!ablocks) + return -ENOMEM; + + /* allocate all needed blocks */ + ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); + for (a = 0; a < depth - at; a++) { + newblock = ext4_ext_new_meta_block(handle, inode, path, + newext, &err); + if (newblock == 0) + goto cleanup; + ablocks[a] = newblock; + } + + /* initialize new leaf */ + newblock = ablocks[--a]; + BUG_ON(newblock == 0); + bh = sb_getblk(inode->i_sb, newblock); + if (!bh) { + err = -EIO; + goto cleanup; + } + lock_buffer(bh); + + err = ext4_journal_get_create_access(handle, bh); + if (err) + goto cleanup; + + neh = ext_block_hdr(bh); + neh->eh_entries = 0; + neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); + neh->eh_magic = EXT4_EXT_MAGIC; + neh->eh_depth = 0; + ex = EXT_FIRST_EXTENT(neh); + + /* move remainder of path[depth] to the new leaf */ + BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max); + /* start copy from next extent */ + /* TODO: we could do it by single memmove */ + m = 0; + path[depth].p_ext++; + while (path[depth].p_ext <= + EXT_MAX_EXTENT(path[depth].p_hdr)) { + ext_debug("move %d:%llu:%d in new leaf %llu\n", + le32_to_cpu(path[depth].p_ext->ee_block), + ext_pblock(path[depth].p_ext), + ext4_ext_get_actual_len(path[depth].p_ext), + newblock); + /*memmove(ex++, path[depth].p_ext++, + sizeof(struct ext4_extent)); + neh->eh_entries++;*/ + path[depth].p_ext++; + m++; + } + if (m) { + memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m); + le16_add_cpu(&neh->eh_entries, m); + } + + set_buffer_uptodate(bh); + unlock_buffer(bh); + + err = ext4_journal_dirty_metadata(handle, bh); + if (err) + goto cleanup; + brelse(bh); + bh = NULL; + + /* correct old leaf */ + if (m) { + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto cleanup; + le16_add_cpu(&path[depth].p_hdr->eh_entries, -m); + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto cleanup; + + } + + /* create intermediate indexes */ + k = depth - at - 1; + BUG_ON(k < 0); + if (k) + ext_debug("create %d intermediate indices\n", k); + /* insert new index into current index block */ + /* current depth stored in i var */ + i = depth - 1; + while (k--) { + oldblock = newblock; + newblock = ablocks[--a]; + bh = sb_getblk(inode->i_sb, newblock); + if (!bh) { + err = -EIO; + goto cleanup; + } + lock_buffer(bh); + + err = ext4_journal_get_create_access(handle, bh); + if (err) + goto cleanup; + + neh = ext_block_hdr(bh); + neh->eh_entries = cpu_to_le16(1); + neh->eh_magic = EXT4_EXT_MAGIC; + neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); + neh->eh_depth = cpu_to_le16(depth - i); + fidx = EXT_FIRST_INDEX(neh); + fidx->ei_block = border; + ext4_idx_store_pblock(fidx, oldblock); + + ext_debug("int.index at %d (block %llu): %u -> %llu\n", + i, newblock, le32_to_cpu(border), oldblock); + /* copy indexes */ + m = 0; + path[i].p_idx++; + + ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, + EXT_MAX_INDEX(path[i].p_hdr)); + BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) != + EXT_LAST_INDEX(path[i].p_hdr)); + while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { + ext_debug("%d: move %d:%llu in new index %llu\n", i, + le32_to_cpu(path[i].p_idx->ei_block), + idx_pblock(path[i].p_idx), + newblock); + /*memmove(++fidx, path[i].p_idx++, + sizeof(struct ext4_extent_idx)); + neh->eh_entries++; + BUG_ON(neh->eh_entries > neh->eh_max);*/ + path[i].p_idx++; + m++; + } + if (m) { + memmove(++fidx, path[i].p_idx - m, + sizeof(struct ext4_extent_idx) * m); + le16_add_cpu(&neh->eh_entries, m); + } + set_buffer_uptodate(bh); + unlock_buffer(bh); + + err = ext4_journal_dirty_metadata(handle, bh); + if (err) + goto cleanup; + brelse(bh); + bh = NULL; + + /* correct old index */ + if (m) { + err = ext4_ext_get_access(handle, inode, path + i); + if (err) + goto cleanup; + le16_add_cpu(&path[i].p_hdr->eh_entries, -m); + err = ext4_ext_dirty(handle, inode, path + i); + if (err) + goto cleanup; + } + + i--; + } + + /* insert new index */ + err = ext4_ext_insert_index(handle, inode, path + at, + le32_to_cpu(border), newblock); + +cleanup: + if (bh) { + if (buffer_locked(bh)) + unlock_buffer(bh); + brelse(bh); + } + + if (err) { + /* free all allocated blocks in error case */ + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; + ext4_free_blocks(handle, inode, ablocks[i], 1, 1); + } + } + kfree(ablocks); + + return err; +} + +/* + * ext4_ext_grow_indepth: + * implements tree growing procedure: + * - allocates new block + * - moves top-level data (index block or leaf) into the new block + * - initializes new top-level, creating index that points to the + * just created block + */ +static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext) +{ + struct ext4_ext_path *curp = path; + struct ext4_extent_header *neh; + struct ext4_extent_idx *fidx; + struct buffer_head *bh; + ext4_fsblk_t newblock; + int err = 0; + + newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err); + if (newblock == 0) + return err; + + bh = sb_getblk(inode->i_sb, newblock); + if (!bh) { + err = -EIO; + ext4_std_error(inode->i_sb, err); + return err; + } + lock_buffer(bh); + + err = ext4_journal_get_create_access(handle, bh); + if (err) { + unlock_buffer(bh); + goto out; + } + + /* move top-level index/leaf into new block */ + memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data)); + + /* set size of new block */ + neh = ext_block_hdr(bh); + /* old root could have indexes or leaves + * so calculate e_max right way */ + if (ext_depth(inode)) + neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); + else + neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); + neh->eh_magic = EXT4_EXT_MAGIC; + set_buffer_uptodate(bh); + unlock_buffer(bh); + + err = ext4_journal_dirty_metadata(handle, bh); + if (err) + goto out; + + /* create index in new top-level index: num,max,pointer */ + err = ext4_ext_get_access(handle, inode, curp); + if (err) + goto out; + + curp->p_hdr->eh_magic = EXT4_EXT_MAGIC; + curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode)); + curp->p_hdr->eh_entries = cpu_to_le16(1); + curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); + + if (path[0].p_hdr->eh_depth) + curp->p_idx->ei_block = + EXT_FIRST_INDEX(path[0].p_hdr)->ei_block; + else + curp->p_idx->ei_block = + EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; + ext4_idx_store_pblock(curp->p_idx, newblock); + + neh = ext_inode_hdr(inode); + fidx = EXT_FIRST_INDEX(neh); + ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", + le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), + le32_to_cpu(fidx->ei_block), idx_pblock(fidx)); + + neh->eh_depth = cpu_to_le16(path->p_depth + 1); + err = ext4_ext_dirty(handle, inode, curp); +out: + brelse(bh); + + return err; +} + +/* + * ext4_ext_create_new_leaf: + * finds empty index and adds new leaf. + * if no free index is found, then it requests in-depth growing. + */ +static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext) +{ + struct ext4_ext_path *curp; + int depth, i, err = 0; + +repeat: + i = depth = ext_depth(inode); + + /* walk up to the tree and look for free index entry */ + curp = path + depth; + while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { + i--; + curp--; + } + + /* we use already allocated block for index block, + * so subsequent data blocks should be contiguous */ + if (EXT_HAS_FREE_INDEX(curp)) { + /* if we found index with free entry, then use that + * entry: create all needed subtree and add new leaf */ + err = ext4_ext_split(handle, inode, path, newext, i); + if (err) + goto out; + + /* refill path */ + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, + (ext4_lblk_t)le32_to_cpu(newext->ee_block), + path); + if (IS_ERR(path)) + err = PTR_ERR(path); + } else { + /* tree is full, time to grow in depth */ + err = ext4_ext_grow_indepth(handle, inode, path, newext); + if (err) + goto out; + + /* refill path */ + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, + (ext4_lblk_t)le32_to_cpu(newext->ee_block), + path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + + /* + * only first (depth 0 -> 1) produces free space; + * in all other cases we have to split the grown tree + */ + depth = ext_depth(inode); + if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { + /* now we need to split */ + goto repeat; + } + } + +out: + return err; +} + +/* + * search the closest allocated block to the left for *logical + * and returns it at @logical + it's physical address at @phys + * if *logical is the smallest allocated block, the function + * returns 0 at @phys + * return value contains 0 (success) or error code + */ +int +ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, + ext4_lblk_t *logical, ext4_fsblk_t *phys) +{ + struct ext4_extent_idx *ix; + struct ext4_extent *ex; + int depth, ee_len; + + BUG_ON(path == NULL); + depth = path->p_depth; + *phys = 0; + + if (depth == 0 && path->p_ext == NULL) + return 0; + + /* usually extent in the path covers blocks smaller + * then *logical, but it can be that extent is the + * first one in the file */ + + ex = path[depth].p_ext; + ee_len = ext4_ext_get_actual_len(ex); + if (*logical < le32_to_cpu(ex->ee_block)) { + BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); + while (--depth >= 0) { + ix = path[depth].p_idx; + BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); + } + return 0; + } + + BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); + + *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; + *phys = ext_pblock(ex) + ee_len - 1; + return 0; +} + +/* + * search the closest allocated block to the right for *logical + * and returns it at @logical + it's physical address at @phys + * if *logical is the smallest allocated block, the function + * returns 0 at @phys + * return value contains 0 (success) or error code + */ +int +ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, + ext4_lblk_t *logical, ext4_fsblk_t *phys) +{ + struct buffer_head *bh = NULL; + struct ext4_extent_header *eh; + struct ext4_extent_idx *ix; + struct ext4_extent *ex; + ext4_fsblk_t block; + int depth, ee_len; + + BUG_ON(path == NULL); + depth = path->p_depth; + *phys = 0; + + if (depth == 0 && path->p_ext == NULL) + return 0; + + /* usually extent in the path covers blocks smaller + * then *logical, but it can be that extent is the + * first one in the file */ + + ex = path[depth].p_ext; + ee_len = ext4_ext_get_actual_len(ex); + if (*logical < le32_to_cpu(ex->ee_block)) { + BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); + while (--depth >= 0) { + ix = path[depth].p_idx; + BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); + } + *logical = le32_to_cpu(ex->ee_block); + *phys = ext_pblock(ex); + return 0; + } + + BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); + + if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { + /* next allocated block in this leaf */ + ex++; + *logical = le32_to_cpu(ex->ee_block); + *phys = ext_pblock(ex); + return 0; + } + + /* go up and search for index to the right */ + while (--depth >= 0) { + ix = path[depth].p_idx; + if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) + break; + } + + if (depth < 0) { + /* we've gone up to the root and + * found no index to the right */ + return 0; + } + + /* we've found index to the right, let's + * follow it and find the closest allocated + * block to the right */ + ix++; + block = idx_pblock(ix); + while (++depth < path->p_depth) { + bh = sb_bread(inode->i_sb, block); + if (bh == NULL) + return -EIO; + eh = ext_block_hdr(bh); + if (ext4_ext_check_header(inode, eh, depth)) { + put_bh(bh); + return -EIO; + } + ix = EXT_FIRST_INDEX(eh); + block = idx_pblock(ix); + put_bh(bh); + } + + bh = sb_bread(inode->i_sb, block); + if (bh == NULL) + return -EIO; + eh = ext_block_hdr(bh); + if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { + put_bh(bh); + return -EIO; + } + ex = EXT_FIRST_EXTENT(eh); + *logical = le32_to_cpu(ex->ee_block); + *phys = ext_pblock(ex); + put_bh(bh); + return 0; + +} + +/* + * ext4_ext_next_allocated_block: + * returns allocated block in subsequent extent or EXT_MAX_BLOCK. + * NOTE: it considers block number from index entry as + * allocated block. Thus, index entries have to be consistent + * with leaves. + */ +static ext4_lblk_t +ext4_ext_next_allocated_block(struct ext4_ext_path *path) +{ + int depth; + + BUG_ON(path == NULL); + depth = path->p_depth; + + if (depth == 0 && path->p_ext == NULL) + return EXT_MAX_BLOCK; + + while (depth >= 0) { + if (depth == path->p_depth) { + /* leaf */ + if (path[depth].p_ext != + EXT_LAST_EXTENT(path[depth].p_hdr)) + return le32_to_cpu(path[depth].p_ext[1].ee_block); + } else { + /* index */ + if (path[depth].p_idx != + EXT_LAST_INDEX(path[depth].p_hdr)) + return le32_to_cpu(path[depth].p_idx[1].ei_block); + } + depth--; + } + + return EXT_MAX_BLOCK; +} + +/* + * ext4_ext_next_leaf_block: + * returns first allocated block from next leaf or EXT_MAX_BLOCK + */ +static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, + struct ext4_ext_path *path) +{ + int depth; + + BUG_ON(path == NULL); + depth = path->p_depth; + + /* zero-tree has no leaf blocks at all */ + if (depth == 0) + return EXT_MAX_BLOCK; + + /* go to index block */ + depth--; + + while (depth >= 0) { + if (path[depth].p_idx != + EXT_LAST_INDEX(path[depth].p_hdr)) + return (ext4_lblk_t) + le32_to_cpu(path[depth].p_idx[1].ei_block); + depth--; + } + + return EXT_MAX_BLOCK; +} + +/* + * ext4_ext_correct_indexes: + * if leaf gets modified and modified extent is first in the leaf, + * then we have to correct all indexes above. + * TODO: do we need to correct tree in all cases? + */ +static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + struct ext4_extent_header *eh; + int depth = ext_depth(inode); + struct ext4_extent *ex; + __le32 border; + int k, err = 0; + + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + BUG_ON(ex == NULL); + BUG_ON(eh == NULL); + + if (depth == 0) { + /* there is no tree at all */ + return 0; + } + + if (ex != EXT_FIRST_EXTENT(eh)) { + /* we correct tree if first leaf got modified only */ + return 0; + } + + /* + * TODO: we need correction if border is smaller than current one + */ + k = depth - 1; + border = path[depth].p_ext->ee_block; + err = ext4_ext_get_access(handle, inode, path + k); + if (err) + return err; + path[k].p_idx->ei_block = border; + err = ext4_ext_dirty(handle, inode, path + k); + if (err) + return err; + + while (k--) { + /* change all left-side indexes */ + if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) + break; + err = ext4_ext_get_access(handle, inode, path + k); + if (err) + break; + path[k].p_idx->ei_block = border; + err = ext4_ext_dirty(handle, inode, path + k); + if (err) + break; + } + + return err; +} + +static int +ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, + struct ext4_extent *ex2) +{ + unsigned short ext1_ee_len, ext2_ee_len, max_len; + + /* + * Make sure that either both extents are uninitialized, or + * both are _not_. + */ + if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) + return 0; + + if (ext4_ext_is_uninitialized(ex1)) + max_len = EXT_UNINIT_MAX_LEN; + else + max_len = EXT_INIT_MAX_LEN; + + ext1_ee_len = ext4_ext_get_actual_len(ex1); + ext2_ee_len = ext4_ext_get_actual_len(ex2); + + if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != + le32_to_cpu(ex2->ee_block)) + return 0; + + /* + * To allow future support for preallocated extents to be added + * as an RO_COMPAT feature, refuse to merge to extents if + * this can result in the top bit of ee_len being set. + */ + if (ext1_ee_len + ext2_ee_len > max_len) + return 0; +#ifdef AGGRESSIVE_TEST + if (ext1_ee_len >= 4) + return 0; +#endif + + if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2)) + return 1; + return 0; +} + +/* + * This function tries to merge the "ex" extent to the next extent in the tree. + * It always tries to merge towards right. If you want to merge towards + * left, pass "ex - 1" as argument instead of "ex". + * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns + * 1 if they got merged. + */ +int ext4_ext_try_to_merge(struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex) +{ + struct ext4_extent_header *eh; + unsigned int depth, len; + int merge_done = 0; + int uninitialized = 0; + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + eh = path[depth].p_hdr; + + while (ex < EXT_LAST_EXTENT(eh)) { + if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) + break; + /* merge with next extent! */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(ex + 1)); + if (uninitialized) + ext4_ext_mark_uninitialized(ex); + + if (ex + 1 < EXT_LAST_EXTENT(eh)) { + len = (EXT_LAST_EXTENT(eh) - ex - 1) + * sizeof(struct ext4_extent); + memmove(ex + 1, ex + 2, len); + } + le16_add_cpu(&eh->eh_entries, -1); + merge_done = 1; + WARN_ON(eh->eh_entries == 0); + if (!eh->eh_entries) + ext4_error(inode->i_sb, "ext4_ext_try_to_merge", + "inode#%lu, eh->eh_entries = 0!", inode->i_ino); + } + + return merge_done; +} + +/* + * check if a portion of the "newext" extent overlaps with an + * existing extent. + * + * If there is an overlap discovered, it updates the length of the newext + * such that there will be no overlap, and then returns 1. + * If there is no overlap found, it returns 0. + */ +unsigned int ext4_ext_check_overlap(struct inode *inode, + struct ext4_extent *newext, + struct ext4_ext_path *path) +{ + ext4_lblk_t b1, b2; + unsigned int depth, len1; + unsigned int ret = 0; + + b1 = le32_to_cpu(newext->ee_block); + len1 = ext4_ext_get_actual_len(newext); + depth = ext_depth(inode); + if (!path[depth].p_ext) + goto out; + b2 = le32_to_cpu(path[depth].p_ext->ee_block); + + /* + * get the next allocated block if the extent in the path + * is before the requested block(s) + */ + if (b2 < b1) { + b2 = ext4_ext_next_allocated_block(path); + if (b2 == EXT_MAX_BLOCK) + goto out; + } + + /* check for wrap through zero on extent logical start block*/ + if (b1 + len1 < b1) { + len1 = EXT_MAX_BLOCK - b1; + newext->ee_len = cpu_to_le16(len1); + ret = 1; + } + + /* check for overlap */ + if (b1 + len1 > b2) { + newext->ee_len = cpu_to_le16(b2 - b1); + ret = 1; + } +out: + return ret; +} + +/* + * ext4_ext_insert_extent: + * tries to merge requsted extent into the existing extent or + * inserts requested extent as new one into the tree, + * creating new leaf in the no-space case. + */ +int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext) +{ + struct ext4_extent_header *eh; + struct ext4_extent *ex, *fex; + struct ext4_extent *nearex; /* nearest extent */ + struct ext4_ext_path *npath = NULL; + int depth, len, err; + ext4_lblk_t next; + unsigned uninitialized = 0; + + BUG_ON(ext4_ext_get_actual_len(newext) == 0); + depth = ext_depth(inode); + ex = path[depth].p_ext; + BUG_ON(path[depth].p_hdr == NULL); + + /* try to insert block into found extent and return */ + if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { + ext_debug("append %d block to %d:%d (from %llu)\n", + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex), ext_pblock(ex)); + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + return err; + + /* + * ext4_can_extents_be_merged should have checked that either + * both extents are uninitialized, or both aren't. Thus we + * need to check only one of them here. + */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(newext)); + if (uninitialized) + ext4_ext_mark_uninitialized(ex); + eh = path[depth].p_hdr; + nearex = ex; + goto merge; + } + +repeat: + depth = ext_depth(inode); + eh = path[depth].p_hdr; + if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) + goto has_space; + + /* probably next leaf has space for us? */ + fex = EXT_LAST_EXTENT(eh); + next = ext4_ext_next_leaf_block(inode, path); + if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block) + && next != EXT_MAX_BLOCK) { + ext_debug("next leaf block - %d\n", next); + BUG_ON(npath != NULL); + npath = ext4_ext_find_extent(inode, next, NULL); + if (IS_ERR(npath)) + return PTR_ERR(npath); + BUG_ON(npath->p_depth != path->p_depth); + eh = npath[depth].p_hdr; + if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { + ext_debug("next leaf isnt full(%d)\n", + le16_to_cpu(eh->eh_entries)); + path = npath; + goto repeat; + } + ext_debug("next leaf has no free space(%d,%d)\n", + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); + } + + /* + * There is no free space in the found leaf. + * We're gonna add a new leaf in the tree. + */ + err = ext4_ext_create_new_leaf(handle, inode, path, newext); + if (err) + goto cleanup; + depth = ext_depth(inode); + eh = path[depth].p_hdr; + +has_space: + nearex = path[depth].p_ext; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto cleanup; + + if (!nearex) { + /* there is no extent in this leaf, create first one */ + ext_debug("first extent in the leaf: %d:%llu:%d\n", + le32_to_cpu(newext->ee_block), + ext_pblock(newext), + ext4_ext_get_actual_len(newext)); + path[depth].p_ext = EXT_FIRST_EXTENT(eh); + } else if (le32_to_cpu(newext->ee_block) + > le32_to_cpu(nearex->ee_block)) { +/* BUG_ON(newext->ee_block == nearex->ee_block); */ + if (nearex != EXT_LAST_EXTENT(eh)) { + len = EXT_MAX_EXTENT(eh) - nearex; + len = (len - 1) * sizeof(struct ext4_extent); + len = len < 0 ? 0 : len; + ext_debug("insert %d:%llu:%d after: nearest 0x%p, " + "move %d from 0x%p to 0x%p\n", + le32_to_cpu(newext->ee_block), + ext_pblock(newext), + ext4_ext_get_actual_len(newext), + nearex, len, nearex + 1, nearex + 2); + memmove(nearex + 2, nearex + 1, len); + } + path[depth].p_ext = nearex + 1; + } else { + BUG_ON(newext->ee_block == nearex->ee_block); + len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent); + len = len < 0 ? 0 : len; + ext_debug("insert %d:%llu:%d before: nearest 0x%p, " + "move %d from 0x%p to 0x%p\n", + le32_to_cpu(newext->ee_block), + ext_pblock(newext), + ext4_ext_get_actual_len(newext), + nearex, len, nearex + 1, nearex + 2); + memmove(nearex + 1, nearex, len); + path[depth].p_ext = nearex; + } + + le16_add_cpu(&eh->eh_entries, 1); + nearex = path[depth].p_ext; + nearex->ee_block = newext->ee_block; + ext4_ext_store_pblock(nearex, ext_pblock(newext)); + nearex->ee_len = newext->ee_len; + +merge: + /* try to merge extents to the right */ + ext4_ext_try_to_merge(inode, path, nearex); + + /* try to merge extents to the left */ + + /* time to correct all indexes above */ + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto cleanup; + + err = ext4_ext_dirty(handle, inode, path + depth); + +cleanup: + if (npath) { + ext4_ext_drop_refs(npath); + kfree(npath); + } + ext4_ext_tree_changed(inode); + ext4_ext_invalidate_cache(inode); + return err; +} + +int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, + ext4_lblk_t num, ext_prepare_callback func, + void *cbdata) +{ + struct ext4_ext_path *path = NULL; + struct ext4_ext_cache cbex; + struct ext4_extent *ex; + ext4_lblk_t next, start = 0, end = 0; + ext4_lblk_t last = block + num; + int depth, exists, err = 0; + + BUG_ON(func == NULL); + BUG_ON(inode == NULL); + + while (block < last && block != EXT_MAX_BLOCK) { + num = last - block; + /* find extent for this block */ + path = ext4_ext_find_extent(inode, block, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + break; + } + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + ex = path[depth].p_ext; + next = ext4_ext_next_allocated_block(path); + + exists = 0; + if (!ex) { + /* there is no extent yet, so try to allocate + * all requested space */ + start = block; + end = block + num; + } else if (le32_to_cpu(ex->ee_block) > block) { + /* need to allocate space before found extent */ + start = block; + end = le32_to_cpu(ex->ee_block); + if (block + num < end) + end = block + num; + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { + /* need to allocate space after found extent */ + start = block; + end = block + num; + if (end >= next) + end = next; + } else if (block >= le32_to_cpu(ex->ee_block)) { + /* + * some part of requested space is covered + * by found extent + */ + start = block; + end = le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex); + if (block + num < end) + end = block + num; + exists = 1; + } else { + BUG(); + } + BUG_ON(end <= start); + + if (!exists) { + cbex.ec_block = start; + cbex.ec_len = end - start; + cbex.ec_start = 0; + cbex.ec_type = EXT4_EXT_CACHE_GAP; + } else { + cbex.ec_block = le32_to_cpu(ex->ee_block); + cbex.ec_len = ext4_ext_get_actual_len(ex); + cbex.ec_start = ext_pblock(ex); + cbex.ec_type = EXT4_EXT_CACHE_EXTENT; + } + + BUG_ON(cbex.ec_len == 0); + err = func(inode, path, &cbex, ex, cbdata); + ext4_ext_drop_refs(path); + + if (err < 0) + break; + + if (err == EXT_REPEAT) + continue; + else if (err == EXT_BREAK) { + err = 0; + break; + } + + if (ext_depth(inode) != depth) { + /* depth was changed. we have to realloc path */ + kfree(path); + path = NULL; + } + + block = cbex.ec_block + cbex.ec_len; + } + + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + + return err; +} + +static void +ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, + __u32 len, ext4_fsblk_t start, int type) +{ + struct ext4_ext_cache *cex; + BUG_ON(len == 0); + cex = &EXT4_I(inode)->i_cached_extent; + cex->ec_type = type; + cex->ec_block = block; + cex->ec_len = len; + cex->ec_start = start; +} + +/* + * ext4_ext_put_gap_in_cache: + * calculate boundaries of the gap that the requested block fits into + * and cache this gap + */ +static void +ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, + ext4_lblk_t block) +{ + int depth = ext_depth(inode); + unsigned long len; + ext4_lblk_t lblock; + struct ext4_extent *ex; + + ex = path[depth].p_ext; + if (ex == NULL) { + /* there is no extent yet, so gap is [0;-] */ + lblock = 0; + len = EXT_MAX_BLOCK; + ext_debug("cache gap(whole file):"); + } else if (block < le32_to_cpu(ex->ee_block)) { + lblock = block; + len = le32_to_cpu(ex->ee_block) - block; + ext_debug("cache gap(before): %u [%u:%u]", + block, + le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex)); + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { + ext4_lblk_t next; + lblock = le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex); + + next = ext4_ext_next_allocated_block(path); + ext_debug("cache gap(after): [%u:%u] %u", + le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex), + block); + BUG_ON(next == lblock); + len = next - lblock; + } else { + lblock = len = 0; + BUG(); + } + + ext_debug(" -> %u:%lu\n", lblock, len); + ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP); +} + +static int +ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, + struct ext4_extent *ex) +{ + struct ext4_ext_cache *cex; + + cex = &EXT4_I(inode)->i_cached_extent; + + /* has cache valid data? */ + if (cex->ec_type == EXT4_EXT_CACHE_NO) + return EXT4_EXT_CACHE_NO; + + BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && + cex->ec_type != EXT4_EXT_CACHE_EXTENT); + if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { + ex->ee_block = cpu_to_le32(cex->ec_block); + ext4_ext_store_pblock(ex, cex->ec_start); + ex->ee_len = cpu_to_le16(cex->ec_len); + ext_debug("%u cached by %u:%u:%llu\n", + block, + cex->ec_block, cex->ec_len, cex->ec_start); + return cex->ec_type; + } + + /* not in cache */ + return EXT4_EXT_CACHE_NO; +} + +/* + * ext4_ext_rm_idx: + * removes index from the index block. + * It's used in truncate case only, thus all requests are for + * last index in the block only. + */ +static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + struct buffer_head *bh; + int err; + ext4_fsblk_t leaf; + + /* free index block */ + path--; + leaf = idx_pblock(path->p_idx); + BUG_ON(path->p_hdr->eh_entries == 0); + err = ext4_ext_get_access(handle, inode, path); + if (err) + return err; + le16_add_cpu(&path->p_hdr->eh_entries, -1); + err = ext4_ext_dirty(handle, inode, path); + if (err) + return err; + ext_debug("index is empty, remove it, free block %llu\n", leaf); + bh = sb_find_get_block(inode->i_sb, leaf); + ext4_forget(handle, 1, inode, bh, leaf); + ext4_free_blocks(handle, inode, leaf, 1, 1); + return err; +} + +/* + * ext4_ext_calc_credits_for_single_extent: + * This routine returns max. credits that needed to insert an extent + * to the extent tree. + * When pass the actual path, the caller should calculate credits + * under i_data_sem. + */ +int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, + struct ext4_ext_path *path) +{ + if (path) { + int depth = ext_depth(inode); + int ret = 0; + + /* probably there is space in leaf? */ + if (le16_to_cpu(path[depth].p_hdr->eh_entries) + < le16_to_cpu(path[depth].p_hdr->eh_max)) { + + /* + * There are some space in the leaf tree, no + * need to account for leaf block credit + * + * bitmaps and block group descriptor blocks + * and other metadat blocks still need to be + * accounted. + */ + /* 1 bitmap, 1 block group descriptor */ + ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); + } + } + + return ext4_chunk_trans_blocks(inode, nrblocks); +} + +/* + * How many index/leaf blocks need to change/allocate to modify nrblocks? + * + * if nrblocks are fit in a single extent (chunk flag is 1), then + * in the worse case, each tree level index/leaf need to be changed + * if the tree split due to insert a new extent, then the old tree + * index/leaf need to be updated too + * + * If the nrblocks are discontiguous, they could cause + * the whole tree split more than once, but this is really rare. + */ +int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int index; + int depth = ext_depth(inode); + + if (chunk) + index = depth * 2; + else + index = depth * 3; + + return index; +} + +static int ext4_remove_blocks(handle_t *handle, struct inode *inode, + struct ext4_extent *ex, + ext4_lblk_t from, ext4_lblk_t to) +{ + struct buffer_head *bh; + unsigned short ee_len = ext4_ext_get_actual_len(ex); + int i, metadata = 0; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + metadata = 1; +#ifdef EXTENTS_STATS + { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + spin_lock(&sbi->s_ext_stats_lock); + sbi->s_ext_blocks += ee_len; + sbi->s_ext_extents++; + if (ee_len < sbi->s_ext_min) + sbi->s_ext_min = ee_len; + if (ee_len > sbi->s_ext_max) + sbi->s_ext_max = ee_len; + if (ext_depth(inode) > sbi->s_depth_max) + sbi->s_depth_max = ext_depth(inode); + spin_unlock(&sbi->s_ext_stats_lock); + } +#endif + if (from >= le32_to_cpu(ex->ee_block) + && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { + /* tail removal */ + ext4_lblk_t num; + ext4_fsblk_t start; + + num = le32_to_cpu(ex->ee_block) + ee_len - from; + start = ext_pblock(ex) + ee_len - num; + ext_debug("free last %u blocks starting %llu\n", num, start); + for (i = 0; i < num; i++) { + bh = sb_find_get_block(inode->i_sb, start + i); + ext4_forget(handle, 0, inode, bh, start + i); + } + ext4_free_blocks(handle, inode, start, num, metadata); + } else if (from == le32_to_cpu(ex->ee_block) + && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { + printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", + from, to, le32_to_cpu(ex->ee_block), ee_len); + } else { + printk(KERN_INFO "strange request: removal(2) " + "%u-%u from %u:%u\n", + from, to, le32_to_cpu(ex->ee_block), ee_len); + } + return 0; +} + +static int +ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, ext4_lblk_t start) +{ + int err = 0, correct_index = 0; + int depth = ext_depth(inode), credits; + struct ext4_extent_header *eh; + ext4_lblk_t a, b, block; + unsigned num; + ext4_lblk_t ex_ee_block; + unsigned short ex_ee_len; + unsigned uninitialized = 0; + struct ext4_extent *ex; + + /* the header must be checked already in ext4_ext_remove_space() */ + ext_debug("truncate since %u in leaf\n", start); + if (!path[depth].p_hdr) + path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); + eh = path[depth].p_hdr; + BUG_ON(eh == NULL); + + /* find where to start removing */ + ex = EXT_LAST_EXTENT(eh); + + ex_ee_block = le32_to_cpu(ex->ee_block); + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex_ee_len = ext4_ext_get_actual_len(ex); + + while (ex >= EXT_FIRST_EXTENT(eh) && + ex_ee_block + ex_ee_len > start) { + ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len); + path[depth].p_ext = ex; + + a = ex_ee_block > start ? ex_ee_block : start; + b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ? + ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK; + + ext_debug(" border %u:%u\n", a, b); + + if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) { + block = 0; + num = 0; + BUG(); + } else if (a != ex_ee_block) { + /* remove tail of the extent */ + block = ex_ee_block; + num = a - block; + } else if (b != ex_ee_block + ex_ee_len - 1) { + /* remove head of the extent */ + block = a; + num = b - a; + /* there is no "make a hole" API yet */ + BUG(); + } else { + /* remove whole extent: excellent! */ + block = ex_ee_block; + num = 0; + BUG_ON(a != ex_ee_block); + BUG_ON(b != ex_ee_block + ex_ee_len - 1); + } + + /* + * 3 for leaf, sb, and inode plus 2 (bmap and group + * descriptor) for each block group; assume two block + * groups plus ex_ee_len/blocks_per_block_group for + * the worst case + */ + credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); + if (ex == EXT_FIRST_EXTENT(eh)) { + correct_index = 1; + credits += (ext_depth(inode)) + 1; + } + credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); + + err = ext4_ext_journal_restart(handle, credits); + if (err) + goto out; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + err = ext4_remove_blocks(handle, inode, ex, a, b); + if (err) + goto out; + + if (num == 0) { + /* this extent is removed; mark slot entirely unused */ + ext4_ext_store_pblock(ex, 0); + le16_add_cpu(&eh->eh_entries, -1); + } + + ex->ee_block = cpu_to_le32(block); + ex->ee_len = cpu_to_le16(num); + /* + * Do not mark uninitialized if all the blocks in the + * extent have been removed. + */ + if (uninitialized && num) + ext4_ext_mark_uninitialized(ex); + + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + ext_debug("new extent: %u:%u:%llu\n", block, num, + ext_pblock(ex)); + ex--; + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = ext4_ext_get_actual_len(ex); + } + + if (correct_index && eh->eh_entries) + err = ext4_ext_correct_indexes(handle, inode, path); + + /* if this leaf is free, then we should + * remove it from index block above */ + if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) + err = ext4_ext_rm_idx(handle, inode, path + depth); + +out: + return err; +} + +/* + * ext4_ext_more_to_rm: + * returns 1 if current index has to be freed (even partial) + */ +static int +ext4_ext_more_to_rm(struct ext4_ext_path *path) +{ + BUG_ON(path->p_idx == NULL); + + if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) + return 0; + + /* + * if truncate on deeper level happened, it wasn't partial, + * so we have to consider current index for truncation + */ + if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) + return 0; + return 1; +} + +static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) +{ + struct super_block *sb = inode->i_sb; + int depth = ext_depth(inode); + struct ext4_ext_path *path; + handle_t *handle; + int i = 0, err = 0; + + ext_debug("truncate since %u\n", start); + + /* probably first extent we're gonna free will be last in block */ + handle = ext4_journal_start(inode, depth + 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ext4_ext_invalidate_cache(inode); + + /* + * We start scanning from right side, freeing all the blocks + * after i_size and walking into the tree depth-wise. + */ + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); + if (path == NULL) { + ext4_journal_stop(handle); + return -ENOMEM; + } + path[0].p_hdr = ext_inode_hdr(inode); + if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) { + err = -EIO; + goto out; + } + path[0].p_depth = depth; + + while (i >= 0 && err == 0) { + if (i == depth) { + /* this is leaf block */ + err = ext4_ext_rm_leaf(handle, inode, path, start); + /* root level has p_bh == NULL, brelse() eats this */ + brelse(path[i].p_bh); + path[i].p_bh = NULL; + i--; + continue; + } + + /* this is index block */ + if (!path[i].p_hdr) { + ext_debug("initialize header\n"); + path[i].p_hdr = ext_block_hdr(path[i].p_bh); + } + + if (!path[i].p_idx) { + /* this level hasn't been touched yet */ + path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); + path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; + ext_debug("init index ptr: hdr 0x%p, num %d\n", + path[i].p_hdr, + le16_to_cpu(path[i].p_hdr->eh_entries)); + } else { + /* we were already here, see at next index */ + path[i].p_idx--; + } + + ext_debug("level %d - index, first 0x%p, cur 0x%p\n", + i, EXT_FIRST_INDEX(path[i].p_hdr), + path[i].p_idx); + if (ext4_ext_more_to_rm(path + i)) { + struct buffer_head *bh; + /* go to the next level */ + ext_debug("move to level %d (block %llu)\n", + i + 1, idx_pblock(path[i].p_idx)); + memset(path + i + 1, 0, sizeof(*path)); + bh = sb_bread(sb, idx_pblock(path[i].p_idx)); + if (!bh) { + /* should we reset i_size? */ + err = -EIO; + break; + } + if (WARN_ON(i + 1 > depth)) { + err = -EIO; + break; + } + if (ext4_ext_check_header(inode, ext_block_hdr(bh), + depth - i - 1)) { + err = -EIO; + break; + } + path[i + 1].p_bh = bh; + + /* save actual number of indexes since this + * number is changed at the next iteration */ + path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); + i++; + } else { + /* we finished processing this index, go up */ + if (path[i].p_hdr->eh_entries == 0 && i > 0) { + /* index is empty, remove it; + * handle must be already prepared by the + * truncatei_leaf() */ + err = ext4_ext_rm_idx(handle, inode, path + i); + } + /* root level has p_bh == NULL, brelse() eats this */ + brelse(path[i].p_bh); + path[i].p_bh = NULL; + i--; + ext_debug("return to level %d\n", i); + } + } + + /* TODO: flexible tree reduction should be here */ + if (path->p_hdr->eh_entries == 0) { + /* + * truncate to zero freed all the tree, + * so we need to correct eh_depth + */ + err = ext4_ext_get_access(handle, inode, path); + if (err == 0) { + ext_inode_hdr(inode)->eh_depth = 0; + ext_inode_hdr(inode)->eh_max = + cpu_to_le16(ext4_ext_space_root(inode)); + err = ext4_ext_dirty(handle, inode, path); + } + } +out: + ext4_ext_tree_changed(inode); + ext4_ext_drop_refs(path); + kfree(path); + ext4_journal_stop(handle); + + return err; +} + +/* + * called at mount time + */ +void ext4_ext_init(struct super_block *sb) +{ + /* + * possible initialization would be here + */ + + if (test_opt(sb, EXTENTS)) { + printk(KERN_INFO "EXT4-fs: file extents enabled"); +#ifdef AGGRESSIVE_TEST + printk(", aggressive tests"); +#endif +#ifdef CHECK_BINSEARCH + printk(", check binsearch"); +#endif +#ifdef EXTENTS_STATS + printk(", stats"); +#endif + printk("\n"); +#ifdef EXTENTS_STATS + spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); + EXT4_SB(sb)->s_ext_min = 1 << 30; + EXT4_SB(sb)->s_ext_max = 0; +#endif + } +} + +/* + * called at umount time + */ +void ext4_ext_release(struct super_block *sb) +{ + if (!test_opt(sb, EXTENTS)) + return; + +#ifdef EXTENTS_STATS + if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", + sbi->s_ext_blocks, sbi->s_ext_extents, + sbi->s_ext_blocks / sbi->s_ext_extents); + printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", + sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); + } +#endif +} + +static void bi_complete(struct bio *bio, int error) +{ + complete((struct completion *)bio->bi_private); +} + +/* FIXME!! we need to try to merge to left or right after zero-out */ +static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) +{ + int ret = -EIO; + struct bio *bio; + int blkbits, blocksize; + sector_t ee_pblock; + struct completion event; + unsigned int ee_len, len, done, offset; + + + blkbits = inode->i_blkbits; + blocksize = inode->i_sb->s_blocksize; + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext_pblock(ex); + + /* convert ee_pblock to 512 byte sectors */ + ee_pblock = ee_pblock << (blkbits - 9); + + while (ee_len > 0) { + + if (ee_len > BIO_MAX_PAGES) + len = BIO_MAX_PAGES; + else + len = ee_len; + + bio = bio_alloc(GFP_NOIO, len); + if (!bio) + return -ENOMEM; + bio->bi_sector = ee_pblock; + bio->bi_bdev = inode->i_sb->s_bdev; + + done = 0; + offset = 0; + while (done < len) { + ret = bio_add_page(bio, ZERO_PAGE(0), + blocksize, offset); + if (ret != blocksize) { + /* + * We can't add any more pages because of + * hardware limitations. Start a new bio. + */ + break; + } + done++; + offset += blocksize; + if (offset >= PAGE_CACHE_SIZE) + offset = 0; + } + + init_completion(&event); + bio->bi_private = &event; + bio->bi_end_io = bi_complete; + submit_bio(WRITE, bio); + wait_for_completion(&event); + + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + ret = 0; + else { + ret = -EIO; + break; + } + bio_put(bio); + ee_len -= done; + ee_pblock += done << (blkbits - 9); + } + return ret; +} + +#define EXT4_EXT_ZERO_LEN 7 + +/* + * This function is called by ext4_ext_get_blocks() if someone tries to write + * to an uninitialized extent. It may result in splitting the uninitialized + * extent into multiple extents (upto three - one initialized and two + * uninitialized). + * There are three possibilities: + * a> There is no split required: Entire extent should be initialized + * b> Splits in two extents: Write is happening at either end of the extent + * c> Splits in three extents: Somone is writing in middle of the extent + */ +static int ext4_ext_convert_to_initialized(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t iblock, + unsigned long max_blocks) +{ + struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex1 = NULL; + struct ext4_extent *ex2 = NULL; + struct ext4_extent *ex3 = NULL; + struct ext4_extent_header *eh; + ext4_lblk_t ee_block; + unsigned int allocated, ee_len, depth; + ext4_fsblk_t newblock; + int err = 0; + int ret = 0; + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + allocated = ee_len - (iblock - ee_block); + newblock = iblock - ee_block + ext_pblock(ex); + ex2 = ex; + orig_ex.ee_block = ex->ee_block; + orig_ex.ee_len = cpu_to_le16(ee_len); + ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ + if (ee_len <= 2*EXT4_EXT_ZERO_LEN) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zeroed the full extent */ + return allocated; + } + + /* ex1: ee_block to iblock - 1 : uninitialized */ + if (iblock > ee_block) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(iblock - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } + /* + * for sanity, update the length of the ex2 extent before + * we insert ex3, if ex1 is NULL. This is to avoid temporary + * overlap of blocks. + */ + if (!ex1 && allocated > max_blocks) + ex2->ee_len = cpu_to_le16(max_blocks); + /* ex3: to ee_block + ee_len : uninitialised */ + if (allocated > max_blocks) { + unsigned int newdepth; + /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ + if (allocated <= EXT4_EXT_ZERO_LEN) { + /* + * iblock == ee_block is handled by the zerouout + * at the beginning. + * Mark first half uninitialized. + * Mark second half initialized and zero out the + * initialized extent + */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = cpu_to_le16(ee_len - allocated); + ext4_ext_mark_uninitialized(ex); + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + + ex3 = &newex; + ex3->ee_block = cpu_to_le32(iblock); + ext4_ext_store_pblock(ex3, newblock); + ex3->ee_len = cpu_to_le16(allocated); + err = ext4_ext_insert_extent(handle, inode, path, ex3); + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* blocks available from iblock */ + return allocated; + + } else if (err) + goto fix_extent_len; + + /* + * We need to zero out the second half because + * an fallocate request can update file size and + * converting the second half to initialized extent + * implies that we can leak some junk data to user + * space. + */ + err = ext4_ext_zeroout(inode, ex3); + if (err) { + /* + * We should actually mark the + * second half as uninit and return error + * Insert would have changed the extent + */ + depth = ext_depth(inode); + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, + iblock, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + return err; + } + /* get the second half extent details */ + ex = path[depth].p_ext; + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + return err; + ext4_ext_mark_uninitialized(ex); + ext4_ext_dirty(handle, inode, path + depth); + return err; + } + + /* zeroed the second half */ + return allocated; + } + ex3 = &newex; + ex3->ee_block = cpu_to_le32(iblock + max_blocks); + ext4_ext_store_pblock(ex3, newblock + max_blocks); + ex3->ee_len = cpu_to_le16(allocated - max_blocks); + ext4_ext_mark_uninitialized(ex3); + err = ext4_ext_insert_extent(handle, inode, path, ex3); + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zeroed the full extent */ + /* blocks available from iblock */ + return allocated; + + } else if (err) + goto fix_extent_len; + /* + * The depth, and hence eh & ex might change + * as part of the insert above. + */ + newdepth = ext_depth(inode); + /* + * update the extent length after successfull insert of the + * split extent + */ + orig_ex.ee_len = cpu_to_le16(ee_len - + ext4_ext_get_actual_len(ex3)); + depth = newdepth; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, iblock, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + if (ex2 != &newex) + ex2 = ex; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + allocated = max_blocks; + + /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying + * to insert a extent in the middle zerout directly + * otherwise give the extent a chance to merge to left + */ + if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && + iblock != ee_block) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zero out the first half */ + /* blocks available from iblock */ + return allocated; + } + } + /* + * If there was a change of depth as part of the + * insertion of ex3 above, we need to update the length + * of the ex1 extent again here + */ + if (ex1 && ex1 != ex) { + ex1 = ex; + ex1->ee_len = cpu_to_le16(iblock - ee_block); + ext4_ext_mark_uninitialized(ex1); + ex2 = &newex; + } + /* ex2: iblock to iblock + maxblocks-1 : initialised */ + ex2->ee_block = cpu_to_le32(iblock); + ext4_ext_store_pblock(ex2, newblock); + ex2->ee_len = cpu_to_le16(allocated); + if (ex2 != ex) + goto insert; + /* + * New (initialized) extent starts from the first block + * in the current extent. i.e., ex2 == ex + * We have to see if it can be merged with the extent + * on the left. + */ + if (ex2 > EXT_FIRST_EXTENT(eh)) { + /* + * To merge left, pass "ex2 - 1" to try_to_merge(), + * since it merges towards right _only_. + */ + ret = ext4_ext_try_to_merge(inode, path, ex2 - 1); + if (ret) { + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto out; + depth = ext_depth(inode); + ex2--; + } + } + /* + * Try to Merge towards right. This might be required + * only when the whole extent is being written to. + * i.e. ex2 == ex and ex3 == NULL. + */ + if (!ex3) { + ret = ext4_ext_try_to_merge(inode, path, ex2); + if (ret) { + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto out; + } + } + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + depth); + goto out; +insert: + err = ext4_ext_insert_extent(handle, inode, path, &newex); + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zero out the first half */ + return allocated; + } else if (err) + goto fix_extent_len; +out: + return err ? err : allocated; + +fix_extent_len: + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_mark_uninitialized(ex); + ext4_ext_dirty(handle, inode, path + depth); + return err; +} + +/* + * Block allocation/map/preallocation routine for extents based files + * + * + * Need to be called with + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block + * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) + * + * return > 0, number of of blocks already mapped/allocated + * if create == 0 and these are pre-allocated blocks + * buffer head is unmapped + * otherwise blocks are mapped + * + * return = 0, if plain look up failed (blocks have not been allocated) + * buffer head is unmapped + * + * return < 0, error case. + */ +int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, + unsigned long max_blocks, struct buffer_head *bh_result, + int create, int extend_disksize) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent_header *eh; + struct ext4_extent newex, *ex; + ext4_fsblk_t goal, newblock; + int err = 0, depth, ret; + unsigned long allocated = 0; + struct ext4_allocation_request ar; + loff_t disksize; + + __clear_bit(BH_New, &bh_result->b_state); + ext_debug("blocks %u/%lu requested for inode %u\n", + iblock, max_blocks, inode->i_ino); + + /* check in cache */ + goal = ext4_ext_in_cache(inode, iblock, &newex); + if (goal) { + if (goal == EXT4_EXT_CACHE_GAP) { + if (!create) { + /* + * block isn't allocated yet and + * user doesn't want to allocate it + */ + goto out2; + } + /* we should allocate requested block */ + } else if (goal == EXT4_EXT_CACHE_EXTENT) { + /* block is already allocated */ + newblock = iblock + - le32_to_cpu(newex.ee_block) + + ext_pblock(&newex); + /* number of remaining blocks in the extent */ + allocated = ext4_ext_get_actual_len(&newex) - + (iblock - le32_to_cpu(newex.ee_block)); + goto out; + } else { + BUG(); + } + } + + /* find extent for this block */ + path = ext4_ext_find_extent(inode, iblock, NULL); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + goto out2; + } + + depth = ext_depth(inode); + + /* + * consistent leaf must not be empty; + * this situation is possible, though, _during_ tree modification; + * this is why assert can't be put in ext4_ext_find_extent() + */ + BUG_ON(path[depth].p_ext == NULL && depth != 0); + eh = path[depth].p_hdr; + + ex = path[depth].p_ext; + if (ex) { + ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); + ext4_fsblk_t ee_start = ext_pblock(ex); + unsigned short ee_len; + + /* + * Uninitialized extents are treated as holes, except that + * we split out initialized portions during a write. + */ + ee_len = ext4_ext_get_actual_len(ex); + /* if found extent covers block, simply return it */ + if (iblock >= ee_block && iblock < ee_block + ee_len) { + newblock = iblock - ee_block + ee_start; + /* number of remaining blocks in the extent */ + allocated = ee_len - (iblock - ee_block); + ext_debug("%u fit into %lu:%d -> %llu\n", iblock, + ee_block, ee_len, newblock); + + /* Do not put uninitialized extent in the cache */ + if (!ext4_ext_is_uninitialized(ex)) { + ext4_ext_put_in_cache(inode, ee_block, + ee_len, ee_start, + EXT4_EXT_CACHE_EXTENT); + goto out; + } + if (create == EXT4_CREATE_UNINITIALIZED_EXT) + goto out; + if (!create) { + /* + * We have blocks reserved already. We + * return allocated blocks so that delalloc + * won't do block reservation for us. But + * the buffer head will be unmapped so that + * a read from the block returns 0s. + */ + if (allocated > max_blocks) + allocated = max_blocks; + set_buffer_unwritten(bh_result); + goto out2; + } + + ret = ext4_ext_convert_to_initialized(handle, inode, + path, iblock, + max_blocks); + if (ret <= 0) { + err = ret; + goto out2; + } else + allocated = ret; + goto outnew; + } + } + + /* + * requested block isn't allocated yet; + * we couldn't try to create block if create flag is zero + */ + if (!create) { + /* + * put just found gap into cache to speed up + * subsequent requests + */ + ext4_ext_put_gap_in_cache(inode, path, iblock); + goto out2; + } + /* + * Okay, we need to do block allocation. + */ + + /* find neighbour allocated blocks */ + ar.lleft = iblock; + err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); + if (err) + goto out2; + ar.lright = iblock; + err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); + if (err) + goto out2; + + /* + * See if request is beyond maximum number of blocks we can have in + * a single extent. For an initialized extent this limit is + * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is + * EXT_UNINIT_MAX_LEN. + */ + if (max_blocks > EXT_INIT_MAX_LEN && + create != EXT4_CREATE_UNINITIALIZED_EXT) + max_blocks = EXT_INIT_MAX_LEN; + else if (max_blocks > EXT_UNINIT_MAX_LEN && + create == EXT4_CREATE_UNINITIALIZED_EXT) + max_blocks = EXT_UNINIT_MAX_LEN; + + /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ + newex.ee_block = cpu_to_le32(iblock); + newex.ee_len = cpu_to_le16(max_blocks); + err = ext4_ext_check_overlap(inode, &newex, path); + if (err) + allocated = ext4_ext_get_actual_len(&newex); + else + allocated = max_blocks; + + /* allocate new block */ + ar.inode = inode; + ar.goal = ext4_ext_find_goal(inode, path, iblock); + ar.logical = iblock; + ar.len = allocated; + if (S_ISREG(inode->i_mode)) + ar.flags = EXT4_MB_HINT_DATA; + else + /* disable in-core preallocation for non-regular files */ + ar.flags = 0; + newblock = ext4_mb_new_blocks(handle, &ar, &err); + if (!newblock) + goto out2; + ext_debug("allocate new block: goal %llu, found %llu/%lu\n", + goal, newblock, allocated); + + /* try to insert new extent into found leaf and return */ + ext4_ext_store_pblock(&newex, newblock); + newex.ee_len = cpu_to_le16(ar.len); + if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ + ext4_ext_mark_uninitialized(&newex); + err = ext4_ext_insert_extent(handle, inode, path, &newex); + if (err) { + /* free data blocks we just allocated */ + /* not a good idea to call discard here directly, + * but otherwise we'd need to call it every free() */ + ext4_discard_preallocations(inode); + ext4_free_blocks(handle, inode, ext_pblock(&newex), + ext4_ext_get_actual_len(&newex), 0); + goto out2; + } + + /* previous routine could use block we allocated */ + newblock = ext_pblock(&newex); + allocated = ext4_ext_get_actual_len(&newex); +outnew: + if (extend_disksize) { + disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; + } + + set_buffer_new(bh_result); + + /* Cache only when it is _not_ an uninitialized extent */ + if (create != EXT4_CREATE_UNINITIALIZED_EXT) + ext4_ext_put_in_cache(inode, iblock, allocated, newblock, + EXT4_EXT_CACHE_EXTENT); +out: + if (allocated > max_blocks) + allocated = max_blocks; + ext4_ext_show_leaf(inode, path); + set_buffer_mapped(bh_result); + bh_result->b_bdev = inode->i_sb->s_bdev; + bh_result->b_blocknr = newblock; +out2: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + return err ? err : allocated; +} + +void ext4_ext_truncate(struct inode *inode) +{ + struct address_space *mapping = inode->i_mapping; + struct super_block *sb = inode->i_sb; + ext4_lblk_t last_block; + handle_t *handle; + int err = 0; + + /* + * probably first extent we're gonna free will be last in block + */ + err = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, err); + if (IS_ERR(handle)) + return; + + if (inode->i_size & (sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, mapping, inode->i_size); + + if (ext4_orphan_add(handle, inode)) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_ext_invalidate_cache(inode); + + ext4_discard_preallocations(inode); + + /* + * TODO: optimization is possible here. + * Probably we need not scan at all, + * because page truncation is enough. + */ + + /* we have to know where to truncate from in crash case */ + EXT4_I(inode)->i_disksize = inode->i_size; + ext4_mark_inode_dirty(handle, inode); + + last_block = (inode->i_size + sb->s_blocksize - 1) + >> EXT4_BLOCK_SIZE_BITS(sb); + err = ext4_ext_remove_space(inode, last_block); + + /* In a multi-transaction truncate, we only make the final + * transaction synchronous. + */ + if (IS_SYNC(inode)) + handle->h_sync = 1; + +out_stop: + up_write(&EXT4_I(inode)->i_data_sem); + /* + * If this was a simple ftruncate() and the file will remain alive, + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext4_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); +} + +static void ext4_falloc_update_inode(struct inode *inode, + int mode, loff_t new_size, int update_ctime) +{ + struct timespec now; + + if (update_ctime) { + now = current_fs_time(inode->i_sb); + if (!timespec_equal(&inode->i_ctime, &now)) + inode->i_ctime = now; + } + /* + * Update only when preallocation was requested beyond + * the file size. + */ + if (!(mode & FALLOC_FL_KEEP_SIZE)) { + if (new_size > i_size_read(inode)) + i_size_write(inode, new_size); + if (new_size > EXT4_I(inode)->i_disksize) + ext4_update_i_disksize(inode, new_size); + } + +} + +/* + * preallocate space for a file. This implements ext4's fallocate inode + * operation, which gets called from sys_fallocate system call. + * For block-mapped files, posix_fallocate should fall back to the method + * of writing zeroes to the required new blocks (the same behavior which is + * expected for file systems which do not support fallocate() system call). + */ +long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) +{ + handle_t *handle; + ext4_lblk_t block; + loff_t new_size; + unsigned long max_blocks; + int ret = 0; + int ret2 = 0; + int retries = 0; + struct buffer_head map_bh; + unsigned int credits, blkbits = inode->i_blkbits; + + /* + * currently supporting (pre)allocate mode for extent-based + * files _only_ + */ + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return -EOPNOTSUPP; + + /* preallocation to directories is currently not supported */ + if (S_ISDIR(inode->i_mode)) + return -ENODEV; + + block = offset >> blkbits; + /* + * We can't just convert len to max_blocks because + * If blocksize = 4096 offset = 3072 and len = 2048 + */ + max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) + - block; + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, max_blocks); + mutex_lock(&inode->i_mutex); +retry: + while (ret >= 0 && ret < max_blocks) { + block = block + ret; + max_blocks = max_blocks - ret; + handle = ext4_journal_start(inode, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + ret = ext4_get_blocks_wrap(handle, inode, block, + max_blocks, &map_bh, + EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); + if (ret <= 0) { +#ifdef EXT4FS_DEBUG + WARN_ON(ret <= 0); + printk(KERN_ERR "%s: ext4_ext_get_blocks " + "returned error inode#%lu, block=%u, " + "max_blocks=%lu", __func__, + inode->i_ino, block, max_blocks); +#endif + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + break; + } + if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len, + blkbits) >> blkbits)) + new_size = offset + len; + else + new_size = (block + ret) << blkbits; + + ext4_falloc_update_inode(inode, mode, new_size, + buffer_new(&map_bh)); + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + if (ret2) + break; + } + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) { + ret = 0; + goto retry; + } + mutex_unlock(&inode->i_mutex); + return ret > 0 ? ret2 : ret; +} + +/* + * Callback function called for each extent to gather FIEMAP information. + */ +int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, + struct ext4_ext_cache *newex, struct ext4_extent *ex, + void *data) +{ + struct fiemap_extent_info *fieinfo = data; + unsigned long blksize_bits = inode->i_sb->s_blocksize_bits; + __u64 logical; + __u64 physical; + __u64 length; + __u32 flags = 0; + int error; + + logical = (__u64)newex->ec_block << blksize_bits; + + if (newex->ec_type == EXT4_EXT_CACHE_GAP) { + pgoff_t offset; + struct page *page; + struct buffer_head *bh = NULL; + + offset = logical >> PAGE_SHIFT; + page = find_get_page(inode->i_mapping, offset); + if (!page || !page_has_buffers(page)) + return EXT_CONTINUE; + + bh = page_buffers(page); + + if (!bh) + return EXT_CONTINUE; + + if (buffer_delay(bh)) { + flags |= FIEMAP_EXTENT_DELALLOC; + page_cache_release(page); + } else { + page_cache_release(page); + return EXT_CONTINUE; + } + } + + physical = (__u64)newex->ec_start << blksize_bits; + length = (__u64)newex->ec_len << blksize_bits; + + if (ex && ext4_ext_is_uninitialized(ex)) + flags |= FIEMAP_EXTENT_UNWRITTEN; + + /* + * If this extent reaches EXT_MAX_BLOCK, it must be last. + * + * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK, + * this also indicates no more allocated blocks. + * + * XXX this might miss a single-block extent at EXT_MAX_BLOCK + */ + if (logical + length - 1 == EXT_MAX_BLOCK || + ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK) + flags |= FIEMAP_EXTENT_LAST; + + error = fiemap_fill_next_extent(fieinfo, logical, physical, + length, flags); + if (error < 0) + return error; + if (error == 1) + return EXT_BREAK; + + return EXT_CONTINUE; +} + +/* fiemap flags we can handle specified here */ +#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) + +int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) +{ + __u64 physical = 0; + __u64 length; + __u32 flags = FIEMAP_EXTENT_LAST; + int blockbits = inode->i_sb->s_blocksize_bits; + int error = 0; + + /* in-inode? */ + if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { + struct ext4_iloc iloc; + int offset; /* offset of xattr in inode */ + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + physical = iloc.bh->b_blocknr << blockbits; + offset = EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize; + physical += offset; + length = EXT4_SB(inode->i_sb)->s_inode_size - offset; + flags |= FIEMAP_EXTENT_DATA_INLINE; + } else { /* external block */ + physical = EXT4_I(inode)->i_file_acl << blockbits; + length = inode->i_sb->s_blocksize; + } + + if (physical) + error = fiemap_fill_next_extent(fieinfo, 0, physical, + length, flags); + return (error < 0 ? error : 0); +} + +int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + ext4_lblk_t start_blk; + ext4_lblk_t len_blks; + int error = 0; + + /* fallback to generic here if not in extents fmt */ + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return generic_block_fiemap(inode, fieinfo, start, len, + ext4_get_block); + + if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) + return -EBADR; + + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { + error = ext4_xattr_fiemap(inode, fieinfo); + } else { + start_blk = start >> inode->i_sb->s_blocksize_bits; + len_blks = len >> inode->i_sb->s_blocksize_bits; + + /* + * Walk the extent tree gathering extent information. + * ext4_ext_fiemap_cb will push extents back to user. + */ + down_write(&EXT4_I(inode)->i_data_sem); + error = ext4_ext_walk_space(inode, start_blk, len_blks, + ext4_ext_fiemap_cb, fieinfo); + up_write(&EXT4_I(inode)->i_data_sem); + } + + return error; +} + diff --git a/fs/ext4/file.c b/fs/ext4/file.c new file mode 100644 index 0000000..6bd11fb --- /dev/null +++ b/fs/ext4/file.c @@ -0,0 +1,178 @@ +/* + * linux/fs/ext4/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4 fs regular file handling primitives + * + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/jbd2.h> +#include "ext4.h" +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" + +/* + * Called when an inode is released. Note that this is different + * from ext4_file_open: open gets called at every open, but release + * gets called only when /all/ the files are closed. + */ +static int ext4_release_file(struct inode *inode, struct file *filp) +{ + /* if we are the last writer on the inode, drop the block reservation */ + if ((filp->f_mode & FMODE_WRITE) && + (atomic_read(&inode->i_writecount) == 1)) + { + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + } + if (is_dx(inode) && filp->private_data) + ext4_htree_free_dir_info(filp->private_data); + + return 0; +} + +static ssize_t +ext4_file_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_path.dentry->d_inode; + ssize_t ret; + int err; + + /* + * If we have encountered a bitmap-format file, the size limit + * is smaller than s_maxbytes, which is for extent-mapped files. + */ + + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + size_t length = iov_length(iov, nr_segs); + + if (pos > sbi->s_bitmap_maxbytes) + return -EFBIG; + + if (pos + length > sbi->s_bitmap_maxbytes) { + nr_segs = iov_shorten((struct iovec *)iov, nr_segs, + sbi->s_bitmap_maxbytes - pos); + } + } + + ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + /* + * Skip flushing if there was an error, or if nothing was written. + */ + if (ret <= 0) + return ret; + + /* + * If the inode is IS_SYNC, or is O_SYNC and we are doing data + * journalling then we need to make sure that we force the transaction + * to disk to keep all metadata uptodate synchronously. + */ + if (file->f_flags & O_SYNC) { + /* + * If we are non-data-journaled, then the dirty data has + * already been flushed to backing store by generic_osync_inode, + * and the inode has been flushed too if there have been any + * modifications other than mere timestamp updates. + * + * Open question --- do we care about flushing timestamps too + * if the inode is IS_SYNC? + */ + if (!ext4_should_journal_data(inode)) + return ret; + + goto force_commit; + } + + /* + * So we know that there has been no forced data flush. If the inode + * is marked IS_SYNC, we need to force one ourselves. + */ + if (!IS_SYNC(inode)) + return ret; + + /* + * Open question #2 --- should we force data to disk here too? If we + * don't, the only impact is that data=writeback filesystems won't + * flush data to disk automatically on IS_SYNC, only metadata (but + * historically, that is what ext2 has done.) + */ + +force_commit: + err = ext4_force_commit(inode->i_sb); + if (err) + return err; + return ret; +} + +static struct vm_operations_struct ext4_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = ext4_page_mkwrite, +}; + +static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(file); + vma->vm_ops = &ext4_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} + +extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); + +const struct file_operations ext4_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = ext4_file_write, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .mmap = ext4_file_mmap, + .open = generic_file_open, + .release = ext4_release_file, + .fsync = ext4_sync_file, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +}; + +const struct inode_operations ext4_file_inode_operations = { + .truncate = ext4_truncate, + .setattr = ext4_setattr, + .getattr = ext4_getattr, +#ifdef CONFIG_EXT4_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +#endif + .permission = ext4_permission, + .fallocate = ext4_fallocate, + .fiemap = ext4_fiemap, +}; + diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c new file mode 100644 index 0000000..5afe437 --- /dev/null +++ b/fs/ext4/fsync.c @@ -0,0 +1,100 @@ +/* + * linux/fs/ext4/fsync.c + * + * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) + * from + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4fs fsync primitive + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Removed unnecessary code duplication for little endian machines + * and excessive __inline__s. + * Andi Kleen, 1997 + * + * Major simplications and cleanup - we only need to do the metadata, because + * we can depend on generic_block_fdatasync() to sync the data blocks. + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/sched.h> +#include <linux/writeback.h> +#include <linux/jbd2.h> +#include <linux/blkdev.h> +#include <linux/marker.h> +#include "ext4.h" +#include "ext4_jbd2.h" + +/* + * akpm: A new design for ext4_sync_file(). + * + * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). + * There cannot be a transaction open by this task. + * Another task could have dirtied this inode. Its data can be in any + * state in the journalling system. + * + * What we do is just kick off a commit and wait on it. This will snapshot the + * inode to disk. + */ + +int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + int ret = 0; + + J_ASSERT(ext4_journal_current_handle() == NULL); + + trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld", + inode->i_sb->s_id, datasync, inode->i_ino, + dentry->d_parent->d_inode->i_ino); + + /* + * data=writeback: + * The caller's filemap_fdatawrite()/wait will sync the data. + * sync_inode() will sync the metadata + * + * data=ordered: + * The caller's filemap_fdatawrite() will write the data and + * sync_inode() will write the inode if it is dirty. Then the caller's + * filemap_fdatawait() will wait on the pages. + * + * data=journal: + * filemap_fdatawrite won't do anything (the buffers are clean). + * ext4_force_commit will write the file data into the journal and + * will wait on that. + * filemap_fdatawait() will encounter a ton of newly-dirtied pages + * (they were dirtied by commit). But that's OK - the blocks are + * safe in-journal, which is all fsync() needs to ensure. + */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + goto out; + } + + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + goto out; + + /* + * The VFS has written the file data. If the inode is unaltered + * then we need not start a commit. + */ + if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, /* sys_fsync did this */ + }; + ret = sync_inode(inode, &wbc); + if (journal && (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + } +out: + return ret; +} diff --git a/fs/ext4/group.h b/fs/ext4/group.h new file mode 100644 index 0000000..c2c0a8d --- /dev/null +++ b/fs/ext4/group.h @@ -0,0 +1,29 @@ +/* + * linux/fs/ext4/group.h + * + * Copyright (C) 2007 Cluster File Systems, Inc + * + * Author: Andreas Dilger <adilger@clusterfs.com> + */ + +#ifndef _LINUX_EXT4_GROUP_H +#define _LINUX_EXT4_GROUP_H + +extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, + struct ext4_group_desc *gdp); +struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); +#define ext4_free_blocks_after_init(sb, group, desc) \ + ext4_init_block_bitmap(sb, NULL, group, desc) +extern unsigned ext4_init_inode_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); +extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +#endif /* _LINUX_EXT4_GROUP_H */ diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c new file mode 100644 index 0000000..ac8f168 --- /dev/null +++ b/fs/ext4/hash.c @@ -0,0 +1,208 @@ +/* + * linux/fs/ext4/hash.c + * + * Copyright (C) 2002 by Theodore Ts'o + * + * This file is released under the GPL v2. + * + * This file may be redistributed under the terms of the GNU Public + * License. + */ + +#include <linux/fs.h> +#include <linux/jbd2.h> +#include <linux/cryptohash.h> +#include "ext4.h" + +#define DELTA 0x9E3779B9 + +static void TEA_transform(__u32 buf[4], __u32 const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while (--n); + + buf[0] += b0; + buf[1] += b1; +} + + +/* The old legacy hash */ +static __u32 dx_hack_hash_unsigned(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const unsigned char *ucp = (const unsigned char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static __u32 dx_hack_hash_signed(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const signed char *scp = (const signed char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const signed char *scp = (const signed char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) scp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const unsigned char *ucp = (const unsigned char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) ucp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +/* + * Returns the hash of a filename. If len is 0 and name is NULL, then + * this function can be used to test whether or not a hash version is + * supported. + * + * The seed is an 4 longword (32 bits) "secret" which can be used to + * uniquify a hash. If the seed is all zero's, then some default seed + * may be used. + * + * A particular hash version specifies whether or not the seed is + * represented, and whether or not the returned hash is 32 bits or 64 + * bits. 32 bit hashes will return 0 for the minor hash. + */ +int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) +{ + __u32 hash; + __u32 minor_hash = 0; + const char *p; + int i; + __u32 in[8], buf[4]; + void (*str2hashbuf)(const char *, int, __u32 *, int) = + str2hashbuf_signed; + + /* Initialize the default seed for the hash checksum functions */ + buf[0] = 0x67452301; + buf[1] = 0xefcdab89; + buf[2] = 0x98badcfe; + buf[3] = 0x10325476; + + /* Check to see if the seed is all zero's */ + if (hinfo->seed) { + for (i = 0; i < 4; i++) { + if (hinfo->seed[i]) + break; + } + if (i < 4) + memcpy(buf, hinfo->seed, sizeof(buf)); + } + + switch (hinfo->hash_version) { + case DX_HASH_LEGACY_UNSIGNED: + hash = dx_hack_hash_unsigned(name, len); + break; + case DX_HASH_LEGACY: + hash = dx_hack_hash_signed(name, len); + break; + case DX_HASH_HALF_MD4_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; + case DX_HASH_HALF_MD4: + p = name; + while (len > 0) { + (*str2hashbuf)(p, len, in, 8); + half_md4_transform(buf, in); + len -= 32; + p += 32; + } + minor_hash = buf[2]; + hash = buf[1]; + break; + case DX_HASH_TEA_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; + case DX_HASH_TEA: + p = name; + while (len > 0) { + (*str2hashbuf)(p, len, in, 4); + TEA_transform(buf, in); + len -= 16; + p += 16; + } + hash = buf[0]; + minor_hash = buf[1]; + break; + default: + hinfo->hash = 0; + return -1; + } + hash = hash & ~1; + if (hash == (EXT4_HTREE_EOF << 1)) + hash = (EXT4_HTREE_EOF-1) << 1; + hinfo->hash = hash; + hinfo->minor_hash = minor_hash; + return 0; +} diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c new file mode 100644 index 0000000..03c0819 --- /dev/null +++ b/fs/ext4/ialloc.c @@ -0,0 +1,1080 @@ +/* + * linux/fs/ext4/ialloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * BSD ufs-inspired inode and directory allocation by + * Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/jbd2.h> +#include <linux/stat.h> +#include <linux/string.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> +#include <linux/random.h> +#include <linux/bitops.h> +#include <linux/blkdev.h> +#include <asm/byteorder.h> +#include "ext4.h" +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" +#include "group.h" + +/* + * ialloc.c contains the inodes allocation and deallocation routines + */ + +/* + * The free inodes are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. + */ + +/* + * To avoid calling the atomic setbit hundreds or thousands of times, we only + * need to use it within a single byte (to ensure we get endianness right). + * We can use memset for the rest of the bitmap as there are no other users. + */ +void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +{ + int i; + + if (start_bit >= end_bit) + return; + + ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); + for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) + ext4_set_bit(i, bitmap); + if (i < end_bit) + memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); +} + +/* Initializes an uninitialized inode bitmap */ +unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + J_ASSERT_BH(bh, buffer_locked(bh)); + + /* If checksum is bad mark all blocks and inodes use to prevent + * allocation, essentially implementing a per-group read-only flag. */ + if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { + ext4_error(sb, __func__, "Checksum bad for group %lu\n", + block_group); + gdp->bg_free_blocks_count = 0; + gdp->bg_free_inodes_count = 0; + gdp->bg_itable_unused = 0; + memset(bh->b_data, 0xff, sb->s_blocksize); + return 0; + } + + memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); + mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, + bh->b_data); + + return EXT4_INODES_PER_GROUP(sb); +} + +/* + * Read the inode allocation bitmap for a given block_group, reading + * into the specified slot in the superblock's bitmap cache. + * + * Return buffer_head of bitmap on success or NULL. + */ +static struct buffer_head * +ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) +{ + struct ext4_group_desc *desc; + struct buffer_head *bh = NULL; + ext4_fsblk_t bitmap_blk; + + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return NULL; + bitmap_blk = ext4_inode_bitmap(sb, desc); + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext4_error(sb, __func__, + "Cannot read inode bitmap - " + "block_group = %lu, inode_bitmap = %llu", + block_group, bitmap_blk); + return NULL; + } + if (bitmap_uptodate(bh)) + return bh; + + lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + return bh; + } + spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); + if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + ext4_init_inode_bitmap(sb, bh, block_group, desc); + set_bitmap_uptodate(bh); + set_buffer_uptodate(bh); + unlock_buffer(bh); + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + return bh; + } + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + if (buffer_uptodate(bh)) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); + return bh; + } + /* + * submit the buffer_head for read. We can + * safely mark the bitmap as uptodate now. + * We do it here so the bitmap uptodate bit + * get set with buffer lock held. + */ + set_bitmap_uptodate(bh); + if (bh_submit_read(bh) < 0) { + put_bh(bh); + ext4_error(sb, __func__, + "Cannot read inode bitmap - " + "block_group = %lu, inode_bitmap = %llu", + block_group, bitmap_blk); + return NULL; + } + return bh; +} + +/* + * NOTE! When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk. + */ +void ext4_free_inode(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + int is_directory; + unsigned long ino; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + ext4_group_t block_group; + unsigned long bit; + struct ext4_group_desc *gdp; + struct ext4_super_block *es; + struct ext4_sb_info *sbi; + int fatal = 0, err; + ext4_group_t flex_group; + + if (atomic_read(&inode->i_count) > 1) { + printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", + atomic_read(&inode->i_count)); + return; + } + if (inode->i_nlink) { + printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n", + inode->i_nlink); + return; + } + if (!sb) { + printk(KERN_ERR "ext4_free_inode: inode on " + "nonexistent device\n"); + return; + } + sbi = EXT4_SB(sb); + + ino = inode->i_ino; + ext4_debug("freeing inode %lu\n", ino); + + /* + * Note: we must free any quota before locking the superblock, + * as writing the quota to disk may need the lock as well. + */ + DQUOT_INIT(inode); + ext4_xattr_delete_inode(handle, inode); + DQUOT_FREE_INODE(inode); + DQUOT_DROP(inode); + + is_directory = S_ISDIR(inode->i_mode); + + /* Do this BEFORE marking the inode not in use or returning an error */ + clear_inode(inode); + + es = EXT4_SB(sb)->s_es; + if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext4_error(sb, "ext4_free_inode", + "reserved or nonexistent inode %lu", ino); + goto error_return; + } + block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + + BUFFER_TRACE(bitmap_bh, "get_write_access"); + fatal = ext4_journal_get_write_access(handle, bitmap_bh); + if (fatal) + goto error_return; + + /* Ok, now we can actually update the inode bitmaps.. */ + if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), + bit, bitmap_bh->b_data)) + ext4_error(sb, "ext4_free_inode", + "bit already cleared for inode %lu", ino); + else { + gdp = ext4_get_group_desc(sb, block_group, &bh2); + + BUFFER_TRACE(bh2, "get_write_access"); + fatal = ext4_journal_get_write_access(handle, bh2); + if (fatal) goto error_return; + + if (gdp) { + spin_lock(sb_bgl_lock(sbi, block_group)); + le16_add_cpu(&gdp->bg_free_inodes_count, 1); + if (is_directory) + le16_add_cpu(&gdp->bg_used_dirs_count, -1); + gdp->bg_checksum = ext4_group_desc_csum(sbi, + block_group, gdp); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_inc(&sbi->s_freeinodes_counter); + if (is_directory) + percpu_counter_dec(&sbi->s_dirs_counter); + + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, block_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_inodes++; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + } + BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); + err = ext4_journal_dirty_metadata(handle, bh2); + if (!fatal) fatal = err; + } + BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata"); + err = ext4_journal_dirty_metadata(handle, bitmap_bh); + if (!fatal) + fatal = err; + sb->s_dirt = 1; +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, fatal); +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory\'s block + * group to find a free inode. + */ +static int find_group_dir(struct super_block *sb, struct inode *parent, + ext4_group_t *best_group) +{ + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + unsigned int freei, avefreei; + struct ext4_group_desc *desc, *best_desc = NULL; + ext4_group_t group; + int ret = -1; + + freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter); + avefreei = freei / ngroups; + + for (group = 0; group < ngroups; group++) { + desc = ext4_get_group_desc(sb, group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + continue; + if (!best_desc || + (le16_to_cpu(desc->bg_free_blocks_count) > + le16_to_cpu(best_desc->bg_free_blocks_count))) { + *best_group = group; + best_desc = desc; + ret = 0; + } + } + return ret; +} + +#define free_block_ratio 10 + +static int find_group_flex(struct super_block *sb, struct inode *parent, + ext4_group_t *best_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *desc; + struct buffer_head *bh; + struct flex_groups *flex_group = sbi->s_flex_groups; + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); + ext4_group_t ngroups = sbi->s_groups_count; + int flex_size = ext4_flex_bg_size(sbi); + ext4_group_t best_flex = parent_fbg_group; + int blocks_per_flex = sbi->s_blocks_per_group * flex_size; + int flexbg_free_blocks; + int flex_freeb_ratio; + ext4_group_t n_fbg_groups; + ext4_group_t i; + + n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> + sbi->s_log_groups_per_flex; + +find_close_to_parent: + flexbg_free_blocks = flex_group[best_flex].free_blocks; + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + if (flex_group[best_flex].free_inodes && + flex_freeb_ratio > free_block_ratio) + goto found_flexbg; + + if (best_flex && best_flex == parent_fbg_group) { + best_flex--; + goto find_close_to_parent; + } + + for (i = 0; i < n_fbg_groups; i++) { + if (i == parent_fbg_group || i == parent_fbg_group - 1) + continue; + + flexbg_free_blocks = flex_group[i].free_blocks; + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + + if (flex_freeb_ratio > free_block_ratio && + flex_group[i].free_inodes) { + best_flex = i; + goto found_flexbg; + } + + if (flex_group[best_flex].free_inodes == 0 || + (flex_group[i].free_blocks > + flex_group[best_flex].free_blocks && + flex_group[i].free_inodes)) + best_flex = i; + } + + if (!flex_group[best_flex].free_inodes || + !flex_group[best_flex].free_blocks) + return -1; + +found_flexbg: + for (i = best_flex * flex_size; i < ngroups && + i < (best_flex + 1) * flex_size; i++) { + desc = ext4_get_group_desc(sb, i, &bh); + if (le16_to_cpu(desc->bg_free_inodes_count)) { + *best_group = i; + goto out; + } + } + + return -1; +out: + return 0; +} + +/* + * Orlov's allocator for directories. + * + * We always try to spread first-level directories. + * + * If there are blockgroups with both free inodes and free blocks counts + * not worse than average we return one with smallest directory count. + * Otherwise we simply return a random group. + * + * For the rest rules look so: + * + * It's OK to put directory into a group unless + * it has too many directories already (max_dirs) or + * it has too few free inodes left (min_inodes) or + * it has too few free blocks left (min_blocks) or + * it's already running too large debt (max_debt). + * Parent's group is preferred, if it doesn't satisfy these + * conditions we search cyclically through the rest. If none + * of the groups look good we just look for a group with more + * free inodes than average (starting at parent's group). + * + * Debt is incremented each time we allocate a directory and decremented + * when we allocate an inode, within 0--255. + */ + +#define INODE_COST 64 +#define BLOCK_COST 256 + +static int find_group_orlov(struct super_block *sb, struct inode *parent, + ext4_group_t *group) +{ + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_group_t ngroups = sbi->s_groups_count; + int inodes_per_group = EXT4_INODES_PER_GROUP(sb); + unsigned int freei, avefreei; + ext4_fsblk_t freeb, avefreeb; + ext4_fsblk_t blocks_per_dir; + unsigned int ndirs; + int max_debt, max_dirs, min_inodes; + ext4_grpblk_t min_blocks; + ext4_group_t i; + struct ext4_group_desc *desc; + + freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); + avefreei = freei / ngroups; + freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + avefreeb = freeb; + do_div(avefreeb, ngroups); + ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); + + if ((parent == sb->s_root->d_inode) || + (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) { + int best_ndir = inodes_per_group; + ext4_group_t grp; + int ret = -1; + + get_random_bytes(&grp, sizeof(grp)); + parent_group = (unsigned)grp % ngroups; + for (i = 0; i < ngroups; i++) { + grp = (parent_group + i) % ngroups; + desc = ext4_get_group_desc(sb, grp, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + continue; + if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) + continue; + *group = grp; + ret = 0; + best_ndir = le16_to_cpu(desc->bg_used_dirs_count); + } + if (ret == 0) + return ret; + goto fallback; + } + + blocks_per_dir = ext4_blocks_count(es) - freeb; + do_div(blocks_per_dir, ndirs); + + max_dirs = ndirs / ngroups + inodes_per_group / 16; + min_inodes = avefreei - inodes_per_group / 4; + min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4; + + max_debt = EXT4_BLOCKS_PER_GROUP(sb); + max_debt /= max_t(int, blocks_per_dir, BLOCK_COST); + if (max_debt * INODE_COST > inodes_per_group) + max_debt = inodes_per_group / INODE_COST; + if (max_debt > 255) + max_debt = 255; + if (max_debt == 0) + max_debt = 1; + + for (i = 0; i < ngroups; i++) { + *group = (parent_group + i) % ngroups; + desc = ext4_get_group_desc(sb, *group, NULL); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) + continue; + if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) + continue; + return 0; + } + +fallback: + for (i = 0; i < ngroups; i++) { + *group = (parent_group + i) % ngroups; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && desc->bg_free_inodes_count && + le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) + return 0; + } + + if (avefreei) { + /* + * The free-inodes counter is approximate, and for really small + * filesystems the above test can fail to find any blockgroups + */ + avefreei = 0; + goto fallback; + } + + return -1; +} + +static int find_group_other(struct super_block *sb, struct inode *parent, + ext4_group_t *group) +{ + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + struct ext4_group_desc *desc; + ext4_group_t i; + + /* + * Try to place the inode in its parent directory + */ + *group = parent_group; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) + return 0; + + /* + * We're going to place this inode in a different blockgroup from its + * parent. We want to cause files in a common directory to all land in + * the same blockgroup. But we want files which are in a different + * directory which shares a blockgroup with our parent to land in a + * different blockgroup. + * + * So add our directory's i_ino into the starting point for the hash. + */ + *group = (*group + parent->i_ino) % ngroups; + + /* + * Use a quadratic hash to find a group with a free inode and some free + * blocks. + */ + for (i = 1; i < ngroups; i <<= 1) { + *group += i; + if (*group >= ngroups) + *group -= ngroups; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count) && + le16_to_cpu(desc->bg_free_blocks_count)) + return 0; + } + + /* + * That failed: try linear search for a free inode, even if that group + * has no free blocks. + */ + *group = parent_group; + for (i = 0; i < ngroups; i++) { + if (++*group >= ngroups) + *group = 0; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + return 0; + } + + return -1; +} + +/* + * claim the inode from the inode bitmap. If the group + * is uninit we need to take the groups's sb_bgl_lock + * and clear the uninit flag. The inode bitmap update + * and group desc uninit flag clear should be done + * after holding sb_bgl_lock so that ext4_read_inode_bitmap + * doesn't race with the ext4_claim_inode + */ +static int ext4_claim_inode(struct super_block *sb, + struct buffer_head *inode_bitmap_bh, + unsigned long ino, ext4_group_t group, int mode) +{ + int free = 0, retval = 0; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); + + spin_lock(sb_bgl_lock(sbi, group)); + if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { + /* not a free inode */ + retval = 1; + goto err_ret; + } + ino++; + if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || + ino > EXT4_INODES_PER_GROUP(sb)) { + spin_unlock(sb_bgl_lock(sbi, group)); + ext4_error(sb, __func__, + "reserved inode or inode > inodes count - " + "block_group = %lu, inode=%lu", group, + ino + group * EXT4_INODES_PER_GROUP(sb)); + return 1; + } + /* If we didn't allocate from within the initialized part of the inode + * table then we need to initialize up to this inode. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); + /* When marking the block group with + * ~EXT4_BG_INODE_UNINIT we don't want to depend + * on the value of bg_itable_unused even though + * mke2fs could have initialized the same for us. + * Instead we calculated the value below + */ + + free = 0; + } else { + free = EXT4_INODES_PER_GROUP(sb) - + le16_to_cpu(gdp->bg_itable_unused); + } + + /* + * Check the relative inode number against the last used + * relative inode number in this group. if it is greater + * we need to update the bg_itable_unused count + * + */ + if (ino > free) + gdp->bg_itable_unused = + cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino); + } + le16_add_cpu(&gdp->bg_free_inodes_count, -1); + if (S_ISDIR(mode)) { + le16_add_cpu(&gdp->bg_used_dirs_count, 1); + } + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); +err_ret: + spin_unlock(sb_bgl_lock(sbi, group)); + return retval; +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory's block + * group to find a free inode. + */ +struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) +{ + struct super_block *sb; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + ext4_group_t group = 0; + unsigned long ino = 0; + struct inode *inode; + struct ext4_group_desc *gdp = NULL; + struct ext4_super_block *es; + struct ext4_inode_info *ei; + struct ext4_sb_info *sbi; + int ret2, err = 0; + struct inode *ret; + ext4_group_t i; + int free = 0; + ext4_group_t flex_group; + + /* Cannot create files in a deleted directory */ + if (!dir || !dir->i_nlink) + return ERR_PTR(-EPERM); + + sb = dir->i_sb; + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + ei = EXT4_I(inode); + + sbi = EXT4_SB(sb); + es = sbi->s_es; + + if (sbi->s_log_groups_per_flex) { + ret2 = find_group_flex(sb, dir, &group); + if (ret2 == -1) { + ret2 = find_group_other(sb, dir, &group); + if (ret2 == 0 && printk_ratelimit()) + printk(KERN_NOTICE "ext4: find_group_flex " + "failed, fallback succeeded dir %lu\n", + dir->i_ino); + } + goto got_group; + } + + if (S_ISDIR(mode)) { + if (test_opt(sb, OLDALLOC)) + ret2 = find_group_dir(sb, dir, &group); + else + ret2 = find_group_orlov(sb, dir, &group); + } else + ret2 = find_group_other(sb, dir, &group); + +got_group: + err = -ENOSPC; + if (ret2 == -1) + goto out; + + for (i = 0; i < sbi->s_groups_count; i++) { + err = -EIO; + + gdp = ext4_get_group_desc(sb, group, &bh2); + if (!gdp) + goto fail; + + brelse(bitmap_bh); + bitmap_bh = ext4_read_inode_bitmap(sb, group); + if (!bitmap_bh) + goto fail; + + ino = 0; + +repeat_in_this_group: + ino = ext4_find_next_zero_bit((unsigned long *) + bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); + if (ino < EXT4_INODES_PER_GROUP(sb)) { + + BUFFER_TRACE(bitmap_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bitmap_bh); + if (err) + goto fail; + + BUFFER_TRACE(bh2, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh2); + if (err) + goto fail; + if (!ext4_claim_inode(sb, bitmap_bh, + ino, group, mode)) { + /* we won it */ + BUFFER_TRACE(bitmap_bh, + "call ext4_journal_dirty_metadata"); + err = ext4_journal_dirty_metadata(handle, + bitmap_bh); + if (err) + goto fail; + /* zero bit is inode number 1*/ + ino++; + goto got; + } + /* we lost it */ + jbd2_journal_release_buffer(handle, bitmap_bh); + jbd2_journal_release_buffer(handle, bh2); + + if (++ino < EXT4_INODES_PER_GROUP(sb)) + goto repeat_in_this_group; + } + + /* + * This case is possible in concurrent environment. It is very + * rare. We cannot repeat the find_group_xxx() call because + * that will simply return the same blockgroup, because the + * group descriptor metadata has not yet been updated. + * So we just go onto the next blockgroup. + */ + if (++group == sbi->s_groups_count) + group = 0; + } + err = -ENOSPC; + goto out; + +got: + /* We may have to initialize the block bitmap if it isn't already */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && + gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group); + + BUFFER_TRACE(block_bh, "get block bitmap access"); + err = ext4_journal_get_write_access(handle, block_bh); + if (err) { + brelse(block_bh); + goto fail; + } + + free = 0; + spin_lock(sb_bgl_lock(sbi, group)); + /* recheck and clear flag under lock if we still need to */ + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + free = ext4_free_blocks_after_init(sb, group, gdp); + gdp->bg_free_blocks_count = cpu_to_le16(free); + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, + gdp); + } + spin_unlock(sb_bgl_lock(sbi, group)); + + /* Don't need to dirty bitmap block if we didn't change it */ + if (free) { + BUFFER_TRACE(block_bh, "dirty block bitmap"); + err = ext4_journal_dirty_metadata(handle, block_bh); + } + + brelse(block_bh); + if (err) + goto fail; + } + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); + err = ext4_journal_dirty_metadata(handle, bh2); + if (err) + goto fail; + + percpu_counter_dec(&sbi->s_freeinodes_counter); + if (S_ISDIR(mode)) + percpu_counter_inc(&sbi->s_dirs_counter); + sb->s_dirt = 1; + + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_inodes--; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + + inode->i_uid = current->fsuid; + if (test_opt(sb, GRPID)) + inode->i_gid = dir->i_gid; + else if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current->fsgid; + inode->i_mode = mode; + + inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); + /* This is the optimal IO size (for stat), not the fs block size */ + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = + ext4_current_time(inode); + + memset(ei->i_data, 0, sizeof(ei->i_data)); + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + + /* + * Don't inherit extent flag from directory. We set extent flag on + * newly created directory and file only if -o extent mount option is + * specified + */ + ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL); + if (S_ISLNK(mode)) + ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL); + /* dirsync only applies to directories */ + if (!S_ISDIR(mode)) + ei->i_flags &= ~EXT4_DIRSYNC_FL; + ei->i_file_acl = 0; + ei->i_dtime = 0; + ei->i_block_group = group; + + ext4_set_inode_flags(inode); + if (IS_DIRSYNC(inode)) + handle->h_sync = 1; + insert_inode_hash(inode); + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + + ei->i_state = EXT4_STATE_NEW; + + ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; + + ret = inode; + if (DQUOT_ALLOC_INODE(inode)) { + err = -EDQUOT; + goto fail_drop; + } + + err = ext4_init_acl(handle, inode, dir); + if (err) + goto fail_free_drop; + + err = ext4_init_security(handle, inode, dir); + if (err) + goto fail_free_drop; + + if (test_opt(sb, EXTENTS)) { + /* set extent flag only for directory, file and normal symlink*/ + if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { + EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; + ext4_ext_tree_init(handle, inode); + } + } + + err = ext4_mark_inode_dirty(handle, inode); + if (err) { + ext4_std_error(sb, err); + goto fail_free_drop; + } + + ext4_debug("allocating inode %lu\n", inode->i_ino); + goto really_out; +fail: + ext4_std_error(sb, err); +out: + iput(inode); + ret = ERR_PTR(err); +really_out: + brelse(bitmap_bh); + return ret; + +fail_free_drop: + DQUOT_FREE_INODE(inode); + +fail_drop: + DQUOT_DROP(inode); + inode->i_flags |= S_NOQUOTA; + inode->i_nlink = 0; + iput(inode); + brelse(bitmap_bh); + return ERR_PTR(err); +} + +/* Verify that we are loading a valid orphan from disk */ +struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) +{ + unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); + ext4_group_t block_group; + int bit; + struct buffer_head *bitmap_bh; + struct inode *inode = NULL; + long err = -EIO; + + /* Error cases - e2fsck has already cleaned up for us */ + if (ino > max_ino) { + ext4_warning(sb, __func__, + "bad orphan ino %lu! e2fsck was run?", ino); + goto error; + } + + block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); + if (!bitmap_bh) { + ext4_warning(sb, __func__, + "inode bitmap error for orphan %lu", ino); + goto error; + } + + /* Having the inode bit set should be a 100% indicator that this + * is a valid orphan (no e2fsck run on fs). Orphans also include + * inodes that were being truncated, so we can't check i_nlink==0. + */ + if (!ext4_test_bit(bit, bitmap_bh->b_data)) + goto bad_orphan; + + inode = ext4_iget(sb, ino); + if (IS_ERR(inode)) + goto iget_failed; + + /* + * If the orphans has i_nlinks > 0 then it should be able to be + * truncated, otherwise it won't be removed from the orphan list + * during processing and an infinite loop will result. + */ + if (inode->i_nlink && !ext4_can_truncate(inode)) + goto bad_orphan; + + if (NEXT_ORPHAN(inode) > max_ino) + goto bad_orphan; + brelse(bitmap_bh); + return inode; + +iget_failed: + err = PTR_ERR(inode); + inode = NULL; +bad_orphan: + ext4_warning(sb, __func__, + "bad orphan inode %lu! e2fsck was run?", ino); + printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", + bit, (unsigned long long)bitmap_bh->b_blocknr, + ext4_test_bit(bit, bitmap_bh->b_data)); + printk(KERN_NOTICE "inode=%p\n", inode); + if (inode) { + printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", + is_bad_inode(inode)); + printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", + NEXT_ORPHAN(inode)); + printk(KERN_NOTICE "max_ino=%lu\n", max_ino); + printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); + /* Avoid freeing blocks if we got a bad deleted inode */ + if (inode->i_nlink == 0) + inode->i_blocks = 0; + iput(inode); + } + brelse(bitmap_bh); +error: + return ERR_PTR(err); +} + +unsigned long ext4_count_free_inodes(struct super_block *sb) +{ + unsigned long desc_count; + struct ext4_group_desc *gdp; + ext4_group_t i; +#ifdef EXT4FS_DEBUG + struct ext4_super_block *es; + unsigned long bitmap_count, x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT4_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + brelse(bitmap_bh); + bitmap_bh = ext4_read_inode_bitmap(sb, i); + if (!bitmap_bh) + continue; + + x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); + printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", + i, le16_to_cpu(gdp->bg_free_inodes_count), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk(KERN_DEBUG "ext4_count_free_inodes: " + "stored = %u, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); + return desc_count; +#else + desc_count = 0; + for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + cond_resched(); + } + return desc_count; +#endif +} + +/* Called at mount-time, super-block is locked */ +unsigned long ext4_count_dirs(struct super_block * sb) +{ + unsigned long count = 0; + ext4_group_t i; + + for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + count += le16_to_cpu(gdp->bg_used_dirs_count); + } + return count; +} + diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c new file mode 100644 index 0000000..b7f20b0 --- /dev/null +++ b/fs/ext4/inode.c @@ -0,0 +1,5065 @@ +/* + * linux/fs/ext4/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@redhat.com), 1993, 1998 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + * + * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/jbd2.h> +#include <linux/highuid.h> +#include <linux/pagemap.h> +#include <linux/quotaops.h> +#include <linux/string.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include <linux/mpage.h> +#include <linux/uio.h> +#include <linux/bio.h> +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" +#include "ext4_extents.h" + +#define MPAGE_DA_EXTENT_TAIL 0x01 + +static inline int ext4_begin_ordered_truncate(struct inode *inode, + loff_t new_size) +{ + return jbd2_journal_begin_ordered_truncate( + EXT4_SB(inode->i_sb)->s_journal, + &EXT4_I(inode)->jinode, + new_size); +} + +static void ext4_invalidatepage(struct page *page, unsigned long offset); + +/* + * Test whether an inode is a fast symlink. + */ +static int ext4_inode_is_fast_symlink(struct inode *inode) +{ + int ea_blocks = EXT4_I(inode)->i_file_acl ? + (inode->i_sb->s_blocksize >> 9) : 0; + + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); +} + +/* + * The ext4 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. + * + * "bh" may be NULL: a metadata block may have been freed from memory + * but there may still be a record of it in the journal, and that record + * still needs to be revoked. + */ +int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr) +{ + int err; + + might_sleep(); + + BUFFER_TRACE(bh, "enter"); + + jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " + "data mode %lx\n", + bh, is_metadata, inode->i_mode, + test_opt(inode->i_sb, DATA_FLAGS)); + + /* Never use the revoke function if we are doing full data + * journaling: there is no need to, and a V1 superblock won't + * support it. Otherwise, only skip the revoke on un-journaled + * data blocks. */ + + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (!is_metadata && !ext4_should_journal_data(inode))) { + if (bh) { + BUFFER_TRACE(bh, "call jbd2_journal_forget"); + return ext4_journal_forget(handle, bh); + } + return 0; + } + + /* + * data!=journal && (is_metadata || should_journal_data(inode)) + */ + BUFFER_TRACE(bh, "call ext4_journal_revoke"); + err = ext4_journal_revoke(handle, blocknr, bh); + if (err) + ext4_abort(inode->i_sb, __func__, + "error %d when attempting revoke", err); + BUFFER_TRACE(bh, "exit"); + return err; +} + +/* + * Work out how many blocks we need to proceed with the next chunk of a + * truncate transaction. + */ +static unsigned long blocks_for_truncate(struct inode *inode) +{ + ext4_lblk_t needed; + + needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); + + /* Give ourselves just enough room to cope with inodes in which + * i_blocks is corrupt: we've seen disk corruptions in the past + * which resulted in random data in an inode which looked enough + * like a regular file for ext4 to try to delete it. Things + * will go a bit crazy if that happens, but at least we should + * try not to panic the whole kernel. */ + if (needed < 2) + needed = 2; + + /* But we need to bound the transaction so we don't overflow the + * journal. */ + if (needed > EXT4_MAX_TRANS_DATA) + needed = EXT4_MAX_TRANS_DATA; + + return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; +} + +/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. + * + * start_transaction gets us a new handle for a truncate transaction, + * and extend_transaction tries to extend the existing one a bit. If + * extend fails, we need to propagate the failure up and restart the + * transaction in the top-level truncate loop. --sct + */ +static handle_t *start_transaction(struct inode *inode) +{ + handle_t *result; + + result = ext4_journal_start(inode, blocks_for_truncate(inode)); + if (!IS_ERR(result)) + return result; + + ext4_std_error(inode->i_sb, PTR_ERR(result)); + return result; +} + +/* + * Try to extend this transaction for the purposes of truncation. + * + * Returns 0 if we managed to create more room. If we can't create more + * room, and the transaction must be restarted we return 1. + */ +static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +{ + if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) + return 0; + if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) + return 0; + return 1; +} + +/* + * Restart the transaction associated with *handle. This does a commit, + * so before we call here everything must be consistently dirtied against + * this transaction. + */ +static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) +{ + jbd_debug(2, "restarting handle %p\n", handle); + return ext4_journal_restart(handle, blocks_for_truncate(inode)); +} + +/* + * Called at the last iput() if i_nlink is zero. + */ +void ext4_delete_inode(struct inode *inode) +{ + handle_t *handle; + int err; + + if (ext4_should_order_data(inode)) + ext4_begin_ordered_truncate(inode, 0); + truncate_inode_pages(&inode->i_data, 0); + + if (is_bad_inode(inode)) + goto no_delete; + + handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); + if (IS_ERR(handle)) { + ext4_std_error(inode->i_sb, PTR_ERR(handle)); + /* + * If we're going to skip the normal cleanup, we still need to + * make sure that the in-core orphan linked list is properly + * cleaned up. + */ + ext4_orphan_del(NULL, inode); + goto no_delete; + } + + if (IS_SYNC(inode)) + handle->h_sync = 1; + inode->i_size = 0; + err = ext4_mark_inode_dirty(handle, inode); + if (err) { + ext4_warning(inode->i_sb, __func__, + "couldn't mark inode dirty (err %d)", err); + goto stop_handle; + } + if (inode->i_blocks) + ext4_truncate(inode); + + /* + * ext4_ext_truncate() doesn't reserve any slop when it + * restarts journal transactions; therefore there may not be + * enough credits left in the handle to remove the inode from + * the orphan list and set the dtime field. + */ + if (handle->h_buffer_credits < 3) { + err = ext4_journal_extend(handle, 3); + if (err > 0) + err = ext4_journal_restart(handle, 3); + if (err != 0) { + ext4_warning(inode->i_sb, __func__, + "couldn't extend journal (err %d)", err); + stop_handle: + ext4_journal_stop(handle); + goto no_delete; + } + } + + /* + * Kill off the orphan record which ext4_truncate created. + * AKPM: I think this can be inside the above `if'. + * Note that ext4_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - this is because we don't + * know if ext4_truncate() actually created an orphan record. + * (Well, we could do this if we need to, but heck - it works) + */ + ext4_orphan_del(handle, inode); + EXT4_I(inode)->i_dtime = get_seconds(); + + /* + * One subtle ordering requirement: if anything has gone wrong + * (transaction abort, IO errors, whatever), then we can still + * do these next steps (the fs will already have been marked as + * having errors), but we can't free the inode if the mark_dirty + * fails. + */ + if (ext4_mark_inode_dirty(handle, inode)) + /* If that failed, just do the required in-core inode clear. */ + clear_inode(inode); + else + ext4_free_inode(handle, inode); + ext4_journal_stop(handle); + return; +no_delete: + clear_inode(inode); /* We must guarantee clearing of inode... */ +} + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +/** + * ext4_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * + * To store the locations of file's data ext4 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. + */ + +static int ext4_block_to_path(struct inode *inode, + ext4_lblk_t i_block, + ext4_lblk_t offsets[4], int *boundary) +{ + int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT4_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < 0) { + ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0"); + } else if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ((i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT4_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT4_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT4_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext4_warning(inode->i_sb, "ext4_block_to_path", + "block %lu > max in inode %lu", + i_block + direct_blocks + + indirect_blocks + double_blocks, inode->i_ino); + } + if (boundary) + *boundary = final - 1 - (i_block & (ptrs - 1)); + return n; +} + +/** + * ext4_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples <key, p, bh> and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). + * + * Need to be called with + * down_read(&EXT4_I(inode)->i_data_sem) + */ +static Indirect *ext4_get_branch(struct inode *inode, int depth, + ext4_lblk_t *offsets, + Indirect chain[4], int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_bread(sb, le32_to_cpu(p->key)); + if (!bh) + goto failure; + add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext4_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the preferred place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same + * cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ +static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; + __le32 *p; + ext4_fsblk_t bg_start; + ext4_fsblk_t last_block; + ext4_grpblk_t colour; + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) { + if (*p) + return le32_to_cpu(*p); + } + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be referred to from the inode itself? OK, just put it + * into the same cylinder group then. + */ + bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group); + last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (current->pid % 16) * + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (current->pid % 16) * ((last_block - bg_start) / 16); + return bg_start + colour; +} + +/** + * ext4_find_goal - find a preferred place for allocation. + * @inode: owner + * @block: block we want + * @partial: pointer to the last triple within a chain + * + * Normally this function find the preferred place for block allocation, + * returns it. + */ +static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, + Indirect *partial) +{ + /* + * XXX need to get goal block from mballoc's data structures + */ + + return ext4_find_near(inode, partial); +} + +/** + * ext4_blks_to_allocate: Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. + * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks, + int blocks_to_boundary) +{ + unsigned long count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now we don't handle cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary && + le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext4_alloc_blocks: multiple allocate blocks needed for a branch + * @indirect_blks: the number of blocks need to allocate for indirect + * blocks + * + * @new_blocks: on return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block, + * @blks: on return it will store the total number of allocated + * direct blocks + */ +static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, ext4_fsblk_t goal, + int indirect_blks, int blks, + ext4_fsblk_t new_blocks[4], int *err) +{ + int target, i; + unsigned long count = 0, blk_allocated = 0; + int index = 0; + ext4_fsblk_t current_block = 0; + int ret = 0; + + /* + * Here we try to allocate the requested multiple blocks at once, + * on a best-effort basis. + * To build a branch, we should allocate blocks for + * the indirect blocks(if not allocated yet), and at least + * the first direct block of this branch. That's the + * minimum number of blocks need to allocate(required) + */ + /* first we try to allocate the indirect blocks */ + target = indirect_blks; + while (target > 0) { + count = target; + /* allocating blocks for indirect blocks and direct blocks */ + current_block = ext4_new_meta_blocks(handle, inode, + goal, &count, err); + if (*err) + goto failed_out; + + target -= count; + /* allocate blocks for indirect blocks */ + while (index < indirect_blks && count) { + new_blocks[index++] = current_block++; + count--; + } + if (count > 0) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + printk(KERN_INFO "%s returned more blocks than " + "requested\n", __func__); + WARN_ON(1); + break; + } + } + + target = blks - count ; + blk_allocated = count; + if (!target) + goto allocated; + /* Now allocate data blocks */ + count = target; + /* allocating blocks for data blocks */ + current_block = ext4_new_blocks(handle, inode, iblock, + goal, &count, err); + if (*err && (target == blks)) { + /* + * if the allocation failed and we didn't allocate + * any blocks before + */ + goto failed_out; + } + if (!*err) { + if (target == blks) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + } + blk_allocated += count; + } +allocated: + /* total number of blocks allocated for direct blocks */ + ret = blk_allocated; + *err = 0; + return ret; +failed_out: + for (i = 0; i < index; i++) + ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); + return ret; +} + +/** + * ext4_alloc_branch - allocate and set up a chain of blocks. + * @inode: owner + * @indirect_blks: number of allocated indirect blocks + * @blks: number of allocated direct blocks + * @offsets: offsets (in the blocks) to store the pointers to next. + * @branch: place to store the chain in. + * + * This function allocates blocks, zeroes out all but the last one, + * links them into chain and (if we are synchronous) writes them to disk. + * In other words, it prepares a branch that can be spliced onto the + * inode. It stores the information about that chain in the branch[], in + * the same format as ext4_get_branch() would do. We are calling it after + * we had read the existing part of chain and partial points to the last + * triple of that (one with zero ->key). Upon the exit we have the same + * picture as after the successful ext4_get_block(), except that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. + */ +static int ext4_alloc_branch(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, int indirect_blks, + int *blks, ext4_fsblk_t goal, + ext4_lblk_t *offsets, Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int i, n = 0; + int err = 0; + struct buffer_head *bh; + int num; + ext4_fsblk_t new_blocks[4]; + ext4_fsblk_t current_block; + + num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, + *blks, new_blocks, &err); + if (err) + return err; + + branch[0].key = cpu_to_le32(new_blocks[0]); + /* + * metadata blocks and data blocks are allocated. + */ + for (n = 1; n <= indirect_blks; n++) { + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + branch[n].bh = bh; + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext4_journal_get_create_access(handle, bh); + if (err) { + unlock_buffer(bh); + brelse(bh); + goto failed; + } + + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); + *branch[n].p = branch[n].key; + if (n == indirect_blks) { + current_block = new_blocks[n]; + /* + * End of chain, update the last new metablock of + * the chain to point to the new allocated + * data blocks numbers + */ + for (i=1; i < num; i++) + *(branch[n].p + i) = cpu_to_le32(++current_block); + } + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); + err = ext4_journal_dirty_metadata(handle, bh); + if (err) + goto failed; + } + *blks = num; + return err; +failed: + /* Allocation failed, free what we already allocated */ + for (i = 1; i <= n ; i++) { + BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); + ext4_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < indirect_blks; i++) + ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); + + ext4_free_blocks(handle, inode, new_blocks[i], num, 0); + + return err; +} + +/** + * ext4_splice_branch - splice the allocated branch onto inode. + * @inode: owner + * @block: (logical) number of block we are adding + * @chain: chain of indirect blocks (with a missing link - see + * ext4_alloc_branch) + * @where: location of missing link + * @num: number of indirect blocks we are adding + * @blks: number of direct blocks we are adding + * + * This function fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. + */ +static int ext4_splice_branch(handle_t *handle, struct inode *inode, + ext4_lblk_t block, Indirect *where, int num, int blks) +{ + int i; + int err = 0; + ext4_fsblk_t current_block; + + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. + */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, where->bh); + if (err) + goto err_out; + } + /* That's it */ + + *where->p = where->key; + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && blks > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < blks; i++) + *(where->p + i) = cpu_to_le32(current_block++); + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + + inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. + */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata"); + err = ext4_journal_dirty_metadata(handle, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + * Inode was dirtied above. + */ + jbd_debug(5, "splicing direct\n"); + } + return err; + +err_out: + for (i = 1; i <= num; i++) { + BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); + ext4_journal_forget(handle, where[i].bh); + ext4_free_blocks(handle, inode, + le32_to_cpu(where[i-1].key), 1, 0); + } + ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0); + + return err; +} + +/* + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * `handle' can be NULL if create == 0. + * + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. + * + * + * Need to be called with + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block + * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) + */ +int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, unsigned long maxblocks, + struct buffer_head *bh_result, + int create, int extend_disksize) +{ + int err = -EIO; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + ext4_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; + struct ext4_inode_info *ei = EXT4_I(inode); + int count = 0; + ext4_fsblk_t first_block = 0; + loff_t disksize; + + + J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); + J_ASSERT(handle != NULL || create == 0); + depth = ext4_block_to_path(inode, iblock, offsets, + &blocks_to_boundary); + + if (depth == 0) + goto out; + + partial = ext4_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); + clear_buffer_new(bh_result); + count++; + /*map more blocks*/ + while (count < maxblocks && count <= blocks_to_boundary) { + ext4_fsblk_t blk; + + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) + count++; + else + break; + } + goto got_it; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if (!create || err == -EIO) + goto cleanup; + + /* + * Okay, we need to do block allocation. + */ + goal = ext4_find_goal(inode, iblock, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + count = ext4_blks_to_allocate(partial, indirect_blks, + maxblocks, blocks_to_boundary); + /* + * Block out ext4_truncate while we alter the tree + */ + err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, + &count, goal, + offsets + (partial - chain), partial); + + /* + * The ext4_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. --sct + */ + if (!err) + err = ext4_splice_branch(handle, inode, iblock, + partial, indirect_blks, count); + /* + * i_disksize growing is protected by i_data_sem. Don't forget to + * protect it if you're about to implement concurrent + * ext4_get_block() -bzzz + */ + if (!err && extend_disksize) { + disksize = ((loff_t) iblock + count) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > ei->i_disksize) + ei->i_disksize = disksize; + } + if (err) + goto cleanup; + + set_buffer_new(bh_result); +got_it: + map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); + if (count > blocks_to_boundary) + set_buffer_boundary(bh_result); + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + BUFFER_TRACE(bh_result, "returned"); +out: + return err; +} + +/* + * Calculate the number of metadata blocks need to reserve + * to allocate @blocks for non extent file based file + */ +static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) +{ + int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ind_blks, dind_blks, tind_blks; + + /* number of new indirect blocks needed */ + ind_blks = (blocks + icap - 1) / icap; + + dind_blks = (ind_blks + icap - 1) / icap; + + tind_blks = 1; + + return ind_blks + dind_blks + tind_blks; +} + +/* + * Calculate the number of metadata blocks need to reserve + * to allocate given number of blocks + */ +static int ext4_calc_metadata_amount(struct inode *inode, int blocks) +{ + if (!blocks) + return 0; + + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) + return ext4_ext_calc_metadata_amount(inode, blocks); + + return ext4_indirect_calc_metadata_amount(inode, blocks); +} + +static void ext4_da_update_reserve_space(struct inode *inode, int used) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int total, mdb, mdb_free; + + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + /* recalculate the number of metablocks still need to be reserved */ + total = EXT4_I(inode)->i_reserved_data_blocks - used; + mdb = ext4_calc_metadata_amount(inode, total); + + /* figure out how many metablocks to release */ + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; + + if (mdb_free) { + /* Account for allocated meta_blocks */ + mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; + + /* update fs dirty blocks counter */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); + EXT4_I(inode)->i_allocated_meta_blocks = 0; + EXT4_I(inode)->i_reserved_meta_blocks = mdb; + } + + /* update per-inode reservations */ + BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); + EXT4_I(inode)->i_reserved_data_blocks -= used; + + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); +} + +/* + * The ext4_get_blocks_wrap() function try to look up the requested blocks, + * and returns if the blocks are already mapped. + * + * Otherwise it takes the write lock of the i_data_sem and allocate blocks + * and store the allocated blocks in the result buffer head and mark it + * mapped. + * + * If file type is extents based, it will call ext4_ext_get_blocks(), + * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping + * based files + * + * On success, it returns the number of blocks being mapped or allocate. + * if create==0 and the blocks are pre-allocated and uninitialized block, + * the result buffer head is unmapped. If the create ==1, it will make sure + * the buffer head is mapped. + * + * It returns 0 if plain look up failed (blocks have not been allocated), in + * that casem, buffer head is unmapped + * + * It returns the error in case of allocation failure. + */ +int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, + unsigned long max_blocks, struct buffer_head *bh, + int create, int extend_disksize, int flag) +{ + int retval; + + clear_buffer_mapped(bh); + + /* + * Try to see if we can get the block without requesting + * for new file system block. + */ + down_read((&EXT4_I(inode)->i_data_sem)); + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { + retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, + bh, 0, 0); + } else { + retval = ext4_get_blocks_handle(handle, + inode, block, max_blocks, bh, 0, 0); + } + up_read((&EXT4_I(inode)->i_data_sem)); + + /* If it is only a block(s) look up */ + if (!create) + return retval; + + /* + * Returns if the blocks have already allocated + * + * Note that if blocks have been preallocated + * ext4_ext_get_block() returns th create = 0 + * with buffer head unmapped. + */ + if (retval > 0 && buffer_mapped(bh)) + return retval; + + /* + * New blocks allocate and/or writing to uninitialized extent + * will possibly result in updating i_data, so we take + * the write lock of i_data_sem, and call get_blocks() + * with create == 1 flag. + */ + down_write((&EXT4_I(inode)->i_data_sem)); + + /* + * if the caller is from delayed allocation writeout path + * we have already reserved fs blocks for allocation + * let the underlying get_block() function know to + * avoid double accounting + */ + if (flag) + EXT4_I(inode)->i_delalloc_reserved_flag = 1; + /* + * We need to check for EXT4 here because migrate + * could have changed the inode type in between + */ + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { + retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, + bh, create, extend_disksize); + } else { + retval = ext4_get_blocks_handle(handle, inode, block, + max_blocks, bh, create, extend_disksize); + + if (retval > 0 && buffer_new(bh)) { + /* + * We allocated new blocks which will result in + * i_data's format changing. Force the migrate + * to fail by clearing migrate flags + */ + EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & + ~EXT4_EXT_MIGRATE; + } + } + + if (flag) { + EXT4_I(inode)->i_delalloc_reserved_flag = 0; + /* + * Update reserved blocks/metadata blocks + * after successful block allocation + * which were deferred till now + */ + if ((retval > 0) && buffer_delay(bh)) + ext4_da_update_reserve_space(inode, retval); + } + + up_write((&EXT4_I(inode)->i_data_sem)); + return retval; +} + +/* Maximum number of blocks we map for direct IO at once. */ +#define DIO_MAX_BLOCKS 4096 + +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + handle_t *handle = ext4_journal_current_handle(); + int ret = 0, started = 0; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + int dio_credits; + + if (create && !handle) { + /* Direct IO write... */ + if (max_blocks > DIO_MAX_BLOCKS) + max_blocks = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); + handle = ext4_journal_start(inode, dio_credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + started = 1; + } + + ret = ext4_get_blocks_wrap(handle, inode, iblock, + max_blocks, bh_result, create, 0, 0); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + if (started) + ext4_journal_stop(handle); +out: + return ret; +} + +/* + * `handle' can be NULL if create is zero + */ +struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, + ext4_lblk_t block, int create, int *errp) +{ + struct buffer_head dummy; + int fatal = 0, err; + + J_ASSERT(handle != NULL || create == 0); + + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); + err = ext4_get_blocks_wrap(handle, inode, block, 1, + &dummy, create, 1, 0); + /* + * ext4_get_blocks_handle() returns number of blocks + * mapped. 0 in case of a HOLE. + */ + if (err > 0) { + if (err > 1) + WARN_ON(1); + err = 0; + } + *errp = err; + if (!err && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); + if (!bh) { + *errp = -EIO; + goto err; + } + if (buffer_new(&dummy)) { + J_ASSERT(create != 0); + J_ASSERT(handle != NULL); + + /* + * Now that we do not always journal data, we should + * keep in mind whether this should always journal the + * new buffer as metadata. For now, regular file + * writes use ext4_get_block instead, so it's not a + * problem. + */ + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + fatal = ext4_journal_get_create_access(handle, bh); + if (!fatal && !buffer_uptodate(bh)) { + memset(bh->b_data, 0, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + } + unlock_buffer(bh); + BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); + err = ext4_journal_dirty_metadata(handle, bh); + if (!fatal) + fatal = err; + } else { + BUFFER_TRACE(bh, "not a new buffer"); + } + if (fatal) { + *errp = fatal; + brelse(bh); + bh = NULL; + } + return bh; + } +err: + return NULL; +} + +struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, + ext4_lblk_t block, int create, int *err) +{ + struct buffer_head *bh; + + bh = ext4_getblk(handle, inode, block, create, err); + if (!bh) + return bh; + if (buffer_uptodate(bh)) + return bh; + ll_rw_block(READ_META, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + put_bh(bh); + *err = -EIO; + return NULL; +} + +static int walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)) +{ + struct buffer_head *bh; + unsigned block_start, block_end; + unsigned blocksize = head->b_size; + int err, ret = 0; + struct buffer_head *next; + + for (bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) + { + next = bh->b_this_page; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (partial && !buffer_uptodate(bh)) + *partial = 1; + continue; + } + err = (*fn)(handle, bh); + if (!ret) + ret = err; + } + return ret; +} + +/* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction. We cannot + * close off a transaction and start a new one between the ext4_get_block() + * and the commit_write(). So doing the jbd2_journal_start at the start of + * prepare_write() is the right place. + * + * Also, this function can nest inside ext4_writepage() -> + * block_write_full_page(). In that case, we *know* that ext4_writepage() + * has generated enough buffer credits to do the whole page. So we won't + * block on the journal in that case, which is good, because the caller may + * be PF_MEMALLOC. + * + * By accident, ext4 can be reentered when a transaction is open via + * quota file writes. If we were to commit the transaction while thus + * reentered, there can be a deadlock - we would be holding a quota + * lock, and the commit would never complete if another thread had a + * transaction open and was blocking on the quota lock - a ranking + * violation. + * + * So what we do is to rely on the fact that jbd2_journal_stop/journal_start + * will _not_ run commit under these circumstances because handle->h_ref + * is elevated. We'll still have enough credits for the tiny quotafile + * write. + */ +static int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) +{ + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + return ext4_journal_get_write_access(handle, bh); +} + +static int ext4_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + int ret, needed_blocks = ext4_writepage_trans_blocks(inode); + handle_t *handle; + int retries = 0; + struct page *page; + pgoff_t index; + unsigned from, to; + + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + +retry: + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + /* We cannot recurse into the filesystem as the transaction is already + * started */ + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; + } + *pagep = page; + + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + ext4_get_block); + + if (!ret && ext4_should_journal_data(inode)) { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, do_journal_get_write_access); + } + + if (ret) { + unlock_page(page); + ext4_journal_stop(handle); + page_cache_release(page); + /* + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); + } + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; +out: + return ret; +} + +/* For write_end() in data=journal mode */ +static int write_end_fn(handle_t *handle, struct buffer_head *bh) +{ + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + set_buffer_uptodate(bh); + return ext4_journal_dirty_metadata(handle, bh); +} + +/* + * We need to pick up the new inode size which generic_commit_write gave us + * `file' can be NULL - eg, when called from page_symlink(). + * + * ext4 never places buffers on inode->i_mapping->private_list. metadata + * buffers are managed internally. + */ +static int ext4_ordered_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + int ret = 0, ret2; + + ret = ext4_jbd2_file_inode(handle, inode); + + if (ret == 0) { + loff_t new_i_size; + + new_i_size = pos + copied; + if (new_i_size > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, new_i_size); + /* We need to mark inode dirty even if + * new_i_size is less that inode->i_size + * bu greater than i_disksize.(hint delalloc) + */ + ext4_mark_inode_dirty(handle, inode); + } + + ret2 = generic_write_end(file, mapping, pos, len, copied, + page, fsdata); + copied = ret2; + if (ret2 < 0) + ret = ret2; + } + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + return ret ? ret : copied; +} + +static int ext4_writeback_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + int ret = 0, ret2; + loff_t new_i_size; + + new_i_size = pos + copied; + if (new_i_size > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, new_i_size); + /* We need to mark inode dirty even if + * new_i_size is less that inode->i_size + * bu greater than i_disksize.(hint delalloc) + */ + ext4_mark_inode_dirty(handle, inode); + } + + ret2 = generic_write_end(file, mapping, pos, len, copied, + page, fsdata); + copied = ret2; + if (ret2 < 0) + ret = ret2; + + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + return ret ? ret : copied; +} + +static int ext4_journalled_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + int ret = 0, ret2; + int partial = 0; + unsigned from, to; + loff_t new_i_size; + + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + + if (copied < len) { + if (!PageUptodate(page)) + copied = 0; + page_zero_new_buffers(page, from+copied, to); + } + + ret = walk_page_buffers(handle, page_buffers(page), from, + to, &partial, write_end_fn); + if (!partial) + SetPageUptodate(page); + new_i_size = pos + copied; + if (new_i_size > inode->i_size) + i_size_write(inode, pos+copied); + EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; + if (new_i_size > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, new_i_size); + ret2 = ext4_mark_inode_dirty(handle, inode); + if (!ret) + ret = ret2; + } + + unlock_page(page); + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + page_cache_release(page); + + return ret ? ret : copied; +} + +static int ext4_da_reserve_space(struct inode *inode, int nrblocks) +{ + int retries = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned long md_needed, mdblocks, total = 0; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks + * worse case is one extent per block + */ +repeat: + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; + mdblocks = ext4_calc_metadata_amount(inode, total); + BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); + + md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; + total = md_needed + nrblocks; + + if (ext4_claim_free_blocks(sbi, total)) { + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + if (ext4_should_retry_alloc(inode->i_sb, &retries)) { + yield(); + goto repeat; + } + return -ENOSPC; + } + EXT4_I(inode)->i_reserved_data_blocks += nrblocks; + EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; + + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return 0; /* success */ +} + +static void ext4_da_release_space(struct inode *inode, int to_free) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int total, mdb, mdb_free, release; + + if (!to_free) + return; /* Nothing to release, exit */ + + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + + if (!EXT4_I(inode)->i_reserved_data_blocks) { + /* + * if there is no reserved blocks, but we try to free some + * then the counter is messed up somewhere. + * but since this function is called from invalidate + * page, it's harmless to return without any action + */ + printk(KERN_INFO "ext4 delalloc try to release %d reserved " + "blocks for inode %lu, but there is no reserved " + "data blocks\n", to_free, inode->i_ino); + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return; + } + + /* recalculate the number of metablocks still need to be reserved */ + total = EXT4_I(inode)->i_reserved_data_blocks - to_free; + mdb = ext4_calc_metadata_amount(inode, total); + + /* figure out how many metablocks to release */ + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; + + release = to_free + mdb_free; + + /* update fs dirty blocks counter for truncate case */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, release); + + /* update per-inode reservations */ + BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); + EXT4_I(inode)->i_reserved_data_blocks -= to_free; + + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + EXT4_I(inode)->i_reserved_meta_blocks = mdb; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); +} + +static void ext4_da_page_release_reservation(struct page *page, + unsigned long offset) +{ + int to_release = 0; + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + + if ((offset <= curr_off) && (buffer_delay(bh))) { + to_release++; + clear_buffer_delay(bh); + } + curr_off = next_off; + } while ((bh = bh->b_this_page) != head); + ext4_da_release_space(page->mapping->host, to_release); +} + +/* + * Delayed allocation stuff + */ + +struct mpage_da_data { + struct inode *inode; + struct buffer_head lbh; /* extent of blocks */ + unsigned long first_page, next_page; /* extent of pages */ + get_block_t *get_block; + struct writeback_control *wbc; + int io_done; + long pages_written; + int retval; +}; + +/* + * mpage_da_submit_io - walks through extent of pages and try to write + * them with writepage() call back + * + * @mpd->inode: inode + * @mpd->first_page: first page of the extent + * @mpd->next_page: page after the last page of the extent + * @mpd->get_block: the filesystem's block mapper function + * + * By the time mpage_da_submit_io() is called we expect all blocks + * to be allocated. this may be wrong if allocation failed. + * + * As pages are already locked by write_cache_pages(), we can't use it + */ +static int mpage_da_submit_io(struct mpage_da_data *mpd) +{ + long pages_skipped; + struct pagevec pvec; + unsigned long index, end; + int ret = 0, err, nr_pages, i; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + + BUG_ON(mpd->next_page <= mpd->first_page); + /* + * We need to start from the first_page to the next_page - 1 + * to make sure we also write the mapped dirty buffer_heads. + * If we look at mpd->lbh.b_blocknr we would only be looking + * at the currently mapped buffer_heads. + */ + index = mpd->first_page; + end = mpd->next_page - 1; + + pagevec_init(&pvec, 0); + while (index <= end) { + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + index = page->index; + if (index > end) + break; + index++; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + + pages_skipped = mpd->wbc->pages_skipped; + err = mapping->a_ops->writepage(page, mpd->wbc); + if (!err && (pages_skipped == mpd->wbc->pages_skipped)) + /* + * have successfully written the page + * without skipping the same + */ + mpd->pages_written++; + /* + * In error case, we have to continue because + * remaining pages are still locked + * XXX: unlock and re-dirty them? + */ + if (ret == 0) + ret = err; + } + pagevec_release(&pvec); + } + return ret; +} + +/* + * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers + * + * @mpd->inode - inode to walk through + * @exbh->b_blocknr - first block on a disk + * @exbh->b_size - amount of space in bytes + * @logical - first logical block to start assignment with + * + * the function goes through all passed space and put actual disk + * block numbers into buffer heads, dropping BH_Delay + */ +static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, + struct buffer_head *exbh) +{ + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + int blocks = exbh->b_size >> inode->i_blkbits; + sector_t pblock = exbh->b_blocknr, cur_logical; + struct buffer_head *head, *bh; + pgoff_t index, end; + struct pagevec pvec; + int nr_pages, i; + + index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + pagevec_init(&pvec, 0); + + while (index <= end) { + /* XXX: optimize tail */ + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + index = page->index; + if (index > end) + break; + index++; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + BUG_ON(!page_has_buffers(page)); + + bh = page_buffers(page); + head = bh; + + /* skip blocks out of the range */ + do { + if (cur_logical >= logical) + break; + cur_logical++; + } while ((bh = bh->b_this_page) != head); + + do { + if (cur_logical >= logical + blocks) + break; + if (buffer_delay(bh)) { + bh->b_blocknr = pblock; + clear_buffer_delay(bh); + bh->b_bdev = inode->i_sb->s_bdev; + } else if (buffer_unwritten(bh)) { + bh->b_blocknr = pblock; + clear_buffer_unwritten(bh); + set_buffer_mapped(bh); + set_buffer_new(bh); + bh->b_bdev = inode->i_sb->s_bdev; + } else if (buffer_mapped(bh)) + BUG_ON(bh->b_blocknr != pblock); + + cur_logical++; + pblock++; + } while ((bh = bh->b_this_page) != head); + } + pagevec_release(&pvec); + } +} + + +/* + * __unmap_underlying_blocks - just a helper function to unmap + * set of blocks described by @bh + */ +static inline void __unmap_underlying_blocks(struct inode *inode, + struct buffer_head *bh) +{ + struct block_device *bdev = inode->i_sb->s_bdev; + int blocks, i; + + blocks = bh->b_size >> inode->i_blkbits; + for (i = 0; i < blocks; i++) + unmap_underlying_metadata(bdev, bh->b_blocknr + i); +} + +static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, + sector_t logical, long blk_cnt) +{ + int nr_pages, i; + pgoff_t index, end; + struct pagevec pvec; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + + index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + end = (logical + blk_cnt - 1) >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + while (index <= end) { + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + index = page->index; + if (index > end) + break; + index++; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + block_invalidatepage(page, 0); + ClearPageUptodate(page); + unlock_page(page); + } + } + return; +} + +static void ext4_print_free_blocks(struct inode *inode) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + printk(KERN_EMERG "Total free blocks count %lld\n", + ext4_count_free_blocks(inode->i_sb)); + printk(KERN_EMERG "Free/Dirty block details\n"); + printk(KERN_EMERG "free_blocks=%lld\n", + percpu_counter_sum(&sbi->s_freeblocks_counter)); + printk(KERN_EMERG "dirty_blocks=%lld\n", + percpu_counter_sum(&sbi->s_dirtyblocks_counter)); + printk(KERN_EMERG "Block reservation details\n"); + printk(KERN_EMERG "i_reserved_data_blocks=%lu\n", + EXT4_I(inode)->i_reserved_data_blocks); + printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n", + EXT4_I(inode)->i_reserved_meta_blocks); + return; +} + +/* + * mpage_da_map_blocks - go through given space + * + * @mpd->lbh - bh describing space + * @mpd->get_block - the filesystem's block mapper function + * + * The function skips space we know is already mapped to disk blocks. + * + */ +static int mpage_da_map_blocks(struct mpage_da_data *mpd) +{ + int err = 0; + struct buffer_head new; + struct buffer_head *lbh = &mpd->lbh; + sector_t next; + + /* + * We consider only non-mapped and non-allocated blocks + */ + if (buffer_mapped(lbh) && !buffer_delay(lbh)) + return 0; + new.b_state = lbh->b_state; + new.b_blocknr = 0; + new.b_size = lbh->b_size; + next = lbh->b_blocknr; + /* + * If we didn't accumulate anything + * to write simply return + */ + if (!new.b_size) + return 0; + err = mpd->get_block(mpd->inode, next, &new, 1); + if (err) { + + /* If get block returns with error + * we simply return. Later writepage + * will redirty the page and writepages + * will find the dirty page again + */ + if (err == -EAGAIN) + return 0; + + if (err == -ENOSPC && + ext4_count_free_blocks(mpd->inode->i_sb)) { + mpd->retval = err; + return 0; + } + + /* + * get block failure will cause us + * to loop in writepages. Because + * a_ops->writepage won't be able to + * make progress. The page will be redirtied + * by writepage and writepages will again + * try to write the same. + */ + printk(KERN_EMERG "%s block allocation failed for inode %lu " + "at logical offset %llu with max blocks " + "%zd with error %d\n", + __func__, mpd->inode->i_ino, + (unsigned long long)next, + lbh->b_size >> mpd->inode->i_blkbits, err); + printk(KERN_EMERG "This should not happen.!! " + "Data will be lost\n"); + if (err == -ENOSPC) { + ext4_print_free_blocks(mpd->inode); + } + /* invlaidate all the pages */ + ext4_da_block_invalidatepages(mpd, next, + lbh->b_size >> mpd->inode->i_blkbits); + return err; + } + BUG_ON(new.b_size == 0); + + if (buffer_new(&new)) + __unmap_underlying_blocks(mpd->inode, &new); + + /* + * If blocks are delayed marked, we need to + * put actual blocknr and drop delayed bit + */ + if (buffer_delay(lbh) || buffer_unwritten(lbh)) + mpage_put_bnr_to_bhs(mpd, next, &new); + + return 0; +} + +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ + (1 << BH_Delay) | (1 << BH_Unwritten)) + +/* + * mpage_add_bh_to_extent - try to add one more block to extent of blocks + * + * @mpd->lbh - extent of blocks + * @logical - logical number of the block in the file + * @bh - bh of the block (used to access block's state) + * + * the function is used to collect contig. blocks in same state + */ +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, + sector_t logical, struct buffer_head *bh) +{ + sector_t next; + size_t b_size = bh->b_size; + struct buffer_head *lbh = &mpd->lbh; + int nrblocks = lbh->b_size >> mpd->inode->i_blkbits; + + /* check if thereserved journal credits might overflow */ + if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { + if (nrblocks >= EXT4_MAX_TRANS_DATA) { + /* + * With non-extent format we are limited by the journal + * credit available. Total credit needed to insert + * nrblocks contiguous blocks is dependent on the + * nrblocks. So limit nrblocks. + */ + goto flush_it; + } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > + EXT4_MAX_TRANS_DATA) { + /* + * Adding the new buffer_head would make it cross the + * allowed limit for which we have journal credit + * reserved. So limit the new bh->b_size + */ + b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << + mpd->inode->i_blkbits; + /* we will do mpage_da_submit_io in the next loop */ + } + } + /* + * First block in the extent + */ + if (lbh->b_size == 0) { + lbh->b_blocknr = logical; + lbh->b_size = b_size; + lbh->b_state = bh->b_state & BH_FLAGS; + return; + } + + next = lbh->b_blocknr + nrblocks; + /* + * Can we merge the block to our big extent? + */ + if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { + lbh->b_size += b_size; + return; + } + +flush_it: + /* + * We couldn't merge the block to our extent, so we + * need to flush current extent and start new one + */ + if (mpage_da_map_blocks(mpd) == 0) + mpage_da_submit_io(mpd); + mpd->io_done = 1; + return; +} + +/* + * __mpage_da_writepage - finds extent of pages and blocks + * + * @page: page to consider + * @wbc: not used, we just follow rules + * @data: context + * + * The function finds extents of pages and scan them for all blocks. + */ +static int __mpage_da_writepage(struct page *page, + struct writeback_control *wbc, void *data) +{ + struct mpage_da_data *mpd = data; + struct inode *inode = mpd->inode; + struct buffer_head *bh, *head, fake; + sector_t logical; + + if (mpd->io_done) { + /* + * Rest of the page in the page_vec + * redirty then and skip then. We will + * try to to write them again after + * starting a new transaction + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return MPAGE_DA_EXTENT_TAIL; + } + /* + * Can we merge this page to current extent? + */ + if (mpd->next_page != page->index) { + /* + * Nope, we can't. So, we map non-allocated blocks + * and start IO on them using writepage() + */ + if (mpd->next_page != mpd->first_page) { + if (mpage_da_map_blocks(mpd) == 0) + mpage_da_submit_io(mpd); + /* + * skip rest of the page in the page_vec + */ + mpd->io_done = 1; + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return MPAGE_DA_EXTENT_TAIL; + } + + /* + * Start next extent of pages ... + */ + mpd->first_page = page->index; + + /* + * ... and blocks + */ + mpd->lbh.b_size = 0; + mpd->lbh.b_state = 0; + mpd->lbh.b_blocknr = 0; + } + + mpd->next_page = page->index + 1; + logical = (sector_t) page->index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + if (!page_has_buffers(page)) { + /* + * There is no attached buffer heads yet (mmap?) + * we treat the page asfull of dirty blocks + */ + bh = &fake; + bh->b_size = PAGE_CACHE_SIZE; + bh->b_state = 0; + set_buffer_dirty(bh); + set_buffer_uptodate(bh); + mpage_add_bh_to_extent(mpd, logical, bh); + if (mpd->io_done) + return MPAGE_DA_EXTENT_TAIL; + } else { + /* + * Page with regular buffer heads, just add all dirty ones + */ + head = page_buffers(page); + bh = head; + do { + BUG_ON(buffer_locked(bh)); + /* + * We need to try to allocate + * unmapped blocks in the same page. + * Otherwise we won't make progress + * with the page in ext4_da_writepage + */ + if (buffer_dirty(bh) && + (!buffer_mapped(bh) || buffer_delay(bh))) { + mpage_add_bh_to_extent(mpd, logical, bh); + if (mpd->io_done) + return MPAGE_DA_EXTENT_TAIL; + } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { + /* + * mapped dirty buffer. We need to update + * the b_state because we look at + * b_state in mpage_da_map_blocks. We don't + * update b_size because if we find an + * unmapped buffer_head later we need to + * use the b_state flag of that buffer_head. + */ + if (mpd->lbh.b_size == 0) + mpd->lbh.b_state = + bh->b_state & BH_FLAGS; + } + logical++; + } while ((bh = bh->b_this_page) != head); + } + + return 0; +} + +/* + * mpage_da_writepages - walk the list of dirty pages of the given + * address space, allocates non-allocated blocks, maps newly-allocated + * blocks to existing bhs and issue IO them + * + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @get_block: the filesystem's block mapper function. + * + * This is a library function, which implements the writepages() + * address_space_operation. + */ +static int mpage_da_writepages(struct address_space *mapping, + struct writeback_control *wbc, + struct mpage_da_data *mpd) +{ + int ret; + + if (!mpd->get_block) + return generic_writepages(mapping, wbc); + + mpd->lbh.b_size = 0; + mpd->lbh.b_state = 0; + mpd->lbh.b_blocknr = 0; + mpd->first_page = 0; + mpd->next_page = 0; + mpd->io_done = 0; + mpd->pages_written = 0; + mpd->retval = 0; + + ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); + /* + * Handle last extent of pages + */ + if (!mpd->io_done && mpd->next_page != mpd->first_page) { + if (mpage_da_map_blocks(mpd) == 0) + mpage_da_submit_io(mpd); + + mpd->io_done = 1; + ret = MPAGE_DA_EXTENT_TAIL; + } + wbc->nr_to_write -= mpd->pages_written; + return ret; +} + +/* + * this is a special callback for ->write_begin() only + * it's intention is to return mapped block or reserve space + */ +static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + + BUG_ON(create == 0); + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); + + /* + * first, we need to know whether the block is allocated already + * preallocated blocks are unmapped but should treated + * the same as allocated blocks. + */ + ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); + if ((ret == 0) && !buffer_delay(bh_result)) { + /* the block isn't (pre)allocated yet, let's reserve space */ + /* + * XXX: __block_prepare_write() unmaps passed block, + * is it OK? + */ + ret = ext4_da_reserve_space(inode, 1); + if (ret) + /* not enough space to reserve */ + return ret; + + map_bh(bh_result, inode->i_sb, 0); + set_buffer_new(bh_result); + set_buffer_delay(bh_result); + } else if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + + return ret; +} +#define EXT4_DELALLOC_RSVED 1 +static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + loff_t disksize = EXT4_I(inode)->i_disksize; + handle_t *handle = NULL; + + handle = ext4_journal_current_handle(); + BUG_ON(!handle); + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, + bh_result, create, 0, EXT4_DELALLOC_RSVED); + if (ret > 0) { + + bh_result->b_size = (ret << inode->i_blkbits); + + if (ext4_should_order_data(inode)) { + int retval; + retval = ext4_jbd2_file_inode(handle, inode); + if (retval) + /* + * Failed to add inode for ordered + * mode. Don't update file size + */ + return retval; + } + + /* + * Update on-disk size along with block allocation + * we don't use 'extend_disksize' as size may change + * within already allocated block -bzzz + */ + disksize = ((loff_t) iblock + ret) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, disksize); + ret = ext4_mark_inode_dirty(handle, inode); + return ret; + } + ret = 0; + } + return ret; +} + +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) +{ + /* + * unmapped buffer is possible for holes. + * delay buffer is possible with delayed allocation + */ + return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); +} + +static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + + /* + * we don't want to do block allocation in writepage + * so call get_block_wrap with create = 0 + */ + ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, + bh_result, 0, 0, 0); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + return ret; +} + +/* + * get called vi ext4_da_writepages after taking page lock (have journal handle) + * get called via journal_submit_inode_data_buffers (no journal handle) + * get called via shrink_page_list via pdflush (no journal handle) + * or grab_page_cache when doing write_begin (have journal handle) + */ +static int ext4_da_writepage(struct page *page, + struct writeback_control *wbc) +{ + int ret = 0; + loff_t size; + unsigned long len; + struct buffer_head *page_bufs; + struct inode *inode = page->mapping->host; + + size = i_size_read(inode); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (page_has_buffers(page)) { + page_bufs = page_buffers(page); + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_unmapped_or_delay)) { + /* + * We don't want to do block allocation + * So redirty the page and return + * We may reach here when we do a journal commit + * via journal_submit_inode_data_buffers. + * If we don't have mapping block we just ignore + * them. We can also reach here via shrink_page_list + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } else { + /* + * The test for page_has_buffers() is subtle: + * We know the page is dirty but it lost buffers. That means + * that at some moment in time after write_begin()/write_end() + * has been called all buffers have been clean and thus they + * must have been written at least once. So they are all + * mapped and we can happily proceed with mapping them + * and writing the page. + * + * Try to initialize the buffer_heads and check whether + * all are mapped and non delay. We don't want to + * do block allocation here. + */ + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, + ext4_normal_get_block_write); + if (!ret) { + page_bufs = page_buffers(page); + /* check whether all are mapped and non delay */ + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_unmapped_or_delay)) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } else { + /* + * We can't do block allocation here + * so just redity the page and unlock + * and return + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + /* now mark the buffer_heads as dirty and uptodate */ + block_commit_write(page, 0, PAGE_CACHE_SIZE); + } + + if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) + ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); + else + ret = block_write_full_page(page, + ext4_normal_get_block_write, + wbc); + + return ret; +} + +/* + * This is called via ext4_da_writepages() to + * calulate the total number of credits to reserve to fit + * a single extent allocation into a single transaction, + * ext4_da_writpeages() will loop calling this before + * the block allocation. + */ + +static int ext4_da_writepages_trans_blocks(struct inode *inode) +{ + int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; + + /* + * With non-extent format the journal credit needed to + * insert nrblocks contiguous block is dependent on + * number of contiguous block. So we will limit + * number of contiguous block to a sane value + */ + if (!(inode->i_flags & EXT4_EXTENTS_FL) && + (max_blocks > EXT4_MAX_TRANS_DATA)) + max_blocks = EXT4_MAX_TRANS_DATA; + + return ext4_chunk_trans_blocks(inode, max_blocks); +} + +static int ext4_da_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + pgoff_t index; + int range_whole = 0; + handle_t *handle = NULL; + struct mpage_da_data mpd; + struct inode *inode = mapping->host; + int no_nrwrite_index_update; + long pages_written = 0, pages_skipped; + int range_cyclic, cycled = 1, io_done = 0; + int needed_blocks, ret = 0, nr_to_writebump = 0; + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + + /* + * No pages to write? This is mainly a kludge to avoid starting + * a transaction for special inodes like journal inode on last iput() + * because that could violate lock ordering on umount + */ + if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + return 0; + + /* + * If the filesystem has aborted, it is read-only, so return + * right away instead of dumping stack traces later on that + * will obscure the real source of the problem. We test + * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because + * the latter could be true if the filesystem is mounted + * read-only, and in that case, ext4_da_writepages should + * *never* be called, so if that ever happens, we would want + * the stack trace. + */ + if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT)) + return -EROFS; + + /* + * Make sure nr_to_write is >= sbi->s_mb_stream_request + * This make sure small files blocks are allocated in + * single attempt. This ensure that small files + * get less fragmented. + */ + if (wbc->nr_to_write < sbi->s_mb_stream_request) { + nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; + wbc->nr_to_write = sbi->s_mb_stream_request; + } + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + + range_cyclic = wbc->range_cyclic; + if (wbc->range_cyclic) { + index = mapping->writeback_index; + if (index) + cycled = 0; + wbc->range_start = index << PAGE_CACHE_SHIFT; + wbc->range_end = LLONG_MAX; + wbc->range_cyclic = 0; + } else + index = wbc->range_start >> PAGE_CACHE_SHIFT; + + mpd.wbc = wbc; + mpd.inode = mapping->host; + + /* + * we don't want write_cache_pages to update + * nr_to_write and writeback_index + */ + no_nrwrite_index_update = wbc->no_nrwrite_index_update; + wbc->no_nrwrite_index_update = 1; + pages_skipped = wbc->pages_skipped; + +retry: + while (!ret && wbc->nr_to_write > 0) { + + /* + * we insert one extent at a time. So we need + * credit needed for single extent allocation. + * journalled mode is currently not supported + * by delalloc + */ + BUG_ON(ext4_should_journal_data(inode)); + needed_blocks = ext4_da_writepages_trans_blocks(inode); + + /* start a new transaction*/ + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + printk(KERN_CRIT "%s: jbd2_start: " + "%ld pages, ino %lu; err %d\n", __func__, + wbc->nr_to_write, inode->i_ino, ret); + dump_stack(); + goto out_writepages; + } + mpd.get_block = ext4_da_get_block_write; + ret = mpage_da_writepages(mapping, wbc, &mpd); + + ext4_journal_stop(handle); + + if (mpd.retval == -ENOSPC) { + /* commit the transaction which would + * free blocks released in the transaction + * and try again + */ + jbd2_journal_force_commit_nested(sbi->s_journal); + wbc->pages_skipped = pages_skipped; + ret = 0; + } else if (ret == MPAGE_DA_EXTENT_TAIL) { + /* + * got one extent now try with + * rest of the pages + */ + pages_written += mpd.pages_written; + wbc->pages_skipped = pages_skipped; + ret = 0; + io_done = 1; + } else if (wbc->nr_to_write) + /* + * There is no more writeout needed + * or we requested for a noblocking writeout + * and we found the device congested + */ + break; + } + if (!io_done && !cycled) { + cycled = 1; + index = 0; + wbc->range_start = index << PAGE_CACHE_SHIFT; + wbc->range_end = mapping->writeback_index - 1; + goto retry; + } + if (pages_skipped != wbc->pages_skipped) + printk(KERN_EMERG "This should not happen leaving %s " + "with nr_to_write = %ld ret = %d\n", + __func__, wbc->nr_to_write, ret); + + /* Update index */ + index += pages_written; + wbc->range_cyclic = range_cyclic; + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + /* + * set the writeback_index so that range_cyclic + * mode will write it back later + */ + mapping->writeback_index = index; + +out_writepages: + if (!no_nrwrite_index_update) + wbc->no_nrwrite_index_update = 0; + wbc->nr_to_write -= nr_to_writebump; + return ret; +} + +#define FALL_BACK_TO_NONDELALLOC 1 +static int ext4_nonda_switch(struct super_block *sb) +{ + s64 free_blocks, dirty_blocks; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* + * switch to non delalloc mode if we are running low + * on free block. The free block accounting via percpu + * counters can get slightly wrong with FBC_BATCH getting + * accumulated on each CPU without updating global counters + * Delalloc need an accurate free block accounting. So switch + * to non delalloc when we are near to error range. + */ + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); + if (2 * free_blocks < 3 * dirty_blocks || + free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { + /* + * free block count is less that 150% of dirty blocks + * or free blocks is less that watermark + */ + return 1; + } + return 0; +} + +static int ext4_da_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret, retries = 0; + struct page *page; + pgoff_t index; + unsigned from, to; + struct inode *inode = mapping->host; + handle_t *handle; + + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + + if (ext4_nonda_switch(inode->i_sb)) { + *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; + return ext4_write_begin(file, mapping, pos, + len, flags, pagep, fsdata); + } + *fsdata = (void *)0; +retry: + /* + * With delayed allocation, we don't log the i_disksize update + * if there is delayed block allocation. But we still need + * to journalling the i_disksize update if writes to the end + * of file which has an already mapped buffer. + */ + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + /* We cannot recurse into the filesystem as the transaction is already + * started */ + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; + } + *pagep = page; + + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + ext4_da_get_block_prep); + if (ret < 0) { + unlock_page(page); + ext4_journal_stop(handle); + page_cache_release(page); + /* + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); + } + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; +out: + return ret; +} + +/* + * Check if we should update i_disksize + * when write to the end of file but not require block allocation + */ +static int ext4_da_should_update_i_disksize(struct page *page, + unsigned long offset) +{ + struct buffer_head *bh; + struct inode *inode = page->mapping->host; + unsigned int idx; + int i; + + bh = page_buffers(page); + idx = offset >> inode->i_blkbits; + + for (i = 0; i < idx; i++) + bh = bh->b_this_page; + + if (!buffer_mapped(bh) || (buffer_delay(bh))) + return 0; + return 1; +} + +static int ext4_da_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + int ret = 0, ret2; + handle_t *handle = ext4_journal_current_handle(); + loff_t new_i_size; + unsigned long start, end; + int write_mode = (int)(unsigned long)fsdata; + + if (write_mode == FALL_BACK_TO_NONDELALLOC) { + if (ext4_should_order_data(inode)) { + return ext4_ordered_write_end(file, mapping, pos, + len, copied, page, fsdata); + } else if (ext4_should_writeback_data(inode)) { + return ext4_writeback_write_end(file, mapping, pos, + len, copied, page, fsdata); + } else { + BUG(); + } + } + + start = pos & (PAGE_CACHE_SIZE - 1); + end = start + copied - 1; + + /* + * generic_write_end() will run mark_inode_dirty() if i_size + * changes. So let's piggyback the i_disksize mark_inode_dirty + * into that. + */ + + new_i_size = pos + copied; + if (new_i_size > EXT4_I(inode)->i_disksize) { + if (ext4_da_should_update_i_disksize(page, end)) { + down_write(&EXT4_I(inode)->i_data_sem); + if (new_i_size > EXT4_I(inode)->i_disksize) { + /* + * Updating i_disksize when extending file + * without needing block allocation + */ + if (ext4_should_order_data(inode)) + ret = ext4_jbd2_file_inode(handle, + inode); + + EXT4_I(inode)->i_disksize = new_i_size; + } + up_write(&EXT4_I(inode)->i_data_sem); + /* We need to mark inode dirty even if + * new_i_size is less that inode->i_size + * bu greater than i_disksize.(hint delalloc) + */ + ext4_mark_inode_dirty(handle, inode); + } + } + ret2 = generic_write_end(file, mapping, pos, len, copied, + page, fsdata); + copied = ret2; + if (ret2 < 0) + ret = ret2; + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + return ret ? ret : copied; +} + +static void ext4_da_invalidatepage(struct page *page, unsigned long offset) +{ + /* + * Drop reserved blocks + */ + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + goto out; + + ext4_da_page_release_reservation(page, offset); + +out: + ext4_invalidatepage(page, offset); + + return; +} + + +/* + * bmap() is special. It gets used by applications such as lilo and by + * the swapper to find the on-disk block of a specific piece of data. + * + * Naturally, this is dangerous if the block concerned is still in the + * journal. If somebody makes a swapfile on an ext4 data-journaling + * filesystem and enables swap, then they may get a nasty shock when the + * data getting swapped to that swapfile suddenly gets overwritten by + * the original zero's written out previously to the journal and + * awaiting writeback in the kernel's buffer cache. + * + * So, if we see any bmap calls here on a modified, data-journaled file, + * take extra steps to flush any blocks which might be in the cache. + */ +static sector_t ext4_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + journal_t *journal; + int err; + + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && + test_opt(inode->i_sb, DELALLOC)) { + /* + * With delalloc we want to sync the file + * so that we can make sure we allocate + * blocks for file + */ + filemap_write_and_wait(mapping); + } + + if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { + /* + * This is a REALLY heavyweight approach, but the use of + * bmap on dirty files is expected to be extremely rare: + * only if we run lilo or swapon on a freshly made file + * do we expect this to happen. + * + * (bmap requires CAP_SYS_RAWIO so this does not + * represent an unprivileged user DOS attack --- we'd be + * in trouble if mortal users could trigger this path at + * will.) + * + * NB. EXT4_STATE_JDATA is not set on files other than + * regular files. If somebody wants to bmap a directory + * or symlink and gets confused because the buffer + * hasn't yet been flushed to disk, they deserve + * everything they get. + */ + + EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; + journal = EXT4_JOURNAL(inode); + jbd2_journal_lock_updates(journal); + err = jbd2_journal_flush(journal); + jbd2_journal_unlock_updates(journal); + + if (err) + return 0; + } + + return generic_block_bmap(mapping, block, ext4_get_block); +} + +static int bget_one(handle_t *handle, struct buffer_head *bh) +{ + get_bh(bh); + return 0; +} + +static int bput_one(handle_t *handle, struct buffer_head *bh) +{ + put_bh(bh); + return 0; +} + +/* + * Note that we don't need to start a transaction unless we're journaling data + * because we should have holes filled from ext4_page_mkwrite(). We even don't + * need to file the inode to the transaction's list in ordered mode because if + * we are writing back data added by write(), the inode is already there and if + * we are writing back data modified via mmap(), noone guarantees in which + * transaction the data will hit the disk. In case we are journaling data, we + * cannot start transaction directly because transaction start ranks above page + * lock so we have to do some magic. + * + * In all journaling modes block_write_full_page() will start the I/O. + * + * Problem: + * + * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> + * ext4_writepage() + * + * Similar for: + * + * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... + * + * Same applies to ext4_get_block(). We will deadlock on various things like + * lock_journal and i_data_sem + * + * Setting PF_MEMALLOC here doesn't work - too many internal memory + * allocations fail. + * + * 16May01: If we're reentered then journal_current_handle() will be + * non-zero. We simply *return*. + * + * 1 July 2001: @@@ FIXME: + * In journalled data mode, a data buffer may be metadata against the + * current transaction. But the same file is part of a shared mapping + * and someone does a writepage() on it. + * + * We will move the buffer onto the async_data list, but *after* it has + * been dirtied. So there's a small window where we have dirty data on + * BJ_Metadata. + * + * Note that this only applies to the last partial page in the file. The + * bit which block_write_full_page() uses prepare/commit for. (That's + * broken code anyway: it's wrong for msync()). + * + * It's a rare case: affects the final partial page, for journalled data + * where the file is subject to bith write() and writepage() in the same + * transction. To fix it we'll need a custom block_write_full_page(). + * We'll probably need that anyway for journalling writepage() output. + * + * We don't honour synchronous mounts for writepage(). That would be + * disastrous. Any write() or metadata operation will sync the fs for + * us. + * + */ +static int __ext4_normal_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + + if (test_opt(inode->i_sb, NOBH)) + return nobh_writepage(page, + ext4_normal_get_block_write, wbc); + else + return block_write_full_page(page, + ext4_normal_get_block_write, + wbc); +} + +static int ext4_normal_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + loff_t size = i_size_read(inode); + loff_t len; + + J_ASSERT(PageLocked(page)); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (page_has_buffers(page)) { + /* if page has buffers it should all be mapped + * and allocated. If there are not buffers attached + * to the page we know the page is dirty but it lost + * buffers. That means that at some moment in time + * after write_begin() / write_end() has been called + * all buffers have been clean and thus they must have been + * written at least once. So they are all mapped and we can + * happily proceed with mapping them and writing the page. + */ + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped_or_delay)); + } + + if (!ext4_journal_current_handle()) + return __ext4_normal_writepage(page, wbc); + + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; +} + +static int __ext4_journalled_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct buffer_head *page_bufs; + handle_t *handle = NULL; + int ret = 0; + int err; + + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, + ext4_normal_get_block_write); + if (ret != 0) + goto out_unlock; + + page_bufs = page_buffers(page); + walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, + bget_one); + /* As soon as we unlock the page, it can go away, but we have + * references to buffers so we are safe */ + unlock_page(page); + + handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + ret = walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); + + err = walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, write_end_fn); + if (ret == 0) + ret = err; + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + + walk_page_buffers(handle, page_bufs, 0, + PAGE_CACHE_SIZE, NULL, bput_one); + EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; + goto out; + +out_unlock: + unlock_page(page); +out: + return ret; +} + +static int ext4_journalled_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + loff_t size = i_size_read(inode); + loff_t len; + + J_ASSERT(PageLocked(page)); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (page_has_buffers(page)) { + /* if page has buffers it should all be mapped + * and allocated. If there are not buffers attached + * to the page we know the page is dirty but it lost + * buffers. That means that at some moment in time + * after write_begin() / write_end() has been called + * all buffers have been clean and thus they must have been + * written at least once. So they are all mapped and we can + * happily proceed with mapping them and writing the page. + */ + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped_or_delay)); + } + + if (ext4_journal_current_handle()) + goto no_write; + + if (PageChecked(page)) { + /* + * It's mmapped pagecache. Add buffers and journal it. There + * doesn't seem much point in redirtying the page here. + */ + ClearPageChecked(page); + return __ext4_journalled_writepage(page, wbc); + } else { + /* + * It may be a page full of checkpoint-mode buffers. We don't + * really know unless we go poke around in the buffer_heads. + * But block_write_full_page will do the right thing. + */ + return block_write_full_page(page, + ext4_normal_get_block_write, + wbc); + } +no_write: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; +} + +static int ext4_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, ext4_get_block); +} + +static int +ext4_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); +} + +static void ext4_invalidatepage(struct page *page, unsigned long offset) +{ + journal_t *journal = EXT4_JOURNAL(page->mapping->host); + + /* + * If it's a full truncate we just forget about the pending dirtying + */ + if (offset == 0) + ClearPageChecked(page); + + jbd2_journal_invalidatepage(journal, page, offset); +} + +static int ext4_releasepage(struct page *page, gfp_t wait) +{ + journal_t *journal = EXT4_JOURNAL(page->mapping->host); + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + return jbd2_journal_try_to_free_buffers(journal, page, wait); +} + +/* + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + * If the O_DIRECT write is intantiating holes inside i_size and the machine + * crashes then stale disk data _may_ be exposed inside the file. But current + * VFS code falls back into buffered path in that case so we are safe. + */ +static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); + handle_t *handle; + ssize_t ret; + int orphan = 0; + size_t count = iov_length(iov, nr_segs); + + if (rw == WRITE) { + loff_t final_size = offset + count; + + if (final_size > inode->i_size) { + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + orphan = 1; + ei->i_disksize = inode->i_size; + ext4_journal_stop(handle); + } + } + + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block, NULL); + + if (orphan) { + int err; + + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) { + /* This is really bad luck. We've written the data + * but cannot extend i_size. Bail out and pretend + * the write failed... */ + ret = PTR_ERR(handle); + goto out; + } + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + if (ret > 0) { + loff_t end = offset + ret; + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext4_mark_inode_dirty() to userspace. So + * ignore it. + */ + ext4_mark_inode_dirty(handle, inode); + } + } + err = ext4_journal_stop(handle); + if (ret == 0) + ret = err; + } +out: + return ret; +} + +/* + * Pages can be marked dirty completely asynchronously from ext4's journalling + * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do + * much here because ->set_page_dirty is called under VFS locks. The page is + * not necessarily locked. + * + * We cannot just dirty the page and leave attached buffers clean, because the + * buffers' dirty state is "definitive". We cannot just set the buffers dirty + * or jbddirty because all the journalling code will explode. + * + * So what we do is to mark the page "pending dirty" and next time writepage + * is called, propagate that into the buffers appropriately. + */ +static int ext4_journalled_set_page_dirty(struct page *page) +{ + SetPageChecked(page); + return __set_page_dirty_nobuffers(page); +} + +static const struct address_space_operations ext4_ordered_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_normal_writepage, + .sync_page = block_sync_page, + .write_begin = ext4_write_begin, + .write_end = ext4_ordered_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, +}; + +static const struct address_space_operations ext4_writeback_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_normal_writepage, + .sync_page = block_sync_page, + .write_begin = ext4_write_begin, + .write_end = ext4_writeback_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, +}; + +static const struct address_space_operations ext4_journalled_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_journalled_writepage, + .sync_page = block_sync_page, + .write_begin = ext4_write_begin, + .write_end = ext4_journalled_write_end, + .set_page_dirty = ext4_journalled_set_page_dirty, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .is_partially_uptodate = block_is_partially_uptodate, +}; + +static const struct address_space_operations ext4_da_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_da_writepage, + .writepages = ext4_da_writepages, + .sync_page = block_sync_page, + .write_begin = ext4_da_write_begin, + .write_end = ext4_da_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_da_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, +}; + +void ext4_set_aops(struct inode *inode) +{ + if (ext4_should_order_data(inode) && + test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else if (ext4_should_order_data(inode)) + inode->i_mapping->a_ops = &ext4_ordered_aops; + else if (ext4_should_writeback_data(inode) && + test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else if (ext4_should_writeback_data(inode)) + inode->i_mapping->a_ops = &ext4_writeback_aops; + else + inode->i_mapping->a_ops = &ext4_journalled_aops; +} + +/* + * ext4_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from) +{ + ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, length, pos; + ext4_lblk_t iblock; + struct inode *inode = mapping->host; + struct buffer_head *bh; + struct page *page; + int err = 0; + + page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); + if (!page) + return -EINVAL; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + /* + * For "nobh" option, we can only work if we don't need to + * read-in the page - otherwise we create buffers to do the IO. + */ + if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && + ext4_should_writeback_data(inode) && PageUptodate(page)) { + zero_user(page, offset, length); + set_page_dirty(page); + goto unlock; + } + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (buffer_freed(bh)) { + BUFFER_TRACE(bh, "freed: skip"); + goto unlock; + } + + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "unmapped"); + ext4_get_block(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "still unmapped"); + goto unlock; + } + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + + if (ext4_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto unlock; + } + + zero_user(page, offset, length); + + BUFFER_TRACE(bh, "zeroed end of block"); + + err = 0; + if (ext4_should_journal_data(inode)) { + err = ext4_journal_dirty_metadata(handle, bh); + } else { + if (ext4_should_order_data(inode)) + err = ext4_jbd2_file_inode(handle, inode); + mark_buffer_dirty(bh); + } + +unlock: + unlock_page(page); + page_cache_release(page); + return err; +} + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(__le32 *p, __le32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext4_find_shared - find the indirect blocks for partial truncation. + * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext4_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext4_truncate(). + * + * When we do truncate() we may have to clean the ends of several + * indirect blocks but leave the blocks themselves alive. Block is + * partially truncated if some data below the new i_size is refered + * from it (and it is on the path to the first completely truncated + * data block, indeed). We have to free the top of that path along + * with everything to the right of the path. Since no allocation + * past the truncation point is possible until ext4_truncate() + * finishes, we may safely do the latter, but top of branch may + * require special attention - pageout below the truncation point + * might try to populate it. + * + * We atomically detach the top of branch from the tree, store the + * block number of its root in *@top, pointers to buffer_heads of + * partially truncated blocks - in @chain[].bh and pointers to + * their last elements that should not be removed - in + * @chain[].p. Return value is the pointer to last filled element + * of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0]. + * (no partially truncated stuff there). */ + +static Indirect *ext4_find_shared(struct inode *inode, int depth, + ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + /* Make k index the deepest non-null offest + 1 */ + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext4_get_branch(inode, k, offsets, chain, &err); + /* Writer: pointers */ + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + if (!partial->key && *partial->p) + /* Writer: end */ + goto no_top; + for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + /* Nope, don't do this in ext4. Must leave the tree intact */ +#if 0 + *p->p = 0; +#endif + } + /* Writer: end */ + + while (partial > p) { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/* + * Zero a number of block pointers in either an inode or an indirect block. + * If we restart the transaction we must again get write access to the + * indirect block for further modification. + * + * We release `count' blocks on disk, but (last - first) may be greater + * than `count' because there can be holes in there. + */ +static void ext4_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block_to_free, + unsigned long count, __le32 *first, __le32 *last) +{ + __le32 *p; + if (try_to_extend_transaction(handle, inode)) { + if (bh) { + BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); + ext4_journal_dirty_metadata(handle, bh); + } + ext4_mark_inode_dirty(handle, inode); + ext4_journal_test_restart(handle, inode); + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + ext4_journal_get_write_access(handle, bh); + } + } + + /* + * Any buffers which are on the journal will be in memory. We find + * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget() + * on them. We've already detached each block from the file, so + * bforget() in jbd2_journal_forget() should be safe. + * + * AKPM: turn on bforget in jbd2_journal_forget()!!! + */ + for (p = first; p < last; p++) { + u32 nr = le32_to_cpu(*p); + if (nr) { + struct buffer_head *tbh; + + *p = 0; + tbh = sb_find_get_block(inode->i_sb, nr); + ext4_forget(handle, 0, inode, tbh, nr); + } + } + + ext4_free_blocks(handle, inode, block_to_free, count, 0); +} + +/** + * ext4_free_data - free a list of data blocks + * @handle: handle for this transaction + * @inode: inode we are dealing with + * @this_bh: indirect buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: points immediately past the end of array + * + * We are freeing all blocks refered from that array (numbers are stored as + * little-endian 32-bit) and updating @inode->i_blocks appropriately. + * + * We accumulate contiguous runs of blocks to free. Conveniently, if these + * blocks are contiguous then releasing them at one time will only affect one + * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't + * actually use a lot of journal space. + * + * @this_bh will be %NULL if @first and @last point into the inode's direct + * block pointers. + */ +static void ext4_free_data(handle_t *handle, struct inode *inode, + struct buffer_head *this_bh, + __le32 *first, __le32 *last) +{ + ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ + unsigned long count = 0; /* Number of blocks in the run */ + __le32 *block_to_free_p = NULL; /* Pointer into inode/ind + corresponding to + block_to_free */ + ext4_fsblk_t nr; /* Current block # */ + __le32 *p; /* Pointer into inode/ind + for current block */ + int err; + + if (this_bh) { /* For indirect block */ + BUFFER_TRACE(this_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, this_bh); + /* Important: if we can't update the indirect pointers + * to the blocks, we can't free them. */ + if (err) + return; + } + + for (p = first; p < last; p++) { + nr = le32_to_cpu(*p); + if (nr) { + /* accumulate blocks to free if they're contiguous */ + if (count == 0) { + block_to_free = nr; + block_to_free_p = p; + count = 1; + } else if (nr == block_to_free + count) { + count++; + } else { + ext4_clear_blocks(handle, inode, this_bh, + block_to_free, + count, block_to_free_p, p); + block_to_free = nr; + block_to_free_p = p; + count = 1; + } + } + } + + if (count > 0) + ext4_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + + if (this_bh) { + BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if (bh2jh(this_bh)) + ext4_journal_dirty_metadata(handle, this_bh); + else + ext4_error(inode->i_sb, __func__, + "circular indirect block detected, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long) this_bh->b_blocknr); + } +} + +/** + * ext4_free_branches - free an array of branches + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @parent_bh: the buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks refered from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static void ext4_free_branches(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, + __le32 *first, __le32 *last, int depth) +{ + ext4_fsblk_t nr; + __le32 *p; + + if (is_handle_aborted(handle)) + return; + + if (depth--) { + struct buffer_head *bh; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + p = last; + while (--p >= first) { + nr = le32_to_cpu(*p); + if (!nr) + continue; /* A hole */ + + /* Go read the buffer for the next level down */ + bh = sb_bread(inode->i_sb, nr); + + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (!bh) { + ext4_error(inode->i_sb, "ext4_free_branches", + "Read failure, inode=%lu, block=%llu", + inode->i_ino, nr); + continue; + } + + /* This zaps the entire block. Bottom up. */ + BUFFER_TRACE(bh, "free child branches"); + ext4_free_branches(handle, inode, bh, + (__le32 *) bh->b_data, + (__le32 *) bh->b_data + addr_per_block, + depth); + + /* + * We've probably journalled the indirect block several + * times during the truncate. But it's no longer + * needed and we now drop it from the transaction via + * jbd2_journal_revoke(). + * + * That's easy if it's exclusively part of this + * transaction. But if it's part of the committing + * transaction then jbd2_journal_forget() will simply + * brelse() it. That means that if the underlying + * block is reallocated in ext4_get_block(), + * unmap_underlying_metadata() will find this block + * and will try to get rid of it. damn, damn. + * + * If this block has already been committed to the + * journal, a revoke record will be written. And + * revoke records must be emitted *before* clearing + * this block's bit in the bitmaps. + */ + ext4_forget(handle, 1, inode, bh, bh->b_blocknr); + + /* + * Everything below this this pointer has been + * released. Now let this top-of-subtree go. + * + * We want the freeing of this indirect block to be + * atomic in the journal with the updating of the + * bitmap block which owns it. So make some room in + * the journal. + * + * We zero the parent pointer *after* freeing its + * pointee in the bitmaps, so if extend_transaction() + * for some reason fails to put the bitmap changes and + * the release into the same transaction, recovery + * will merely complain about releasing a free block, + * rather than leaking blocks. + */ + if (is_handle_aborted(handle)) + return; + if (try_to_extend_transaction(handle, inode)) { + ext4_mark_inode_dirty(handle, inode); + ext4_journal_test_restart(handle, inode); + } + + ext4_free_blocks(handle, inode, nr, 1, 1); + + if (parent_bh) { + /* + * The block which we have just freed is + * pointed to by an indirect block: journal it + */ + BUFFER_TRACE(parent_bh, "get_write_access"); + if (!ext4_journal_get_write_access(handle, + parent_bh)){ + *p = 0; + BUFFER_TRACE(parent_bh, + "call ext4_journal_dirty_metadata"); + ext4_journal_dirty_metadata(handle, + parent_bh); + } + } + } + } else { + /* We have reached the bottom of the tree. */ + BUFFER_TRACE(parent_bh, "free data blocks"); + ext4_free_data(handle, inode, parent_bh, first, last); + } +} + +int ext4_can_truncate(struct inode *inode) +{ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return 0; + if (S_ISREG(inode->i_mode)) + return 1; + if (S_ISDIR(inode->i_mode)) + return 1; + if (S_ISLNK(inode->i_mode)) + return !ext4_inode_is_fast_symlink(inode); + return 0; +} + +/* + * ext4_truncate() + * + * We block out ext4_get_block() block instantiations across the entire + * transaction, and VFS/VM ensures that ext4_truncate() cannot run + * simultaneously on behalf of the same inode. + * + * As we work through the truncate and commmit bits of it to the journal there + * is one core, guiding principle: the file's tree must always be consistent on + * disk. We must be able to restart the truncate after a crash. + * + * The file's tree may be transiently inconsistent in memory (although it + * probably isn't), but whenever we close off and commit a journal transaction, + * the contents of (the filesystem + the journal) must be consistent and + * restartable. It's pretty simple, really: bottom up, right to left (although + * left-to-right works OK too). + * + * Note that at recovery time, journal replay occurs *before* the restart of + * truncate against the orphan inode list. + * + * The committed inode has the new, desired i_size (which is the same as + * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see + * that this inode's truncate did not complete and it will again call + * ext4_truncate() to have another go. So there will be instantiated blocks + * to the right of the truncation point in a crashed ext4 filesystem. But + * that's fine - as long as they are linked from the inode, the post-crash + * ext4_truncate() run will find them and release them. + */ +void ext4_truncate(struct inode *inode) +{ + handle_t *handle; + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *i_data = ei->i_data; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + __le32 nr = 0; + int n; + ext4_lblk_t last_block; + unsigned blocksize = inode->i_sb->s_blocksize; + + if (!ext4_can_truncate(inode)) + return; + + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { + ext4_ext_truncate(inode); + return; + } + + handle = start_transaction(inode); + if (IS_ERR(handle)) + return; /* AKPM: return what? */ + + last_block = (inode->i_size + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + + if (inode->i_size & (blocksize - 1)) + if (ext4_block_truncate_page(handle, mapping, inode->i_size)) + goto out_stop; + + n = ext4_block_to_path(inode, last_block, offsets, NULL); + if (n == 0) + goto out_stop; /* error */ + + /* + * OK. This truncate is going to happen. We add the inode to the + * orphan list, so that if this truncate spans multiple transactions, + * and we crash, we will resume the truncate when the filesystem + * recovers. It also marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext4_orphan_add(handle, inode)) + goto out_stop; + + /* + * From here we block out all ext4_get_block() callers who want to + * modify the block allocation tree. + */ + down_write(&ei->i_data_sem); + + ext4_discard_preallocations(inode); + + /* + * The orphan list entry will now protect us from any crash which + * occurs before the truncate completes, so it is now safe to propagate + * the new, shorter inode size (held for now in i_size) into the + * on-disk inode. We do this via i_disksize, which is the value which + * ext4 *really* writes onto the disk inode. + */ + ei->i_disksize = inode->i_size; + + if (n == 1) { /* direct blocks */ + ext4_free_data(handle, inode, NULL, i_data+offsets[0], + i_data + EXT4_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (not detached) */ + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + /* + * We mark the inode dirty prior to restart, + * and prior to stop. No need for it here. + */ + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext4_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext4_free_branches(handle, inode, partial->bh, partial->p + 1, + (__le32*)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse (partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT4_IND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT4_IND_BLOCK] = 0; + } + case EXT4_IND_BLOCK: + nr = i_data[EXT4_DIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT4_DIND_BLOCK] = 0; + } + case EXT4_DIND_BLOCK: + nr = i_data[EXT4_TIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT4_TIND_BLOCK] = 0; + } + case EXT4_TIND_BLOCK: + ; + } + + up_write(&ei->i_data_sem); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + + /* + * In a multi-transaction truncate, we only make the final transaction + * synchronous + */ + if (IS_SYNC(inode)) + handle->h_sync = 1; +out_stop: + /* + * If this was a simple ftruncate(), and the file will remain alive + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext4_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + ext4_journal_stop(handle); +} + +/* + * ext4_get_inode_loc returns with an extra refcount against the inode's + * underlying buffer_head on success. If 'in_mem' is true, we have all + * data in memory that is needed to recreate the on-disk version of this + * inode. + */ +static int __ext4_get_inode_loc(struct inode *inode, + struct ext4_iloc *iloc, int in_mem) +{ + struct ext4_group_desc *gdp; + struct buffer_head *bh; + struct super_block *sb = inode->i_sb; + ext4_fsblk_t block; + int inodes_per_block, inode_offset; + + iloc->bh = 0; + if (!ext4_valid_inum(sb, inode->i_ino)) + return -EIO; + + iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); + gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); + if (!gdp) + return -EIO; + + /* + * Figure out the offset within the block group inode table + */ + inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); + inode_offset = ((inode->i_ino - 1) % + EXT4_INODES_PER_GROUP(sb)); + block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); + iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); + + bh = sb_getblk(sb, block); + if (!bh) { + ext4_error(sb, "ext4_get_inode_loc", "unable to read " + "inode block - inode=%lu, block=%llu", + inode->i_ino, block); + return -EIO; + } + if (!buffer_uptodate(bh)) { + lock_buffer(bh); + + /* + * If the buffer has the write error flag, we have failed + * to write out another inode in the same block. In this + * case, we don't have to read the block because we may + * read the old inode data successfully. + */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) + set_buffer_uptodate(bh); + + if (buffer_uptodate(bh)) { + /* someone brought it uptodate while we waited */ + unlock_buffer(bh); + goto has_buffer; + } + + /* + * If we have all information of the inode in memory and this + * is the only valid inode in the block, we need not read the + * block. + */ + if (in_mem) { + struct buffer_head *bitmap_bh; + int i, start; + + start = inode_offset & ~(inodes_per_block - 1); + + /* Is the inode bitmap in cache? */ + bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); + if (!bitmap_bh) + goto make_io; + + /* + * If the inode bitmap isn't in cache then the + * optimisation may end up performing two reads instead + * of one, so skip it. + */ + if (!buffer_uptodate(bitmap_bh)) { + brelse(bitmap_bh); + goto make_io; + } + for (i = start; i < start + inodes_per_block; i++) { + if (i == inode_offset) + continue; + if (ext4_test_bit(i, bitmap_bh->b_data)) + break; + } + brelse(bitmap_bh); + if (i == start + inodes_per_block) { + /* all other inodes are free, so skip I/O */ + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + unlock_buffer(bh); + goto has_buffer; + } + } + +make_io: + /* + * If we need to do any I/O, try to pre-readahead extra + * blocks from the inode table. + */ + if (EXT4_SB(sb)->s_inode_readahead_blks) { + ext4_fsblk_t b, end, table; + unsigned num; + + table = ext4_inode_table(sb, gdp); + /* Make sure s_inode_readahead_blks is a power of 2 */ + while (EXT4_SB(sb)->s_inode_readahead_blks & + (EXT4_SB(sb)->s_inode_readahead_blks-1)) + EXT4_SB(sb)->s_inode_readahead_blks = + (EXT4_SB(sb)->s_inode_readahead_blks & + (EXT4_SB(sb)->s_inode_readahead_blks-1)); + b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); + if (table > b) + b = table; + end = b + EXT4_SB(sb)->s_inode_readahead_blks; + num = EXT4_INODES_PER_GROUP(sb); + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + num -= le16_to_cpu(gdp->bg_itable_unused); + table += num / inodes_per_block; + if (end > table) + end = table; + while (b <= end) + sb_breadahead(sb, b++); + } + + /* + * There are other valid inodes in the buffer, this inode + * has in-inode xattrs, or we don't have this inode in memory. + * Read the block from disk. + */ + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ_META, bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + ext4_error(sb, __func__, + "unable to read inode block - inode=%lu, " + "block=%llu", inode->i_ino, block); + brelse(bh); + return -EIO; + } + } +has_buffer: + iloc->bh = bh; + return 0; +} + +int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) +{ + /* We have all inode data except xattrs in memory here. */ + return __ext4_get_inode_loc(inode, iloc, + !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); +} + +void ext4_set_inode_flags(struct inode *inode) +{ + unsigned int flags = EXT4_I(inode)->i_flags; + + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + if (flags & EXT4_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & EXT4_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & EXT4_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & EXT4_NOATIME_FL) + inode->i_flags |= S_NOATIME; + if (flags & EXT4_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +} + +/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ +void ext4_get_inode_flags(struct ext4_inode_info *ei) +{ + unsigned int flags = ei->vfs_inode.i_flags; + + ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL| + EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); + if (flags & S_SYNC) + ei->i_flags |= EXT4_SYNC_FL; + if (flags & S_APPEND) + ei->i_flags |= EXT4_APPEND_FL; + if (flags & S_IMMUTABLE) + ei->i_flags |= EXT4_IMMUTABLE_FL; + if (flags & S_NOATIME) + ei->i_flags |= EXT4_NOATIME_FL; + if (flags & S_DIRSYNC) + ei->i_flags |= EXT4_DIRSYNC_FL; +} +static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +{ + blkcnt_t i_blocks ; + struct inode *inode = &(ei->vfs_inode); + struct super_block *sb = inode->i_sb; + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + /* we are using combined 48 bit field */ + i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | + le32_to_cpu(raw_inode->i_blocks_lo); + if (ei->i_flags & EXT4_HUGE_FILE_FL) { + /* i_blocks represent file system block size */ + return i_blocks << (inode->i_blkbits - 9); + } else { + return i_blocks; + } + } else { + return le32_to_cpu(raw_inode->i_blocks_lo); + } +} + +struct inode *ext4_iget(struct super_block *sb, unsigned long ino) +{ + struct ext4_iloc iloc; + struct ext4_inode *raw_inode; + struct ext4_inode_info *ei; + struct buffer_head *bh; + struct inode *inode; + long ret; + int block; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ei = EXT4_I(inode); +#ifdef CONFIG_EXT4_FS_POSIX_ACL + ei->i_acl = EXT4_ACL_NOT_CACHED; + ei->i_default_acl = EXT4_ACL_NOT_CACHED; +#endif + + ret = __ext4_get_inode_loc(inode, &iloc, 0); + if (ret < 0) + goto bad_inode; + bh = iloc.bh; + raw_inode = ext4_raw_inode(&iloc); + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if (!(test_opt(inode->i_sb, NO_UID32))) { + inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } + inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + + ei->i_state = 0; + ei->i_dir_start_lookup = 0; + ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); + /* We now have enough fields to check if the inode was active or not. + * This is needed because nfsd might try to access dead inodes + * the test is that same one that e2fsck uses + * NeilBrown 1999oct15 + */ + if (inode->i_nlink == 0) { + if (inode->i_mode == 0 || + !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { + /* this inode is deleted */ + brelse(bh); + ret = -ESTALE; + goto bad_inode; + } + /* The only unlinked inodes we let through here have + * valid i_mode and are being read by the orphan + * recovery code: that's fine, we're about to complete + * the process of deleting those. */ + } + ei->i_flags = le32_to_cpu(raw_inode->i_flags); + inode->i_blocks = ext4_inode_blocks(raw_inode, ei); + ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); + if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != + cpu_to_le32(EXT4_OS_HURD)) { + ei->i_file_acl |= + ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; + } + inode->i_size = ext4_isize(raw_inode); + ei->i_disksize = inode->i_size; + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + ei->i_block_group = iloc.block_group; + /* + * NOTE! The in-memory inode i_data array is in little-endian order + * even on big-endian machines: we do NOT byteswap the block numbers! + */ + for (block = 0; block < EXT4_N_BLOCKS; block++) + ei->i_data[block] = raw_inode->i_block[block]; + INIT_LIST_HEAD(&ei->i_orphan); + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > + EXT4_INODE_SIZE(inode->i_sb)) { + brelse(bh); + ret = -EIO; + goto bad_inode; + } + if (ei->i_extra_isize == 0) { + /* The extra space is currently unused. Use it. */ + ei->i_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + } else { + __le32 *magic = (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + + ei->i_extra_isize; + if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) + ei->i_state |= EXT4_STATE_XATTR; + } + } else + ei->i_extra_isize = 0; + + EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); + EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); + EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); + + inode->i_version = le32_to_cpu(raw_inode->i_disk_version); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + inode->i_version |= + (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + } + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext4_file_inode_operations; + inode->i_fop = &ext4_file_operations; + ext4_set_aops(inode); + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext4_dir_inode_operations; + inode->i_fop = &ext4_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + if (ext4_inode_is_fast_symlink(inode)) + inode->i_op = &ext4_fast_symlink_inode_operations; + else { + inode->i_op = &ext4_symlink_inode_operations; + ext4_set_aops(inode); + } + } else { + inode->i_op = &ext4_special_inode_operations; + if (raw_inode->i_block[0]) + init_special_inode(inode, inode->i_mode, + old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); + else + init_special_inode(inode, inode->i_mode, + new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } + brelse(iloc.bh); + ext4_set_inode_flags(inode); + unlock_new_inode(inode); + return inode; + +bad_inode: + iget_failed(inode); + return ERR_PTR(ret); +} + +static int ext4_inode_blocks_set(handle_t *handle, + struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +{ + struct inode *inode = &(ei->vfs_inode); + u64 i_blocks = inode->i_blocks; + struct super_block *sb = inode->i_sb; + + if (i_blocks <= ~0U) { + /* + * i_blocks can be represnted in a 32 bit variable + * as multiple of 512 bytes + */ + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = 0; + ei->i_flags &= ~EXT4_HUGE_FILE_FL; + return 0; + } + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) + return -EFBIG; + + if (i_blocks <= 0xffffffffffffULL) { + /* + * i_blocks can be represented in a 48 bit variable + * as multiple of 512 bytes + */ + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); + ei->i_flags &= ~EXT4_HUGE_FILE_FL; + } else { + ei->i_flags |= EXT4_HUGE_FILE_FL; + /* i_block is stored in file system block size */ + i_blocks = i_blocks >> (inode->i_blkbits - 9); + raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); + raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); + } + return 0; +} + +/* + * Post the struct inode info into an on-disk inode location in the + * buffer-cache. This gobbles the caller's reference to the + * buffer_head in the inode location struct. + * + * The caller must have write access to iloc->bh. + */ +static int ext4_do_update_inode(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc) +{ + struct ext4_inode *raw_inode = ext4_raw_inode(iloc); + struct ext4_inode_info *ei = EXT4_I(inode); + struct buffer_head *bh = iloc->bh; + int err = 0, rc, block; + + /* For fields not not tracking in the in-memory inode, + * initialise them to zero for new inodes. */ + if (ei->i_state & EXT4_STATE_NEW) + memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); + + ext4_get_inode_flags(ei); + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + if (!(test_opt(inode->i_sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); +/* + * Fix up interoperability with old kernels. Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + if (!ei->i_dtime) { + raw_inode->i_uid_high = + cpu_to_le16(high_16_bits(inode->i_uid)); + raw_inode->i_gid_high = + cpu_to_le16(high_16_bits(inode->i_gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + } else { + raw_inode->i_uid_low = + cpu_to_le16(fs_high2lowuid(inode->i_uid)); + raw_inode->i_gid_low = + cpu_to_le16(fs_high2lowgid(inode->i_gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + + EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); + EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); + + if (ext4_inode_blocks_set(handle, raw_inode, ei)) + goto out_brelse; + raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); + /* clear the migrate flag in the raw_inode */ + raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE); + if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != + cpu_to_le32(EXT4_OS_HURD)) + raw_inode->i_file_acl_high = + cpu_to_le16(ei->i_file_acl >> 32); + raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); + ext4_isize_set(raw_inode, ei->i_disksize); + if (ei->i_disksize > 0x7fffffffULL) { + struct super_block *sb = inode->i_sb; + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || + EXT4_SB(sb)->s_es->s_rev_level == + cpu_to_le32(EXT4_GOOD_OLD_REV)) { + /* If this is the first large file + * created, add a flag to the superblock. + */ + err = ext4_journal_get_write_access(handle, + EXT4_SB(sb)->s_sbh); + if (err) + goto out_brelse; + ext4_update_dynamic_rev(sb); + EXT4_SET_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_LARGE_FILE); + sb->s_dirt = 1; + handle->h_sync = 1; + err = ext4_journal_dirty_metadata(handle, + EXT4_SB(sb)->s_sbh); + } + } + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + raw_inode->i_block[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + raw_inode->i_block[1] = 0; + } else { + raw_inode->i_block[0] = 0; + raw_inode->i_block[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + raw_inode->i_block[2] = 0; + } + } else for (block = 0; block < EXT4_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; + + raw_inode->i_disk_version = cpu_to_le32(inode->i_version); + if (ei->i_extra_isize) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + raw_inode->i_version_hi = + cpu_to_le32(inode->i_version >> 32); + raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + } + + + BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); + rc = ext4_journal_dirty_metadata(handle, bh); + if (!err) + err = rc; + ei->i_state &= ~EXT4_STATE_NEW; + +out_brelse: + brelse(bh); + ext4_std_error(inode->i_sb, err); + return err; +} + +/* + * ext4_write_inode() + * + * We are called from a few places: + * + * - Within generic_file_write() for O_SYNC files. + * Here, there will be no transaction running. We wait for any running + * trasnaction to commit. + * + * - Within sys_sync(), kupdate and such. + * We wait on commit, if tol to. + * + * - Within prune_icache() (PF_MEMALLOC == true) + * Here we simply return. We can't afford to block kswapd on the + * journal commit. + * + * In all cases it is actually safe for us to return without doing anything, + * because the inode has been copied into a raw inode buffer in + * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for + * knfsd. + * + * Note that we are absolutely dependent upon all inode dirtiers doing the + * right thing: they *must* call mark_inode_dirty() after dirtying info in + * which we are interested. + * + * It would be a bug for them to not do this. The code: + * + * mark_inode_dirty(inode) + * stuff(); + * inode->i_size = expr; + * + * is in error because a kswapd-driven write_inode() could occur while + * `stuff()' is running, and the new i_size will be lost. Plus the inode + * will no longer be on the superblock's dirty inode list. + */ +int ext4_write_inode(struct inode *inode, int wait) +{ + if (current->flags & PF_MEMALLOC) + return 0; + + if (ext4_journal_current_handle()) { + jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); + dump_stack(); + return -EIO; + } + + if (!wait) + return 0; + + return ext4_force_commit(inode->i_sb); +} + +/* + * ext4_setattr() + * + * Called from notify_change. + * + * We want to trap VFS attempts to truncate the file as soon as + * possible. In particular, we want to make sure that when the VFS + * shrinks i_size, we put the inode on the orphan list and modify + * i_disksize immediately, so that during the subsequent flushing of + * dirty pages and freeing of disk blocks, we can guarantee that any + * commit will leave the blocks being flushed in an unused state on + * disk. (On recovery, the inode will get truncated and the blocks will + * be freed, so we have a strong guarantee that no future commit will + * leave these blocks visible to the user.) + * + * Another thing we have to assure is that if we are in ordered mode + * and inode is still attached to the committing transaction, we must + * we start writeout of all the dirty pages which are being truncated. + * This way we are sure that all the data written in the previous + * transaction are already on disk (truncate waits for pages under + * writeback). + * + * Called with inode->i_mutex down. + */ +int ext4_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error, rc = 0; + const unsigned int ia_valid = attr->ia_valid; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + handle_t *handle; + + /* (user+group)*(old+new) structure, inode write (sb, + * inode block, ? - but truncate inode update has it) */ + handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ + EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0; + if (error) { + ext4_journal_stop(handle); + return error; + } + /* Update corresponding info in inode so that everything is in + * one transaction */ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + error = ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + } + + if (attr->ia_valid & ATTR_SIZE) { + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (attr->ia_size > sbi->s_bitmap_maxbytes) { + error = -EFBIG; + goto err_out; + } + } + } + + if (S_ISREG(inode->i_mode) && + attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { + handle_t *handle; + + handle = ext4_journal_start(inode, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + + error = ext4_orphan_add(handle, inode); + EXT4_I(inode)->i_disksize = attr->ia_size; + rc = ext4_mark_inode_dirty(handle, inode); + if (!error) + error = rc; + ext4_journal_stop(handle); + + if (ext4_should_order_data(inode)) { + error = ext4_begin_ordered_truncate(inode, + attr->ia_size); + if (error) { + /* Do as much error cleanup as possible */ + handle = ext4_journal_start(inode, 3); + if (IS_ERR(handle)) { + ext4_orphan_del(NULL, inode); + goto err_out; + } + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + goto err_out; + } + } + } + + rc = inode_setattr(inode, attr); + + /* If inode_setattr's call to ext4_truncate failed to get a + * transaction handle at all, we need to clean up the in-core + * orphan list manually. */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + + if (!rc && (ia_valid & ATTR_MODE)) + rc = ext4_acl_chmod(inode); + +err_out: + ext4_std_error(inode->i_sb, error); + if (!error) + error = rc; + return error; +} + +int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode; + unsigned long delalloc_blocks; + + inode = dentry->d_inode; + generic_fillattr(inode, stat); + + /* + * We can't update i_blocks if the block allocation is delayed + * otherwise in the case of system crash before the real block + * allocation is done, we will have i_blocks inconsistent with + * on-disk file blocks. + * We always keep i_blocks updated together with real + * allocation. But to not confuse with user, stat + * will return the blocks that include the delayed allocation + * blocks for this file. + */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + + stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; + return 0; +} + +static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, + int chunk) +{ + int indirects; + + /* if nrblocks are contiguous */ + if (chunk) { + /* + * With N contiguous data blocks, it need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks + * 2 dindirect blocks + * 1 tindirect block + */ + indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); + return indirects + 3; + } + /* + * if nrblocks are not contiguous, worse case, each block touch + * a indirect block, and each indirect block touch a double indirect + * block, plus a triple indirect block + */ + indirects = nrblocks * 2 + 1; + return indirects; +} + +static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return ext4_indirect_trans_blocks(inode, nrblocks, chunk); + return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); +} + +/* + * Account for index blocks, block groups bitmaps and block group + * descriptor blocks if modify datablocks and index blocks + * worse case, the indexs blocks spread over different block groups + * + * If datablocks are discontiguous, they are possible to spread over + * different block groups too. If they are contiugous, with flexbg, + * they could still across block group boundary. + * + * Also account for superblock, inode, quota and xattr blocks + */ +int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int groups, gdpblocks; + int idxblocks; + int ret = 0; + + /* + * How many index blocks need to touch to modify nrblocks? + * The "Chunk" flag indicating whether the nrblocks is + * physically contiguous on disk + * + * For Direct IO and fallocate, they calls get_block to allocate + * one single extent at a time, so they could set the "Chunk" flag + */ + idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); + + ret = idxblocks; + + /* + * Now let's see how many group bitmaps and group descriptors need + * to account + */ + groups = idxblocks; + if (chunk) + groups += 1; + else + groups += nrblocks; + + gdpblocks = groups; + if (groups > EXT4_SB(inode->i_sb)->s_groups_count) + groups = EXT4_SB(inode->i_sb)->s_groups_count; + if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) + gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; + + /* bitmaps and block group descriptor blocks */ + ret += groups + gdpblocks; + + /* Blocks for super block, inode, quota and xattr blocks */ + ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); + + return ret; +} + +/* + * Calulate the total number of credits to reserve to fit + * the modification of a single pages into a single transaction, + * which may include multiple chunks of block allocations. + * + * This could be called via ext4_write_begin() + * + * We need to consider the worse case, when + * one new block per extent. + */ +int ext4_writepage_trans_blocks(struct inode *inode) +{ + int bpp = ext4_journal_blocks_per_page(inode); + int ret; + + ret = ext4_meta_trans_blocks(inode, bpp, 0); + + /* Account for data blocks for journalled mode */ + if (ext4_should_journal_data(inode)) + ret += bpp; + return ret; +} + +/* + * Calculate the journal credits for a chunk of data modification. + * + * This is called from DIO, fallocate or whoever calling + * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks. + * + * journal buffers for data blocks are not included here, as DIO + * and fallocate do no need to journal data buffers. + */ +int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) +{ + return ext4_meta_trans_blocks(inode, nrblocks, 1); +} + +/* + * The caller must have previously called ext4_reserve_inode_write(). + * Give this, we know that the caller already has write access to iloc->bh. + */ +int ext4_mark_iloc_dirty(handle_t *handle, + struct inode *inode, struct ext4_iloc *iloc) +{ + int err = 0; + + if (test_opt(inode->i_sb, I_VERSION)) + inode_inc_iversion(inode); + + /* the do_update_inode consumes one bh->b_count */ + get_bh(iloc->bh); + + /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ + err = ext4_do_update_inode(handle, inode, iloc); + put_bh(iloc->bh); + return err; +} + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int +ext4_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc) +{ + int err = 0; + if (handle) { + err = ext4_get_inode_loc(inode, iloc); + if (!err) { + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, iloc->bh); + if (err) { + brelse(iloc->bh); + iloc->bh = NULL; + } + } + } + ext4_std_error(inode->i_sb, err); + return err; +} + +/* + * Expand an inode by new_extra_isize bytes. + * Returns 0 on success or negative error number on failure. + */ +static int ext4_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc iloc, + handle_t *handle) +{ + struct ext4_inode *raw_inode; + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry; + + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) + return 0; + + raw_inode = ext4_raw_inode(&iloc); + + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + + /* No extended attributes present */ + if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || + header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { + memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, + new_extra_isize); + EXT4_I(inode)->i_extra_isize = new_extra_isize; + return 0; + } + + /* try to expand with EAs present */ + return ext4_expand_extra_isize_ea(inode, new_extra_isize, + raw_inode, handle); +} + +/* + * What we do here is to mark the in-core inode as clean with respect to inode + * dirtiness (it may still be data-dirty). + * This means that the in-core inode may be reaped by prune_icache + * without having to perform any I/O. This is a very good thing, + * because *any* task may call prune_icache - even ones which + * have a transaction open against a different journal. + * + * Is this cheating? Not really. Sure, we haven't written the + * inode out, but prune_icache isn't a user-visible syncing function. + * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) + * we start and wait on commits. + * + * Is this efficient/effective? Well, we're being nice to the system + * by cleaning up our inodes proactively so they can be reaped + * without I/O. But we are potentially leaving up to five seconds' + * worth of inodes floating about which prune_icache wants us to + * write out. One way to fix that would be to get prune_icache() + * to do a write_super() to free up some memory. It has the desired + * effect. + */ +int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) +{ + struct ext4_iloc iloc; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + static unsigned int mnt_count; + int err, ret; + + might_sleep(); + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && + !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { + /* + * We need extra buffer credits since we may write into EA block + * with this same handle. If journal_extend fails, then it will + * only result in a minor loss of functionality for that inode. + * If this is felt to be critical, then e2fsck should be run to + * force a large enough s_min_extra_isize. + */ + if ((jbd2_journal_extend(handle, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { + ret = ext4_expand_extra_isize(inode, + sbi->s_want_extra_isize, + iloc, handle); + if (ret) { + EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; + if (mnt_count != + le16_to_cpu(sbi->s_es->s_mnt_count)) { + ext4_warning(inode->i_sb, __func__, + "Unable to expand inode %lu. Delete" + " some EAs or run e2fsck.", + inode->i_ino); + mnt_count = + le16_to_cpu(sbi->s_es->s_mnt_count); + } + } + } + } + if (!err) + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + return err; +} + +/* + * ext4_dirty_inode() is called from __mark_inode_dirty() + * + * We're really interested in the case where a file is being extended. + * i_size has been changed by generic_commit_write() and we thus need + * to include the updated inode in the current transaction. + * + * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks + * are allocated to the file. + * + * If the inode is marked synchronous, we don't honour that here - doing + * so would cause a commit on atime updates, which we don't bother doing. + * We handle synchronous inodes at the highest possible level. + */ +void ext4_dirty_inode(struct inode *inode) +{ + handle_t *current_handle = ext4_journal_current_handle(); + handle_t *handle; + + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) + goto out; + if (current_handle && + current_handle->h_transaction != handle->h_transaction) { + /* This task has a transaction open against a different fs */ + printk(KERN_EMERG "%s: transactions do not match!\n", + __func__); + } else { + jbd_debug(5, "marking dirty. outer handle=%p\n", + current_handle); + ext4_mark_inode_dirty(handle, inode); + } + ext4_journal_stop(handle); +out: + return; +} + +#if 0 +/* + * Bind an inode's backing buffer_head into this transaction, to prevent + * it from being flushed to disk early. Unlike + * ext4_reserve_inode_write, this leaves behind no bh reference and + * returns no iloc structure, so the caller needs to repeat the iloc + * lookup to mark the inode dirty later. + */ +static int ext4_pin_inode(handle_t *handle, struct inode *inode) +{ + struct ext4_iloc iloc; + + int err = 0; + if (handle) { + err = ext4_get_inode_loc(inode, &iloc); + if (!err) { + BUFFER_TRACE(iloc.bh, "get_write_access"); + err = jbd2_journal_get_write_access(handle, iloc.bh); + if (!err) + err = ext4_journal_dirty_metadata(handle, + iloc.bh); + brelse(iloc.bh); + } + } + ext4_std_error(inode->i_sb, err); + return err; +} +#endif + +int ext4_change_inode_journal_flag(struct inode *inode, int val) +{ + journal_t *journal; + handle_t *handle; + int err; + + /* + * We have to be very careful here: changing a data block's + * journaling status dynamically is dangerous. If we write a + * data block to the journal, change the status and then delete + * that block, we risk forgetting to revoke the old log record + * from the journal and so a subsequent replay can corrupt data. + * So, first we make sure that the journal is empty and that + * nobody is changing anything. + */ + + journal = EXT4_JOURNAL(inode); + if (is_journal_aborted(journal)) + return -EROFS; + + jbd2_journal_lock_updates(journal); + jbd2_journal_flush(journal); + + /* + * OK, there are no updates running now, and all cached data is + * synced to disk. We are now in a completely consistent state + * which doesn't have anything in the journal, and we know that + * no filesystem updates are running, so it is safe to modify + * the inode's in-core data-journaling state flag now. + */ + + if (val) + EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; + else + EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; + ext4_set_aops(inode); + + jbd2_journal_unlock_updates(journal); + + /* Finally we can mark the inode as dirty. */ + + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_mark_inode_dirty(handle, inode); + handle->h_sync = 1; + ext4_journal_stop(handle); + ext4_std_error(inode->i_sb, err); + + return err; +} + +static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) +{ + return !buffer_mapped(bh); +} + +int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + loff_t size; + unsigned long len; + int ret = -EINVAL; + void *fsdata; + struct file *file = vma->vm_file; + struct inode *inode = file->f_path.dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + + /* + * Get i_alloc_sem to stop truncates messing with the inode. We cannot + * get i_mutex because we are already holding mmap_sem. + */ + down_read(&inode->i_alloc_sem); + size = i_size_read(inode); + if (page->mapping != mapping || size <= page_offset(page) + || !PageUptodate(page)) { + /* page got truncated from under us? */ + goto out_unlock; + } + ret = 0; + if (PageMappedToDisk(page)) + goto out_unlock; + + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (page_has_buffers(page)) { + /* return if we have all the buffers mapped */ + if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped)) + goto out_unlock; + } + /* + * OK, we need to fill the hole... Do write_begin write_end + * to do block allocation/reservation.We are not holding + * inode.i__mutex here. That allow * parallel write_begin, + * write_end call. lock_page prevent this from happening + * on the same page though + */ + ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), + len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); + if (ret < 0) + goto out_unlock; + ret = mapping->a_ops->write_end(file, mapping, page_offset(page), + len, len, page, fsdata); + if (ret < 0) + goto out_unlock; + ret = 0; +out_unlock: + up_read(&inode->i_alloc_sem); + return ret; +} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c new file mode 100644 index 0000000..dc99b47 --- /dev/null +++ b/fs/ext4/ioctl.c @@ -0,0 +1,315 @@ +/* + * linux/fs/ext4/ioctl.c + * + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include <linux/fs.h> +#include <linux/jbd2.h> +#include <linux/capability.h> +#include <linux/time.h> +#include <linux/compat.h> +#include <linux/smp_lock.h> +#include <linux/mount.h> +#include <asm/uaccess.h> +#include "ext4_jbd2.h" +#include "ext4.h" + +long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int flags; + + ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { + case EXT4_IOC_GETFLAGS: + ext4_get_inode_flags(ei); + flags = ei->i_flags & EXT4_FL_USER_VISIBLE; + return put_user(flags, (int __user *) arg); + case EXT4_IOC_SETFLAGS: { + handle_t *handle = NULL; + int err, migrate = 0; + struct ext4_iloc iloc; + unsigned int oldflags; + unsigned int jflag; + + if (!is_owner_or_cap(inode)) + return -EACCES; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + if (!S_ISDIR(inode->i_mode)) + flags &= ~EXT4_DIRSYNC_FL; + + err = -EPERM; + mutex_lock(&inode->i_mutex); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + goto flags_out; + + oldflags = ei->i_flags; + + /* The JOURNAL_DATA flag is modifiable only by root */ + jflag = flags & EXT4_JOURNAL_DATA_FL; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; + } + + /* + * The JOURNAL_DATA flag can only be changed by + * the relevant capability. + */ + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + if (!capable(CAP_SYS_RESOURCE)) + goto flags_out; + } + if (oldflags & EXT4_EXTENTS_FL) { + /* We don't support clearning extent flags */ + if (!(flags & EXT4_EXTENTS_FL)) { + err = -EOPNOTSUPP; + goto flags_out; + } + } else if (flags & EXT4_EXTENTS_FL) { + /* migrate the file */ + migrate = 1; + flags &= ~EXT4_EXTENTS_FL; + } + + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto flags_out; + } + if (IS_SYNC(inode)) + handle->h_sync = 1; + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + flags = flags & EXT4_FL_USER_MODIFIABLE; + flags |= oldflags & ~EXT4_FL_USER_MODIFIABLE; + ei->i_flags = flags; + + ext4_set_inode_flags(inode); + inode->i_ctime = ext4_current_time(inode); + + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +flags_err: + ext4_journal_stop(handle); + if (err) + goto flags_out; + + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) + err = ext4_change_inode_journal_flag(inode, jflag); + if (err) + goto flags_out; + if (migrate) + err = ext4_ext_migrate(inode); +flags_out: + mutex_unlock(&inode->i_mutex); + mnt_drop_write(filp->f_path.mnt); + return err; + } + case EXT4_IOC_GETVERSION: + case EXT4_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); + case EXT4_IOC_SETVERSION: + case EXT4_IOC_SETVERSION_OLD: { + handle_t *handle; + struct ext4_iloc iloc; + __u32 generation; + int err; + + if (!is_owner_or_cap(inode)) + return -EPERM; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + if (get_user(generation, (int __user *) arg)) { + err = -EFAULT; + goto setversion_out; + } + + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto setversion_out; + } + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err == 0) { + inode->i_ctime = ext4_current_time(inode); + inode->i_generation = generation; + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + } + ext4_journal_stop(handle); +setversion_out: + mnt_drop_write(filp->f_path.mnt); + return err; + } +#ifdef CONFIG_JBD2_DEBUG + case EXT4_IOC_WAIT_FOR_READONLY: + /* + * This is racy - by the time we're woken up and running, + * the superblock could be released. And the module could + * have been unloaded. So sue me. + * + * Returns 1 if it slept, else zero. + */ + { + struct super_block *sb = inode->i_sb; + DECLARE_WAITQUEUE(wait, current); + int ret = 0; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait); + if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) { + schedule(); + ret = 1; + } + remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait); + return ret; + } +#endif + case EXT4_IOC_GROUP_EXTEND: { + ext4_fsblk_t n_blocks_count; + struct super_block *sb = inode->i_sb; + int err, err2; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (get_user(n_blocks_count, (__u32 __user *)arg)) + return -EFAULT; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (err == 0) + err = err2; + mnt_drop_write(filp->f_path.mnt); + + return err; + } + case EXT4_IOC_GROUP_ADD: { + struct ext4_new_group_data input; + struct super_block *sb = inode->i_sb; + int err, err2; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, + sizeof(input))) + return -EFAULT; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + err = ext4_group_add(sb, &input); + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (err == 0) + err = err2; + mnt_drop_write(filp->f_path.mnt); + + return err; + } + + case EXT4_IOC_MIGRATE: + { + int err; + if (!is_owner_or_cap(inode)) + return -EACCES; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + /* + * inode_mutex prevent write and truncate on the file. + * Read still goes through. We take i_data_sem in + * ext4_ext_swap_inode_data before we switch the + * inode format to prevent read. + */ + mutex_lock(&(inode->i_mutex)); + err = ext4_ext_migrate(inode); + mutex_unlock(&(inode->i_mutex)); + mnt_drop_write(filp->f_path.mnt); + return err; + } + + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + /* These are just misnamed, they actually get/put from/to user an int */ + switch (cmd) { + case EXT4_IOC32_GETFLAGS: + cmd = EXT4_IOC_GETFLAGS; + break; + case EXT4_IOC32_SETFLAGS: + cmd = EXT4_IOC_SETFLAGS; + break; + case EXT4_IOC32_GETVERSION: + cmd = EXT4_IOC_GETVERSION; + break; + case EXT4_IOC32_SETVERSION: + cmd = EXT4_IOC_SETVERSION; + break; + case EXT4_IOC32_GROUP_EXTEND: + cmd = EXT4_IOC_GROUP_EXTEND; + break; + case EXT4_IOC32_GETVERSION_OLD: + cmd = EXT4_IOC_GETVERSION_OLD; + break; + case EXT4_IOC32_SETVERSION_OLD: + cmd = EXT4_IOC_SETVERSION_OLD; + break; +#ifdef CONFIG_JBD2_DEBUG + case EXT4_IOC32_WAIT_FOR_READONLY: + cmd = EXT4_IOC_WAIT_FOR_READONLY; + break; +#endif + case EXT4_IOC32_GETRSVSZ: + cmd = EXT4_IOC_GETRSVSZ; + break; + case EXT4_IOC32_SETRSVSZ: + cmd = EXT4_IOC_SETRSVSZ; + break; + case EXT4_IOC_GROUP_ADD: + break; + default: + return -ENOIOCTLCMD; + } + return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c new file mode 100644 index 0000000..f8e923f --- /dev/null +++ b/fs/ext4/mballoc.c @@ -0,0 +1,4906 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas <alex@clusterfs.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + */ + + +/* + * mballoc.c contains the multiblocks allocation routines + */ + +#include "mballoc.h" +/* + * MUSTDO: + * - test ext4_ext_search_left() and ext4_ext_search_right() + * - search for metadata in few groups + * + * TODO v4: + * - normalization should take into account whether file is still open + * - discard preallocations if no free space left (policy?) + * - don't normalize tails + * - quota + * - reservation for superuser + * + * TODO v3: + * - bitmap read-ahead (proposed by Oleg Drokin aka green) + * - track min/max extents in each group for better group selection + * - mb_mark_used() may allocate chunk right after splitting buddy + * - tree of groups sorted by number of free blocks + * - error handling + */ + +/* + * The allocation request involve request for multiple number of blocks + * near to the goal(block) value specified. + * + * During initialization phase of the allocator we decide to use the group + * preallocation or inode preallocation depending on the size file. The + * size of the file could be the resulting file size we would have after + * allocation or the current file size which ever is larger. If the size is + * less that sbi->s_mb_stream_request we select the group + * preallocation. The default value of s_mb_stream_request is 16 + * blocks. This can also be tuned via + * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms + * of number of blocks. + * + * The main motivation for having small file use group preallocation is to + * ensure that we have small file closer in the disk. + * + * First stage the allocator looks at the inode prealloc list + * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for + * this particular inode. The inode prealloc space is represented as: + * + * pa_lstart -> the logical start block for this prealloc space + * pa_pstart -> the physical start block for this prealloc space + * pa_len -> lenght for this prealloc space + * pa_free -> free space available in this prealloc space + * + * The inode preallocation space is used looking at the _logical_ start + * block. If only the logical file block falls within the range of prealloc + * space we will consume the particular prealloc space. This make sure that + * that the we have contiguous physical blocks representing the file blocks + * + * The important thing to be noted in case of inode prealloc space is that + * we don't modify the values associated to inode prealloc space except + * pa_free. + * + * If we are not able to find blocks in the inode prealloc space and if we + * have the group allocation flag set then we look at the locality group + * prealloc space. These are per CPU prealloc list repreasented as + * + * ext4_sb_info.s_locality_groups[smp_processor_id()] + * + * The reason for having a per cpu locality group is to reduce the contention + * between CPUs. It is possible to get scheduled at this point. + * + * The locality group prealloc space is used looking at whether we have + * enough free space (pa_free) withing the prealloc space. + * + * If we can't allocate blocks via inode prealloc or/and locality group + * prealloc then we look at the buddy cache. The buddy cache is represented + * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets + * mapped to the buddy and bitmap information regarding different + * groups. The buddy information is attached to buddy cache inode so that + * we can access them through the page cache. The information regarding + * each group is loaded via ext4_mb_load_buddy. The information involve + * block bitmap and buddy information. The information are stored in the + * inode as: + * + * { page } + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... + * + * + * one block each for bitmap and buddy information. So for each group we + * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE / + * blocksize) blocks. So it can have information regarding groups_per_page + * which is blocks_per_page/2 + * + * The buddy cache inode is not stored on disk. The inode is thrown + * away when the filesystem is unmounted. + * + * We look for count number of blocks in the buddy cache. If we were able + * to locate that many free blocks we return with additional information + * regarding rest of the contiguous physical block available + * + * Before allocating blocks via buddy cache we normalize the request + * blocks. This ensure we ask for more blocks that we needed. The extra + * blocks that we get after allocation is added to the respective prealloc + * list. In case of inode preallocation we follow a list of heuristics + * based on file size. This can be found in ext4_mb_normalize_request. If + * we are doing a group prealloc we try to normalize the request to + * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to + * 512 blocks. This can be tuned via + * /proc/fs/ext4/<partition/group_prealloc. The value is represented in + * terms of number of blocks. If we have mounted the file system with -O + * stripe=<value> option the group prealloc request is normalized to the + * stripe value (sbi->s_stripe) + * + * The regular allocator(using the buddy cache) support few tunables. + * + * /proc/fs/ext4/<partition>/min_to_scan + * /proc/fs/ext4/<partition>/max_to_scan + * /proc/fs/ext4/<partition>/order2_req + * + * The regular allocator use buddy scan only if the request len is power of + * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The + * value of s_mb_order2_reqs can be tuned via + * /proc/fs/ext4/<partition>/order2_req. If the request len is equal to + * stripe size (sbi->s_stripe), we try to search for contigous block in + * stripe size. This should result in better allocation on RAID setup. If + * not we search in the specific group using bitmap for best extents. The + * tunable min_to_scan and max_to_scan controll the behaviour here. + * min_to_scan indicate how long the mballoc __must__ look for a best + * extent and max_to_scanindicate how long the mballoc __can__ look for a + * best extent in the found extents. Searching for the blocks starts with + * the group specified as the goal value in allocation context via + * ac_g_ex. Each group is first checked based on the criteria whether it + * can used for allocation. ext4_mb_good_group explains how the groups are + * checked. + * + * Both the prealloc space are getting populated as above. So for the first + * request we will hit the buddy cache which will result in this prealloc + * space getting filled. The prealloc space is then later used for the + * subsequent request. + */ + +/* + * mballoc operates on the following data: + * - on-disk bitmap + * - in-core buddy (actually includes buddy and bitmap) + * - preallocation descriptors (PAs) + * + * there are two types of preallocations: + * - inode + * assiged to specific inode and can be used for this inode only. + * it describes part of inode's space preallocated to specific + * physical blocks. any block from that preallocated can be used + * independent. the descriptor just tracks number of blocks left + * unused. so, before taking some block from descriptor, one must + * make sure corresponded logical block isn't allocated yet. this + * also means that freeing any block within descriptor's range + * must discard all preallocated blocks. + * - locality group + * assigned to specific locality group which does not translate to + * permanent set of inodes: inode can join and leave group. space + * from this type of preallocation can be used for any inode. thus + * it's consumed from the beginning to the end. + * + * relation between them can be expressed as: + * in-core buddy = on-disk bitmap + preallocation descriptors + * + * this mean blocks mballoc considers used are: + * - allocated blocks (persistent) + * - preallocated blocks (non-persistent) + * + * consistency in mballoc world means that at any time a block is either + * free or used in ALL structures. notice: "any time" should not be read + * literally -- time is discrete and delimited by locks. + * + * to keep it simple, we don't use block numbers, instead we count number of + * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA. + * + * all operations can be expressed as: + * - init buddy: buddy = on-disk + PAs + * - new PA: buddy += N; PA = N + * - use inode PA: on-disk += N; PA -= N + * - discard inode PA buddy -= on-disk - PA; PA = 0 + * - use locality group PA on-disk += N; PA -= N + * - discard locality group PA buddy -= PA; PA = 0 + * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap + * is used in real operation because we can't know actual used + * bits from PA, only from on-disk bitmap + * + * if we follow this strict logic, then all operations above should be atomic. + * given some of them can block, we'd have to use something like semaphores + * killing performance on high-end SMP hardware. let's try to relax it using + * the following knowledge: + * 1) if buddy is referenced, it's already initialized + * 2) while block is used in buddy and the buddy is referenced, + * nobody can re-allocate that block + * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has + * bit set and PA claims same block, it's OK. IOW, one can set bit in + * on-disk bitmap if buddy has same bit set or/and PA covers corresponded + * block + * + * so, now we're building a concurrency table: + * - init buddy vs. + * - new PA + * blocks for PA are allocated in the buddy, buddy must be referenced + * until PA is linked to allocation group to avoid concurrent buddy init + * - use inode PA + * we need to make sure that either on-disk bitmap or PA has uptodate data + * given (3) we care that PA-=N operation doesn't interfere with init + * - discard inode PA + * the simplest way would be to have buddy initialized by the discard + * - use locality group PA + * again PA-=N must be serialized with init + * - discard locality group PA + * the simplest way would be to have buddy initialized by the discard + * - new PA vs. + * - use inode PA + * i_data_sem serializes them + * - discard inode PA + * discard process must wait until PA isn't used by another process + * - use locality group PA + * some mutex should serialize them + * - discard locality group PA + * discard process must wait until PA isn't used by another process + * - use inode PA + * - use inode PA + * i_data_sem or another mutex should serializes them + * - discard inode PA + * discard process must wait until PA isn't used by another process + * - use locality group PA + * nothing wrong here -- they're different PAs covering different blocks + * - discard locality group PA + * discard process must wait until PA isn't used by another process + * + * now we're ready to make few consequences: + * - PA is referenced and while it is no discard is possible + * - PA is referenced until block isn't marked in on-disk bitmap + * - PA changes only after on-disk bitmap + * - discard must not compete with init. either init is done before + * any discard or they're serialized somehow + * - buddy init as sum of on-disk bitmap and PAs is done atomically + * + * a special case when we've used PA to emptiness. no need to modify buddy + * in this case, but we should care about concurrent init + * + */ + + /* + * Logic in few words: + * + * - allocation: + * load group + * find blocks + * mark bits in on-disk bitmap + * release group + * + * - use preallocation: + * find proper PA (per-inode or group) + * load group + * mark bits in on-disk bitmap + * release group + * release PA + * + * - free: + * load group + * mark bits in on-disk bitmap + * release group + * + * - discard preallocations in group: + * mark PAs deleted + * move them onto local list + * load on-disk bitmap + * load group + * remove PA from object (inode or locality group) + * mark free blocks in-core + * + * - discard inode's preallocations: + */ + +/* + * Locking rules + * + * Locks: + * - bitlock on a group (group) + * - object (inode/locality) (object) + * - per-pa lock (pa) + * + * Paths: + * - new pa + * object + * group + * + * - find and use pa: + * pa + * + * - release consumed pa: + * pa + * group + * object + * + * - generate in-core bitmap: + * group + * pa + * + * - discard all for given object (inode, locality group): + * object + * pa + * group + * + * - discard all for given group: + * group + * pa + * group + * object + * + */ +static struct kmem_cache *ext4_pspace_cachep; +static struct kmem_cache *ext4_ac_cachep; +static struct kmem_cache *ext4_free_ext_cachep; +static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group); +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group); +static int ext4_mb_init_per_dev_proc(struct super_block *sb); +static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); + + + +static inline void *mb_correct_addr_and_bit(int *bit, void *addr) +{ +#if BITS_PER_LONG == 64 + *bit += ((unsigned long) addr & 7UL) << 3; + addr = (void *) ((unsigned long) addr & ~7UL); +#elif BITS_PER_LONG == 32 + *bit += ((unsigned long) addr & 3UL) << 3; + addr = (void *) ((unsigned long) addr & ~3UL); +#else +#error "how many bits you are?!" +#endif + return addr; +} + +static inline int mb_test_bit(int bit, void *addr) +{ + /* + * ext4_test_bit on architecture like powerpc + * needs unsigned long aligned address + */ + addr = mb_correct_addr_and_bit(&bit, addr); + return ext4_test_bit(bit, addr); +} + +static inline void mb_set_bit(int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + ext4_set_bit(bit, addr); +} + +static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + ext4_set_bit_atomic(lock, bit, addr); +} + +static inline void mb_clear_bit(int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + ext4_clear_bit(bit, addr); +} + +static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + ext4_clear_bit_atomic(lock, bit, addr); +} + +static inline int mb_find_next_zero_bit(void *addr, int max, int start) +{ + int fix = 0, ret, tmpmax; + addr = mb_correct_addr_and_bit(&fix, addr); + tmpmax = max + fix; + start += fix; + + ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; +} + +static inline int mb_find_next_bit(void *addr, int max, int start) +{ + int fix = 0, ret, tmpmax; + addr = mb_correct_addr_and_bit(&fix, addr); + tmpmax = max + fix; + start += fix; + + ret = ext4_find_next_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; +} + +static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) +{ + char *bb; + + BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(max == NULL); + + if (order > e4b->bd_blkbits + 1) { + *max = 0; + return NULL; + } + + /* at order 0 we see each particular block */ + *max = 1 << (e4b->bd_blkbits + 3); + if (order == 0) + return EXT4_MB_BITMAP(e4b); + + bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; + *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; + + return bb; +} + +#ifdef DOUBLE_CHECK +static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, + int first, int count) +{ + int i; + struct super_block *sb = e4b->bd_sb; + + if (unlikely(e4b->bd_info->bb_bitmap == NULL)) + return; + BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); + for (i = 0; i < count; i++) { + if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { + ext4_fsblk_t blocknr; + blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); + blocknr += first + i; + blocknr += + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + + ext4_error(sb, __func__, "double-free of inode" + " %lu's block %llu(bit %u in group %lu)\n", + inode ? inode->i_ino : 0, blocknr, + first + i, e4b->bd_group); + } + mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); + } +} + +static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) +{ + int i; + + if (unlikely(e4b->bd_info->bb_bitmap == NULL)) + return; + BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + for (i = 0; i < count; i++) { + BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); + mb_set_bit(first + i, e4b->bd_info->bb_bitmap); + } +} + +static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) +{ + if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { + unsigned char *b1, *b2; + int i; + b1 = (unsigned char *) e4b->bd_info->bb_bitmap; + b2 = (unsigned char *) bitmap; + for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { + if (b1[i] != b2[i]) { + printk(KERN_ERR "corruption in group %lu " + "at byte %u(%u): %x in copy != %x " + "on disk/prealloc\n", + e4b->bd_group, i, i * 8, b1[i], b2[i]); + BUG(); + } + } + } +} + +#else +static inline void mb_free_blocks_double(struct inode *inode, + struct ext4_buddy *e4b, int first, int count) +{ + return; +} +static inline void mb_mark_used_double(struct ext4_buddy *e4b, + int first, int count) +{ + return; +} +static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) +{ + return; +} +#endif + +#ifdef AGGRESSIVE_CHECK + +#define MB_CHECK_ASSERT(assert) \ +do { \ + if (!(assert)) { \ + printk(KERN_EMERG \ + "Assertion failure in %s() at %s:%d: \"%s\"\n", \ + function, file, line, # assert); \ + BUG(); \ + } \ +} while (0) + +static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, + const char *function, int line) +{ + struct super_block *sb = e4b->bd_sb; + int order = e4b->bd_blkbits + 1; + int max; + int max2; + int i; + int j; + int k; + int count; + struct ext4_group_info *grp; + int fragments = 0; + int fstart; + struct list_head *cur; + void *buddy; + void *buddy2; + + { + static int mb_check_counter; + if (mb_check_counter++ % 100 != 0) + return 0; + } + + while (order > 1) { + buddy = mb_find_buddy(e4b, order, &max); + MB_CHECK_ASSERT(buddy); + buddy2 = mb_find_buddy(e4b, order - 1, &max2); + MB_CHECK_ASSERT(buddy2); + MB_CHECK_ASSERT(buddy != buddy2); + MB_CHECK_ASSERT(max * 2 == max2); + + count = 0; + for (i = 0; i < max; i++) { + + if (mb_test_bit(i, buddy)) { + /* only single bit in buddy2 may be 1 */ + if (!mb_test_bit(i << 1, buddy2)) { + MB_CHECK_ASSERT( + mb_test_bit((i<<1)+1, buddy2)); + } else if (!mb_test_bit((i << 1) + 1, buddy2)) { + MB_CHECK_ASSERT( + mb_test_bit(i << 1, buddy2)); + } + continue; + } + + /* both bits in buddy2 must be 0 */ + MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); + MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); + + for (j = 0; j < (1 << order); j++) { + k = (i * (1 << order)) + j; + MB_CHECK_ASSERT( + !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); + } + count++; + } + MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); + order--; + } + + fstart = -1; + buddy = mb_find_buddy(e4b, 0, &max); + for (i = 0; i < max; i++) { + if (!mb_test_bit(i, buddy)) { + MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); + if (fstart == -1) { + fragments++; + fstart = i; + } + continue; + } + fstart = -1; + /* check used bits only */ + for (j = 0; j < e4b->bd_blkbits + 1; j++) { + buddy2 = mb_find_buddy(e4b, j, &max2); + k = i >> j; + MB_CHECK_ASSERT(k < max2); + MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); + } + } + MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); + MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); + + grp = ext4_get_group_info(sb, e4b->bd_group); + buddy = mb_find_buddy(e4b, 0, &max); + list_for_each(cur, &grp->bb_prealloc_list) { + ext4_group_t groupnr; + struct ext4_prealloc_space *pa; + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); + MB_CHECK_ASSERT(groupnr == e4b->bd_group); + for (i = 0; i < pa->pa_len; i++) + MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); + } + return 0; +} +#undef MB_CHECK_ASSERT +#define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ + __FILE__, __func__, __LINE__) +#else +#define mb_check_buddy(e4b) +#endif + +/* FIXME!! need more doc */ +static void ext4_mb_mark_free_simple(struct super_block *sb, + void *buddy, unsigned first, int len, + struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned short min; + unsigned short max; + unsigned short chunk; + unsigned short border; + + BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); + + border = 2 << sb->s_blocksize_bits; + + while (len > 0) { + /* find how many blocks can be covered since this position */ + max = ffs(first | border) - 1; + + /* find how many blocks of power 2 we need to mark */ + min = fls(len) - 1; + + if (max < min) + min = max; + chunk = 1 << min; + + /* mark multiblock chunks only */ + grp->bb_counters[min]++; + if (min > 0) + mb_clear_bit(first >> min, + buddy + sbi->s_mb_offsets[min]); + + len -= chunk; + first += chunk; + } +} + +static void ext4_mb_generate_buddy(struct super_block *sb, + void *buddy, void *bitmap, ext4_group_t group) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); + unsigned short i = 0; + unsigned short first; + unsigned short len; + unsigned free = 0; + unsigned fragments = 0; + unsigned long long period = get_cycles(); + + /* initialize buddy from bitmap which is aggregation + * of on-disk bitmap and preallocations */ + i = mb_find_next_zero_bit(bitmap, max, 0); + grp->bb_first_free = i; + while (i < max) { + fragments++; + first = i; + i = mb_find_next_bit(bitmap, max, i); + len = i - first; + free += len; + if (len > 1) + ext4_mb_mark_free_simple(sb, buddy, first, len, grp); + else + grp->bb_counters[0]++; + if (i < max) + i = mb_find_next_zero_bit(bitmap, max, i); + } + grp->bb_fragments = fragments; + + if (free != grp->bb_free) { + ext4_error(sb, __func__, + "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n", + group, free, grp->bb_free); + /* + * If we intent to continue, we consider group descritor + * corrupt and update bb_free using bitmap value + */ + grp->bb_free = free; + } + + clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); + + period = get_cycles() - period; + spin_lock(&EXT4_SB(sb)->s_bal_lock); + EXT4_SB(sb)->s_mb_buddies_generated++; + EXT4_SB(sb)->s_mb_generation_time += period; + spin_unlock(&EXT4_SB(sb)->s_bal_lock); +} + +/* The buddy information is attached the buddy cache inode + * for convenience. The information regarding each group + * is loaded via ext4_mb_load_buddy. The information involve + * block bitmap and buddy information. The information are + * stored in the inode as + * + * { page } + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... + * + * + * one block each for bitmap and buddy information. + * So for each group we take up 2 blocks. A page can + * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. + * So it can have information regarding groups_per_page which + * is blocks_per_page/2 + */ + +static int ext4_mb_init_cache(struct page *page, char *incore) +{ + int blocksize; + int blocks_per_page; + int groups_per_page; + int err = 0; + int i; + ext4_group_t first_group; + int first_block; + struct super_block *sb; + struct buffer_head *bhs; + struct buffer_head **bh; + struct inode *inode; + char *data; + char *bitmap; + + mb_debug("init page %lu\n", page->index); + + inode = page->mapping->host; + sb = inode->i_sb; + blocksize = 1 << inode->i_blkbits; + blocks_per_page = PAGE_CACHE_SIZE / blocksize; + + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; + + /* allocate buffer_heads to read bitmaps */ + if (groups_per_page > 1) { + err = -ENOMEM; + i = sizeof(struct buffer_head *) * groups_per_page; + bh = kzalloc(i, GFP_NOFS); + if (bh == NULL) + goto out; + } else + bh = &bhs; + + first_group = page->index * blocks_per_page / 2; + + /* read all groups the page covers into the cache */ + for (i = 0; i < groups_per_page; i++) { + struct ext4_group_desc *desc; + + if (first_group + i >= EXT4_SB(sb)->s_groups_count) + break; + + err = -EIO; + desc = ext4_get_group_desc(sb, first_group + i, NULL); + if (desc == NULL) + goto out; + + err = -ENOMEM; + bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc)); + if (bh[i] == NULL) + goto out; + + if (bitmap_uptodate(bh[i])) + continue; + + lock_buffer(bh[i]); + if (bitmap_uptodate(bh[i])) { + unlock_buffer(bh[i]); + continue; + } + spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + ext4_init_block_bitmap(sb, bh[i], + first_group + i, desc); + set_bitmap_uptodate(bh[i]); + set_buffer_uptodate(bh[i]); + unlock_buffer(bh[i]); + spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + continue; + } + spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + if (buffer_uptodate(bh[i])) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh[i]); + unlock_buffer(bh[i]); + continue; + } + get_bh(bh[i]); + /* + * submit the buffer_head for read. We can + * safely mark the bitmap as uptodate now. + * We do it here so the bitmap uptodate bit + * get set with buffer lock held. + */ + set_bitmap_uptodate(bh[i]); + bh[i]->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh[i]); + mb_debug("read bitmap for group %lu\n", first_group + i); + } + + /* wait for I/O completion */ + for (i = 0; i < groups_per_page && bh[i]; i++) + wait_on_buffer(bh[i]); + + err = -EIO; + for (i = 0; i < groups_per_page && bh[i]; i++) + if (!buffer_uptodate(bh[i])) + goto out; + + err = 0; + first_block = page->index * blocks_per_page; + /* init the page */ + memset(page_address(page), 0xff, PAGE_CACHE_SIZE); + for (i = 0; i < blocks_per_page; i++) { + int group; + struct ext4_group_info *grinfo; + + group = (first_block + i) >> 1; + if (group >= EXT4_SB(sb)->s_groups_count) + break; + + /* + * data carry information regarding this + * particular group in the format specified + * above + * + */ + data = page_address(page) + (i * blocksize); + bitmap = bh[group - first_group]->b_data; + + /* + * We place the buddy block and bitmap block + * close together + */ + if ((first_block + i) & 1) { + /* this is block of buddy */ + BUG_ON(incore == NULL); + mb_debug("put buddy for group %u in page %lu/%x\n", + group, page->index, i * blocksize); + grinfo = ext4_get_group_info(sb, group); + grinfo->bb_fragments = 0; + memset(grinfo->bb_counters, 0, + sizeof(unsigned short)*(sb->s_blocksize_bits+2)); + /* + * incore got set to the group block bitmap below + */ + ext4_lock_group(sb, group); + ext4_mb_generate_buddy(sb, data, incore, group); + ext4_unlock_group(sb, group); + incore = NULL; + } else { + /* this is block of bitmap */ + BUG_ON(incore != NULL); + mb_debug("put bitmap for group %u in page %lu/%x\n", + group, page->index, i * blocksize); + + /* see comments in ext4_mb_put_pa() */ + ext4_lock_group(sb, group); + memcpy(data, bitmap, blocksize); + + /* mark all preallocated blks used in in-core bitmap */ + ext4_mb_generate_from_pa(sb, data, group); + ext4_mb_generate_from_freelist(sb, data, group); + ext4_unlock_group(sb, group); + + /* set incore so that the buddy information can be + * generated using this + */ + incore = data; + } + } + SetPageUptodate(page); + +out: + if (bh) { + for (i = 0; i < groups_per_page && bh[i]; i++) + brelse(bh[i]); + if (bh != &bhs) + kfree(bh); + } + return err; +} + +static noinline_for_stack int +ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b) +{ + int blocks_per_page; + int block; + int pnum; + int poff; + struct page *page; + int ret; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + + mb_debug("load group %lu\n", group); + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + grp = ext4_get_group_info(sb, group); + + e4b->bd_blkbits = sb->s_blocksize_bits; + e4b->bd_info = ext4_get_group_info(sb, group); + e4b->bd_sb = sb; + e4b->bd_group = group; + e4b->bd_buddy_page = NULL; + e4b->bd_bitmap_page = NULL; + e4b->alloc_semp = &grp->alloc_sem; + + /* Take the read lock on the group alloc + * sem. This would make sure a parallel + * ext4_mb_init_group happening on other + * groups mapped by the page is blocked + * till we are done with allocation + */ + down_read(e4b->alloc_semp); + + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + + /* we could use find_or_create_page(), but it locks page + * what we'd like to avoid in fast path ... */ + page = find_get_page(inode->i_mapping, pnum); + if (page == NULL || !PageUptodate(page)) { + if (page) + /* + * drop the page reference and try + * to get the page with lock. If we + * are not uptodate that implies + * somebody just created the page but + * is yet to initialize the same. So + * wait for it to initialize. + */ + page_cache_release(page); + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + if (!PageUptodate(page)) { + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } + mb_cmp_bitmaps(e4b, page_address(page) + + (poff * sb->s_blocksize)); + } + unlock_page(page); + } + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + e4b->bd_bitmap_page = page; + e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + mark_page_accessed(page); + + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + + page = find_get_page(inode->i_mapping, pnum); + if (page == NULL || !PageUptodate(page)) { + if (page) + page_cache_release(page); + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + if (!PageUptodate(page)) { + ret = ext4_mb_init_cache(page, e4b->bd_bitmap); + if (ret) { + unlock_page(page); + goto err; + } + } + unlock_page(page); + } + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + e4b->bd_buddy_page = page; + e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); + mark_page_accessed(page); + + BUG_ON(e4b->bd_bitmap_page == NULL); + BUG_ON(e4b->bd_buddy_page == NULL); + + return 0; + +err: + if (e4b->bd_bitmap_page) + page_cache_release(e4b->bd_bitmap_page); + if (e4b->bd_buddy_page) + page_cache_release(e4b->bd_buddy_page); + e4b->bd_buddy = NULL; + e4b->bd_bitmap = NULL; + + /* Done with the buddy cache */ + up_read(e4b->alloc_semp); + return ret; +} + +static void ext4_mb_release_desc(struct ext4_buddy *e4b) +{ + if (e4b->bd_bitmap_page) + page_cache_release(e4b->bd_bitmap_page); + if (e4b->bd_buddy_page) + page_cache_release(e4b->bd_buddy_page); + /* Done with the buddy cache */ + if (e4b->alloc_semp) + up_read(e4b->alloc_semp); +} + + +static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) +{ + int order = 1; + void *bb; + + BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); + + bb = EXT4_MB_BUDDY(e4b); + while (order <= e4b->bd_blkbits + 1) { + block = block >> 1; + if (!mb_test_bit(block, bb)) { + /* this block is part of buddy of order 'order' */ + return order; + } + bb += 1 << (e4b->bd_blkbits - order); + order++; + } + return 0; +} + +static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) +{ + __u32 *addr; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + *addr = 0; + cur += 32; + continue; + } + if (lock) + mb_clear_bit_atomic(lock, cur, bm); + else + mb_clear_bit(cur, bm); + cur++; + } +} + +static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) +{ + __u32 *addr; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: set whole word at once */ + addr = bm + (cur >> 3); + *addr = 0xffffffff; + cur += 32; + continue; + } + if (lock) + mb_set_bit_atomic(lock, cur, bm); + else + mb_set_bit(cur, bm); + cur++; + } +} + +static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, + int first, int count) +{ + int block = 0; + int max = 0; + int order; + void *buddy; + void *buddy2; + struct super_block *sb = e4b->bd_sb; + + BUG_ON(first + count > (sb->s_blocksize << 3)); + BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); + mb_check_buddy(e4b); + mb_free_blocks_double(inode, e4b, first, count); + + e4b->bd_info->bb_free += count; + if (first < e4b->bd_info->bb_first_free) + e4b->bd_info->bb_first_free = first; + + /* let's maintain fragments counter */ + if (first != 0) + block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); + if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) + max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); + if (block && max) + e4b->bd_info->bb_fragments--; + else if (!block && !max) + e4b->bd_info->bb_fragments++; + + /* let's maintain buddy itself */ + while (count-- > 0) { + block = first++; + order = 0; + + if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { + ext4_fsblk_t blocknr; + blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); + blocknr += block; + blocknr += + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + ext4_unlock_group(sb, e4b->bd_group); + ext4_error(sb, __func__, "double-free of inode" + " %lu's block %llu(bit %u in group %lu)\n", + inode ? inode->i_ino : 0, blocknr, block, + e4b->bd_group); + ext4_lock_group(sb, e4b->bd_group); + } + mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); + e4b->bd_info->bb_counters[order]++; + + /* start of the buddy */ + buddy = mb_find_buddy(e4b, order, &max); + + do { + block &= ~1UL; + if (mb_test_bit(block, buddy) || + mb_test_bit(block + 1, buddy)) + break; + + /* both the buddies are free, try to coalesce them */ + buddy2 = mb_find_buddy(e4b, order + 1, &max); + + if (!buddy2) + break; + + if (order > 0) { + /* for special purposes, we don't set + * free bits in bitmap */ + mb_set_bit(block, buddy); + mb_set_bit(block + 1, buddy); + } + e4b->bd_info->bb_counters[order]--; + e4b->bd_info->bb_counters[order]--; + + block = block >> 1; + order++; + e4b->bd_info->bb_counters[order]++; + + mb_clear_bit(block, buddy2); + buddy = buddy2; + } while (1); + } + mb_check_buddy(e4b); +} + +static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, + int needed, struct ext4_free_extent *ex) +{ + int next = block; + int max; + int ord; + void *buddy; + + BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + BUG_ON(ex == NULL); + + buddy = mb_find_buddy(e4b, order, &max); + BUG_ON(buddy == NULL); + BUG_ON(block >= max); + if (mb_test_bit(block, buddy)) { + ex->fe_len = 0; + ex->fe_start = 0; + ex->fe_group = 0; + return 0; + } + + /* FIXME dorp order completely ? */ + if (likely(order == 0)) { + /* find actual order */ + order = mb_find_order_for_block(e4b, block); + block = block >> order; + } + + ex->fe_len = 1 << order; + ex->fe_start = block << order; + ex->fe_group = e4b->bd_group; + + /* calc difference from given start */ + next = next - ex->fe_start; + ex->fe_len -= next; + ex->fe_start += next; + + while (needed > ex->fe_len && + (buddy = mb_find_buddy(e4b, order, &max))) { + + if (block + 1 >= max) + break; + + next = (block + 1) * (1 << order); + if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) + break; + + ord = mb_find_order_for_block(e4b, next); + + order = ord; + block = next >> order; + ex->fe_len += 1 << order; + } + + BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))); + return ex->fe_len; +} + +static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) +{ + int ord; + int mlen = 0; + int max = 0; + int cur; + int start = ex->fe_start; + int len = ex->fe_len; + unsigned ret = 0; + int len0 = len; + void *buddy; + + BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); + BUG_ON(e4b->bd_group != ex->fe_group); + BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + mb_check_buddy(e4b); + mb_mark_used_double(e4b, start, len); + + e4b->bd_info->bb_free -= len; + if (e4b->bd_info->bb_first_free == start) + e4b->bd_info->bb_first_free += len; + + /* let's maintain fragments counter */ + if (start != 0) + mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); + if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) + max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); + if (mlen && max) + e4b->bd_info->bb_fragments++; + else if (!mlen && !max) + e4b->bd_info->bb_fragments--; + + /* let's maintain buddy itself */ + while (len) { + ord = mb_find_order_for_block(e4b, start); + + if (((start >> ord) << ord) == start && len >= (1 << ord)) { + /* the whole chunk may be allocated at once! */ + mlen = 1 << ord; + buddy = mb_find_buddy(e4b, ord, &max); + BUG_ON((start >> ord) >= max); + mb_set_bit(start >> ord, buddy); + e4b->bd_info->bb_counters[ord]--; + start += mlen; + len -= mlen; + BUG_ON(len < 0); + continue; + } + + /* store for history */ + if (ret == 0) + ret = len | (ord << 16); + + /* we have to split large buddy */ + BUG_ON(ord <= 0); + buddy = mb_find_buddy(e4b, ord, &max); + mb_set_bit(start >> ord, buddy); + e4b->bd_info->bb_counters[ord]--; + + ord--; + cur = (start >> ord) & ~1U; + buddy = mb_find_buddy(e4b, ord, &max); + mb_clear_bit(cur, buddy); + mb_clear_bit(cur + 1, buddy); + e4b->bd_info->bb_counters[ord]++; + e4b->bd_info->bb_counters[ord]++; + } + + mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group), + EXT4_MB_BITMAP(e4b), ex->fe_start, len0); + mb_check_buddy(e4b); + + return ret; +} + +/* + * Must be called under group lock! + */ +static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int ret; + + BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); + BUG_ON(ac->ac_status == AC_STATUS_FOUND); + + ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); + ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; + ret = mb_mark_used(e4b, &ac->ac_b_ex); + + /* preallocation can change ac_b_ex, thus we store actually + * allocated blocks for history */ + ac->ac_f_ex = ac->ac_b_ex; + + ac->ac_status = AC_STATUS_FOUND; + ac->ac_tail = ret & 0xffff; + ac->ac_buddy = ret >> 16; + + /* + * take the page reference. We want the page to be pinned + * so that we don't get a ext4_mb_init_cache_call for this + * group until we update the bitmap. That would mean we + * double allocate blocks. The reference is dropped + * in ext4_mb_release_context + */ + ac->ac_bitmap_page = e4b->bd_bitmap_page; + get_page(ac->ac_bitmap_page); + ac->ac_buddy_page = e4b->bd_buddy_page; + get_page(ac->ac_buddy_page); + /* on allocation we use ac to track the held semaphore */ + ac->alloc_semp = e4b->alloc_semp; + e4b->alloc_semp = NULL; + /* store last allocated for subsequent stream allocation */ + if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { + spin_lock(&sbi->s_md_lock); + sbi->s_mb_last_group = ac->ac_f_ex.fe_group; + sbi->s_mb_last_start = ac->ac_f_ex.fe_start; + spin_unlock(&sbi->s_md_lock); + } +} + +/* + * regular allocator, for general purposes allocation + */ + +static void ext4_mb_check_limits(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b, + int finish_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_free_extent *bex = &ac->ac_b_ex; + struct ext4_free_extent *gex = &ac->ac_g_ex; + struct ext4_free_extent ex; + int max; + + if (ac->ac_status == AC_STATUS_FOUND) + return; + /* + * We don't want to scan for a whole year + */ + if (ac->ac_found > sbi->s_mb_max_to_scan && + !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { + ac->ac_status = AC_STATUS_BREAK; + return; + } + + /* + * Haven't found good chunk so far, let's continue + */ + if (bex->fe_len < gex->fe_len) + return; + + if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan) + && bex->fe_group == e4b->bd_group) { + /* recheck chunk's availability - we don't know + * when it was found (within this lock-unlock + * period or not) */ + max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex); + if (max >= gex->fe_len) { + ext4_mb_use_best_found(ac, e4b); + return; + } + } +} + +/* + * The routine checks whether found extent is good enough. If it is, + * then the extent gets marked used and flag is set to the context + * to stop scanning. Otherwise, the extent is compared with the + * previous found extent and if new one is better, then it's stored + * in the context. Later, the best found extent will be used, if + * mballoc can't find good enough extent. + * + * FIXME: real allocation policy is to be designed yet! + */ +static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, + struct ext4_free_extent *ex, + struct ext4_buddy *e4b) +{ + struct ext4_free_extent *bex = &ac->ac_b_ex; + struct ext4_free_extent *gex = &ac->ac_g_ex; + + BUG_ON(ex->fe_len <= 0); + BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); + + ac->ac_found++; + + /* + * The special case - take what you catch first + */ + if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { + *bex = *ex; + ext4_mb_use_best_found(ac, e4b); + return; + } + + /* + * Let's check whether the chuck is good enough + */ + if (ex->fe_len == gex->fe_len) { + *bex = *ex; + ext4_mb_use_best_found(ac, e4b); + return; + } + + /* + * If this is first found extent, just store it in the context + */ + if (bex->fe_len == 0) { + *bex = *ex; + return; + } + + /* + * If new found extent is better, store it in the context + */ + if (bex->fe_len < gex->fe_len) { + /* if the request isn't satisfied, any found extent + * larger than previous best one is better */ + if (ex->fe_len > bex->fe_len) + *bex = *ex; + } else if (ex->fe_len > gex->fe_len) { + /* if the request is satisfied, then we try to find + * an extent that still satisfy the request, but is + * smaller than previous one */ + if (ex->fe_len < bex->fe_len) + *bex = *ex; + } + + ext4_mb_check_limits(ac, e4b, 0); +} + +static int ext4_mb_try_best_found(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct ext4_free_extent ex = ac->ac_b_ex; + ext4_group_t group = ex.fe_group; + int max; + int err; + + BUG_ON(ex.fe_len <= 0); + err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); + if (err) + return err; + + ext4_lock_group(ac->ac_sb, group); + max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); + + if (max > 0) { + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } + + ext4_unlock_group(ac->ac_sb, group); + ext4_mb_release_desc(e4b); + + return 0; +} + +static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + ext4_group_t group = ac->ac_g_ex.fe_group; + int max; + int err; + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_super_block *es = sbi->s_es; + struct ext4_free_extent ex; + + if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) + return 0; + + err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); + if (err) + return err; + + ext4_lock_group(ac->ac_sb, group); + max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start, + ac->ac_g_ex.fe_len, &ex); + + if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { + ext4_fsblk_t start; + + start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + + ex.fe_start + le32_to_cpu(es->s_first_data_block); + /* use do_div to get remainder (would be 64-bit modulo) */ + if (do_div(start, sbi->s_stripe) == 0) { + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } + } else if (max >= ac->ac_g_ex.fe_len) { + BUG_ON(ex.fe_len <= 0); + BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); + BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { + /* Sometimes, caller may want to merge even small + * number of blocks to an existing extent */ + BUG_ON(ex.fe_len <= 0); + BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); + BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } + ext4_unlock_group(ac->ac_sb, group); + ext4_mb_release_desc(e4b); + + return 0; +} + +/* + * The routine scans buddy structures (not bitmap!) from given order + * to max order and tries to find big enough chunk to satisfy the req + */ +static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_group_info *grp = e4b->bd_info; + void *buddy; + int i; + int k; + int max; + + BUG_ON(ac->ac_2order <= 0); + for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { + if (grp->bb_counters[i] == 0) + continue; + + buddy = mb_find_buddy(e4b, i, &max); + BUG_ON(buddy == NULL); + + k = mb_find_next_zero_bit(buddy, max, 0); + BUG_ON(k >= max); + + ac->ac_found++; + + ac->ac_b_ex.fe_len = 1 << i; + ac->ac_b_ex.fe_start = k << i; + ac->ac_b_ex.fe_group = e4b->bd_group; + + ext4_mb_use_best_found(ac, e4b); + + BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len); + + if (EXT4_SB(sb)->s_mb_stats) + atomic_inc(&EXT4_SB(sb)->s_bal_2orders); + + break; + } +} + +/* + * The routine scans the group and measures all found extents. + * In order to optimize scanning, caller must pass number of + * free blocks in the group, so the routine can know upper limit. + */ +static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct super_block *sb = ac->ac_sb; + void *bitmap = EXT4_MB_BITMAP(e4b); + struct ext4_free_extent ex; + int i; + int free; + + free = e4b->bd_info->bb_free; + BUG_ON(free <= 0); + + i = e4b->bd_info->bb_first_free; + + while (free && ac->ac_status == AC_STATUS_CONTINUE) { + i = mb_find_next_zero_bit(bitmap, + EXT4_BLOCKS_PER_GROUP(sb), i); + if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { + /* + * IF we have corrupt bitmap, we won't find any + * free blocks even though group info says we + * we have free blocks + */ + ext4_error(sb, __func__, "%d free blocks as per " + "group info. But bitmap says 0\n", + free); + break; + } + + mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); + BUG_ON(ex.fe_len <= 0); + if (free < ex.fe_len) { + ext4_error(sb, __func__, "%d free blocks as per " + "group info. But got %d blocks\n", + free, ex.fe_len); + /* + * The number of free blocks differs. This mostly + * indicate that the bitmap is corrupt. So exit + * without claiming the space. + */ + break; + } + + ext4_mb_measure_extent(ac, &ex, e4b); + + i += ex.fe_len; + free -= ex.fe_len; + } + + ext4_mb_check_limits(ac, e4b, 1); +} + +/* + * This is a special case for storages like raid5 + * we try to find stripe-aligned chunks for stripe-size requests + * XXX should do so at least for multiples of stripe size as well + */ +static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + void *bitmap = EXT4_MB_BITMAP(e4b); + struct ext4_free_extent ex; + ext4_fsblk_t first_group_block; + ext4_fsblk_t a; + ext4_grpblk_t i; + int max; + + BUG_ON(sbi->s_stripe == 0); + + /* find first stripe-aligned block in group */ + first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(sbi->s_es->s_first_data_block); + a = first_group_block + sbi->s_stripe - 1; + do_div(a, sbi->s_stripe); + i = (a * sbi->s_stripe) - first_group_block; + + while (i < EXT4_BLOCKS_PER_GROUP(sb)) { + if (!mb_test_bit(i, bitmap)) { + max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); + if (max >= sbi->s_stripe) { + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + break; + } + } + i += sbi->s_stripe; + } +} + +static int ext4_mb_good_group(struct ext4_allocation_context *ac, + ext4_group_t group, int cr) +{ + unsigned free, fragments; + unsigned i, bits; + struct ext4_group_desc *desc; + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); + + BUG_ON(cr < 0 || cr >= 4); + BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); + + free = grp->bb_free; + fragments = grp->bb_fragments; + if (free == 0) + return 0; + if (fragments == 0) + return 0; + + switch (cr) { + case 0: + BUG_ON(ac->ac_2order == 0); + /* If this group is uninitialized, skip it initially */ + desc = ext4_get_group_desc(ac->ac_sb, group, NULL); + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) + return 0; + + bits = ac->ac_sb->s_blocksize_bits + 1; + for (i = ac->ac_2order; i <= bits; i++) + if (grp->bb_counters[i] > 0) + return 1; + break; + case 1: + if ((free / fragments) >= ac->ac_g_ex.fe_len) + return 1; + break; + case 2: + if (free >= ac->ac_g_ex.fe_len) + return 1; + break; + case 3: + return 1; + default: + BUG(); + } + + return 0; +} + +/* + * lock the group_info alloc_sem of all the groups + * belonging to the same buddy cache page. This + * make sure other parallel operation on the buddy + * cache doesn't happen whild holding the buddy cache + * lock + */ +int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) +{ + int i; + int block, pnum; + int blocks_per_page; + int groups_per_page; + ext4_group_t first_group; + struct ext4_group_info *grp; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + first_group = pnum * blocks_per_page / 2; + + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; + /* read all groups the page covers into the cache */ + for (i = 0; i < groups_per_page; i++) { + + if ((first_group + i) >= EXT4_SB(sb)->s_groups_count) + break; + grp = ext4_get_group_info(sb, first_group + i); + /* take all groups write allocation + * semaphore. This make sure there is + * no block allocation going on in any + * of that groups + */ + down_write(&grp->alloc_sem); + } + return i; +} + +void ext4_mb_put_buddy_cache_lock(struct super_block *sb, + ext4_group_t group, int locked_group) +{ + int i; + int block, pnum; + int blocks_per_page; + ext4_group_t first_group; + struct ext4_group_info *grp; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + first_group = pnum * blocks_per_page / 2; + /* release locks on all the groups */ + for (i = 0; i < locked_group; i++) { + + grp = ext4_get_group_info(sb, first_group + i); + /* take all groups write allocation + * semaphore. This make sure there is + * no block allocation going on in any + * of that groups + */ + up_write(&grp->alloc_sem); + } + +} + +static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) +{ + + int ret; + void *bitmap; + int blocks_per_page; + int block, pnum, poff; + int num_grp_locked = 0; + struct ext4_group_info *this_grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + struct page *page = NULL, *bitmap_page = NULL; + + mb_debug("init group %lu\n", group); + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + this_grp = ext4_get_group_info(sb, group); + /* + * This ensures we don't add group + * to this buddy cache via resize + */ + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); + if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { + /* + * somebody initialized the group + * return without doing anything + */ + ret = 0; + goto err; + } + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } + unlock_page(page); + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + mark_page_accessed(page); + bitmap_page = page; + bitmap = page_address(page) + (poff * sb->s_blocksize); + + /* init buddy cache */ + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page == bitmap_page) { + /* + * If both the bitmap and buddy are in + * the same page we don't need to force + * init the buddy + */ + unlock_page(page); + } else if (page) { + BUG_ON(page->mapping != inode->i_mapping); + ret = ext4_mb_init_cache(page, bitmap); + if (ret) { + unlock_page(page); + goto err; + } + unlock_page(page); + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + mark_page_accessed(page); +err: + ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); + if (bitmap_page) + page_cache_release(bitmap_page); + if (page) + page_cache_release(page); + return ret; +} + +static noinline_for_stack int +ext4_mb_regular_allocator(struct ext4_allocation_context *ac) +{ + ext4_group_t group; + ext4_group_t i; + int cr; + int err = 0; + int bsbits; + struct ext4_sb_info *sbi; + struct super_block *sb; + struct ext4_buddy e4b; + loff_t size, isize; + + sb = ac->ac_sb; + sbi = EXT4_SB(sb); + BUG_ON(ac->ac_status == AC_STATUS_FOUND); + + /* first, try the goal */ + err = ext4_mb_find_by_goal(ac, &e4b); + if (err || ac->ac_status == AC_STATUS_FOUND) + goto out; + + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + goto out; + + /* + * ac->ac2_order is set only if the fe_len is a power of 2 + * if ac2_order is set we also set criteria to 0 so that we + * try exact allocation using buddy. + */ + i = fls(ac->ac_g_ex.fe_len); + ac->ac_2order = 0; + /* + * We search using buddy data only if the order of the request + * is greater than equal to the sbi_s_mb_order2_reqs + * You can tune it via /proc/fs/ext4/<partition>/order2_req + */ + if (i >= sbi->s_mb_order2_reqs) { + /* + * This should tell if fe_len is exactly power of 2 + */ + if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) + ac->ac_2order = i - 1; + } + + bsbits = ac->ac_sb->s_blocksize_bits; + /* if stream allocation is enabled, use global goal */ + size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + isize = i_size_read(ac->ac_inode) >> bsbits; + if (size < isize) + size = isize; + + if (size < sbi->s_mb_stream_request && + (ac->ac_flags & EXT4_MB_HINT_DATA)) { + /* TBD: may be hot point */ + spin_lock(&sbi->s_md_lock); + ac->ac_g_ex.fe_group = sbi->s_mb_last_group; + ac->ac_g_ex.fe_start = sbi->s_mb_last_start; + spin_unlock(&sbi->s_md_lock); + } + /* Let's just scan groups to find more-less suitable blocks */ + cr = ac->ac_2order ? 0 : 1; + /* + * cr == 0 try to get exact allocation, + * cr == 3 try to get anything + */ +repeat: + for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { + ac->ac_criteria = cr; + /* + * searching for the right group start + * from the goal value specified + */ + group = ac->ac_g_ex.fe_group; + + for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { + struct ext4_group_info *grp; + struct ext4_group_desc *desc; + + if (group == EXT4_SB(sb)->s_groups_count) + group = 0; + + /* quick check to skip empty groups */ + grp = ext4_get_group_info(sb, group); + if (grp->bb_free == 0) + continue; + + /* + * if the group is already init we check whether it is + * a good group and if not we don't load the buddy + */ + if (EXT4_MB_GRP_NEED_INIT(grp)) { + /* + * we need full data about the group + * to make a good selection + */ + err = ext4_mb_init_group(sb, group); + if (err) + goto out; + } + + /* + * If the particular group doesn't satisfy our + * criteria we continue with the next group + */ + if (!ext4_mb_good_group(ac, group, cr)) + continue; + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) + goto out; + + ext4_lock_group(sb, group); + if (!ext4_mb_good_group(ac, group, cr)) { + /* someone did allocation from this group */ + ext4_unlock_group(sb, group); + ext4_mb_release_desc(&e4b); + continue; + } + + ac->ac_groups_scanned++; + desc = ext4_get_group_desc(sb, group, NULL); + if (cr == 0 || (desc->bg_flags & + cpu_to_le16(EXT4_BG_BLOCK_UNINIT) && + ac->ac_2order != 0)) + ext4_mb_simple_scan_group(ac, &e4b); + else if (cr == 1 && + ac->ac_g_ex.fe_len == sbi->s_stripe) + ext4_mb_scan_aligned(ac, &e4b); + else + ext4_mb_complex_scan_group(ac, &e4b); + + ext4_unlock_group(sb, group); + ext4_mb_release_desc(&e4b); + + if (ac->ac_status != AC_STATUS_CONTINUE) + break; + } + } + + if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && + !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { + /* + * We've been searching too long. Let's try to allocate + * the best chunk we've found so far + */ + + ext4_mb_try_best_found(ac, &e4b); + if (ac->ac_status != AC_STATUS_FOUND) { + /* + * Someone more lucky has already allocated it. + * The only thing we can do is just take first + * found block(s) + printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n"); + */ + ac->ac_b_ex.fe_group = 0; + ac->ac_b_ex.fe_start = 0; + ac->ac_b_ex.fe_len = 0; + ac->ac_status = AC_STATUS_CONTINUE; + ac->ac_flags |= EXT4_MB_HINT_FIRST; + cr = 3; + atomic_inc(&sbi->s_mb_lost_chunks); + goto repeat; + } + } +out: + return err; +} + +#ifdef EXT4_MB_HISTORY +struct ext4_mb_proc_session { + struct ext4_mb_history *history; + struct super_block *sb; + int start; + int max; +}; + +static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s, + struct ext4_mb_history *hs, + int first) +{ + if (hs == s->history + s->max) + hs = s->history; + if (!first && hs == s->history + s->start) + return NULL; + while (hs->orig.fe_len == 0) { + hs++; + if (hs == s->history + s->max) + hs = s->history; + if (hs == s->history + s->start) + return NULL; + } + return hs; +} + +static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos) +{ + struct ext4_mb_proc_session *s = seq->private; + struct ext4_mb_history *hs; + int l = *pos; + + if (l == 0) + return SEQ_START_TOKEN; + hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1); + if (!hs) + return NULL; + while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL); + return hs; +} + +static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v, + loff_t *pos) +{ + struct ext4_mb_proc_session *s = seq->private; + struct ext4_mb_history *hs = v; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ext4_mb_history_skip_empty(s, s->history + s->start, 1); + else + return ext4_mb_history_skip_empty(s, ++hs, 0); +} + +static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) +{ + char buf[25], buf2[25], buf3[25], *fmt; + struct ext4_mb_history *hs = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s " + "%-5s %-2s %-5s %-5s %-5s %-6s\n", + "pid", "inode", "original", "goal", "result", "found", + "grps", "cr", "flags", "merge", "tail", "broken"); + return 0; + } + + if (hs->op == EXT4_MB_HISTORY_ALLOC) { + fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " + "%-5u %-5s %-5u %-6u\n"; + sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, + hs->result.fe_start, hs->result.fe_len, + hs->result.fe_logical); + sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, + hs->orig.fe_start, hs->orig.fe_len, + hs->orig.fe_logical); + sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group, + hs->goal.fe_start, hs->goal.fe_len, + hs->goal.fe_logical); + seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, + hs->found, hs->groups, hs->cr, hs->flags, + hs->merged ? "M" : "", hs->tail, + hs->buddy ? 1 << hs->buddy : 0); + } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { + fmt = "%-5u %-8u %-23s %-23s %-23s\n"; + sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, + hs->result.fe_start, hs->result.fe_len, + hs->result.fe_logical); + sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, + hs->orig.fe_start, hs->orig.fe_len, + hs->orig.fe_logical); + seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); + } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { + sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, + hs->result.fe_start, hs->result.fe_len); + seq_printf(seq, "%-5u %-8u %-23s discard\n", + hs->pid, hs->ino, buf2); + } else if (hs->op == EXT4_MB_HISTORY_FREE) { + sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, + hs->result.fe_start, hs->result.fe_len); + seq_printf(seq, "%-5u %-8u %-23s free\n", + hs->pid, hs->ino, buf2); + } + return 0; +} + +static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v) +{ +} + +static struct seq_operations ext4_mb_seq_history_ops = { + .start = ext4_mb_seq_history_start, + .next = ext4_mb_seq_history_next, + .stop = ext4_mb_seq_history_stop, + .show = ext4_mb_seq_history_show, +}; + +static int ext4_mb_seq_history_open(struct inode *inode, struct file *file) +{ + struct super_block *sb = PDE(inode)->data; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_mb_proc_session *s; + int rc; + int size; + + if (unlikely(sbi->s_mb_history == NULL)) + return -ENOMEM; + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return -ENOMEM; + s->sb = sb; + size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max; + s->history = kmalloc(size, GFP_KERNEL); + if (s->history == NULL) { + kfree(s); + return -ENOMEM; + } + + spin_lock(&sbi->s_mb_history_lock); + memcpy(s->history, sbi->s_mb_history, size); + s->max = sbi->s_mb_history_max; + s->start = sbi->s_mb_history_cur % s->max; + spin_unlock(&sbi->s_mb_history_lock); + + rc = seq_open(file, &ext4_mb_seq_history_ops); + if (rc == 0) { + struct seq_file *m = (struct seq_file *)file->private_data; + m->private = s; + } else { + kfree(s->history); + kfree(s); + } + return rc; + +} + +static int ext4_mb_seq_history_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = (struct seq_file *)file->private_data; + struct ext4_mb_proc_session *s = seq->private; + kfree(s->history); + kfree(s); + return seq_release(inode, file); +} + +static ssize_t ext4_mb_seq_history_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct seq_file *seq = (struct seq_file *)file->private_data; + struct ext4_mb_proc_session *s = seq->private; + struct super_block *sb = s->sb; + char str[32]; + int value; + + if (count >= sizeof(str)) { + printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n", + "mb_history", (int)sizeof(str)); + return -EOVERFLOW; + } + + if (copy_from_user(str, buffer, count)) + return -EFAULT; + + value = simple_strtol(str, NULL, 0); + if (value < 0) + return -ERANGE; + EXT4_SB(sb)->s_mb_history_filter = value; + + return count; +} + +static struct file_operations ext4_mb_seq_history_fops = { + .owner = THIS_MODULE, + .open = ext4_mb_seq_history_open, + .read = seq_read, + .write = ext4_mb_seq_history_write, + .llseek = seq_lseek, + .release = ext4_mb_seq_history_release, +}; + +static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) +{ + struct super_block *sb = seq->private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t group; + + if (*pos < 0 || *pos >= sbi->s_groups_count) + return NULL; + + group = *pos + 1; + return (void *) group; +} + +static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct super_block *sb = seq->private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t group; + + ++*pos; + if (*pos < 0 || *pos >= sbi->s_groups_count) + return NULL; + group = *pos + 1; + return (void *) group;; +} + +static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) +{ + struct super_block *sb = seq->private; + long group = (long) v; + int i; + int err; + struct ext4_buddy e4b; + struct sg { + struct ext4_group_info info; + unsigned short counters[16]; + } sg; + + group--; + if (group == 0) + seq_printf(seq, "#%-5s: %-5s %-5s %-5s " + "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " + "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", + "group", "free", "frags", "first", + "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", + "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); + + i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + + sizeof(struct ext4_group_info); + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + seq_printf(seq, "#%-5lu: I/O error\n", group); + return 0; + } + ext4_lock_group(sb, group); + memcpy(&sg, ext4_get_group_info(sb, group), i); + ext4_unlock_group(sb, group); + ext4_mb_release_desc(&e4b); + + seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, + sg.info.bb_fragments, sg.info.bb_first_free); + for (i = 0; i <= 13; i++) + seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? + sg.info.bb_counters[i] : 0); + seq_printf(seq, " ]\n"); + + return 0; +} + +static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) +{ +} + +static struct seq_operations ext4_mb_seq_groups_ops = { + .start = ext4_mb_seq_groups_start, + .next = ext4_mb_seq_groups_next, + .stop = ext4_mb_seq_groups_stop, + .show = ext4_mb_seq_groups_show, +}; + +static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) +{ + struct super_block *sb = PDE(inode)->data; + int rc; + + rc = seq_open(file, &ext4_mb_seq_groups_ops); + if (rc == 0) { + struct seq_file *m = (struct seq_file *)file->private_data; + m->private = sb; + } + return rc; + +} + +static struct file_operations ext4_mb_seq_groups_fops = { + .owner = THIS_MODULE, + .open = ext4_mb_seq_groups_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void ext4_mb_history_release(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sbi->s_proc != NULL) { + remove_proc_entry("mb_groups", sbi->s_proc); + remove_proc_entry("mb_history", sbi->s_proc); + } + kfree(sbi->s_mb_history); +} + +static void ext4_mb_history_init(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int i; + + if (sbi->s_proc != NULL) { + proc_create_data("mb_history", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_history_fops, sb); + proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_fops, sb); + } + + sbi->s_mb_history_max = 1000; + sbi->s_mb_history_cur = 0; + spin_lock_init(&sbi->s_mb_history_lock); + i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); + sbi->s_mb_history = kzalloc(i, GFP_KERNEL); + /* if we can't allocate history, then we simple won't use it */ +} + +static noinline_for_stack void +ext4_mb_store_history(struct ext4_allocation_context *ac) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_mb_history h; + + if (unlikely(sbi->s_mb_history == NULL)) + return; + + if (!(ac->ac_op & sbi->s_mb_history_filter)) + return; + + h.op = ac->ac_op; + h.pid = current->pid; + h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0; + h.orig = ac->ac_o_ex; + h.result = ac->ac_b_ex; + h.flags = ac->ac_flags; + h.found = ac->ac_found; + h.groups = ac->ac_groups_scanned; + h.cr = ac->ac_criteria; + h.tail = ac->ac_tail; + h.buddy = ac->ac_buddy; + h.merged = 0; + if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) { + if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && + ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) + h.merged = 1; + h.goal = ac->ac_g_ex; + h.result = ac->ac_f_ex; + } + + spin_lock(&sbi->s_mb_history_lock); + memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); + if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) + sbi->s_mb_history_cur = 0; + spin_unlock(&sbi->s_mb_history_lock); +} + +#else +#define ext4_mb_history_release(sb) +#define ext4_mb_history_init(sb) +#endif + + +/* Create and initialize ext4_group_info data for the given group. */ +int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *desc) +{ + int i, len; + int metalen = 0; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info **meta_group_info; + + /* + * First check if this group is the first of a reserved block. + * If it's true, we have to allocate a new table of pointers + * to ext4_group_info structures + */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { + metalen = sizeof(*meta_group_info) << + EXT4_DESC_PER_BLOCK_BITS(sb); + meta_group_info = kmalloc(metalen, GFP_KERNEL); + if (meta_group_info == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate mem for a " + "buddy group\n"); + goto exit_meta_group_info; + } + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = + meta_group_info; + } + + /* + * calculate needed size. if change bb_counters size, + * don't forget about ext4_mb_generate_buddy() + */ + len = offsetof(typeof(**meta_group_info), + bb_counters[sb->s_blocksize_bits + 2]); + + meta_group_info = + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; + i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); + + meta_group_info[i] = kzalloc(len, GFP_KERNEL); + if (meta_group_info[i] == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); + goto exit_group_info; + } + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, + &(meta_group_info[i]->bb_state)); + + /* + * initialize bb_free to be able to skip + * empty groups without initialization + */ + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + meta_group_info[i]->bb_free = + ext4_free_blocks_after_init(sb, group, desc); + } else { + meta_group_info[i]->bb_free = + le16_to_cpu(desc->bg_free_blocks_count); + } + + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + init_rwsem(&meta_group_info[i]->alloc_sem); + meta_group_info[i]->bb_free_root.rb_node = NULL;; + +#ifdef DOUBLE_CHECK + { + struct buffer_head *bh; + meta_group_info[i]->bb_bitmap = + kmalloc(sb->s_blocksize, GFP_KERNEL); + BUG_ON(meta_group_info[i]->bb_bitmap == NULL); + bh = ext4_read_block_bitmap(sb, group); + BUG_ON(bh == NULL); + memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, + sb->s_blocksize); + put_bh(bh); + } +#endif + + return 0; + +exit_group_info: + /* If a meta_group_info table has been allocated, release it now */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) + kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); +exit_meta_group_info: + return -ENOMEM; +} /* ext4_mb_add_groupinfo */ + +/* + * Update an existing group. + * This function is used for online resize + */ +void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add) +{ + grp->bb_free += add; +} + +static int ext4_mb_init_backend(struct super_block *sb) +{ + ext4_group_t i; + int metalen; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int num_meta_group_infos; + int num_meta_group_infos_max; + int array_size; + struct ext4_group_info **meta_group_info; + struct ext4_group_desc *desc; + + /* This is the number of blocks used by GDT */ + num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - + 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); + + /* + * This is the total number of blocks used by GDT including + * the number of reserved blocks for GDT. + * The s_group_info array is allocated with this value + * to allow a clean online resize without a complex + * manipulation of pointer. + * The drawback is the unused memory when no resize + * occurs but it's very low in terms of pages + * (see comments below) + * Need to handle this properly when META_BG resizing is allowed + */ + num_meta_group_infos_max = num_meta_group_infos + + le16_to_cpu(es->s_reserved_gdt_blocks); + + /* + * array_size is the size of s_group_info array. We round it + * to the next power of two because this approximation is done + * internally by kmalloc so we can have some more memory + * for free here (e.g. may be used for META_BG resize). + */ + array_size = 1; + while (array_size < sizeof(*sbi->s_group_info) * + num_meta_group_infos_max) + array_size = array_size << 1; + /* An 8TB filesystem with 64-bit pointers requires a 4096 byte + * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. + * So a two level scheme suffices for now. */ + sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); + if (sbi->s_group_info == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); + return -ENOMEM; + } + sbi->s_buddy_cache = new_inode(sb); + if (sbi->s_buddy_cache == NULL) { + printk(KERN_ERR "EXT4-fs: can't get new inode\n"); + goto err_freesgi; + } + EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; + + metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); + for (i = 0; i < num_meta_group_infos; i++) { + if ((i + 1) == num_meta_group_infos) + metalen = sizeof(*meta_group_info) * + (sbi->s_groups_count - + (i << EXT4_DESC_PER_BLOCK_BITS(sb))); + meta_group_info = kmalloc(metalen, GFP_KERNEL); + if (meta_group_info == NULL) { + printk(KERN_ERR "EXT4-fs: can't allocate mem for a " + "buddy group\n"); + goto err_freemeta; + } + sbi->s_group_info[i] = meta_group_info; + } + + for (i = 0; i < sbi->s_groups_count; i++) { + desc = ext4_get_group_desc(sb, i, NULL); + if (desc == NULL) { + printk(KERN_ERR + "EXT4-fs: can't read descriptor %lu\n", i); + goto err_freebuddy; + } + if (ext4_mb_add_groupinfo(sb, i, desc) != 0) + goto err_freebuddy; + } + + return 0; + +err_freebuddy: + while (i-- > 0) + kfree(ext4_get_group_info(sb, i)); + i = num_meta_group_infos; +err_freemeta: + while (i-- > 0) + kfree(sbi->s_group_info[i]); + iput(sbi->s_buddy_cache); +err_freesgi: + kfree(sbi->s_group_info); + return -ENOMEM; +} + +int ext4_mb_init(struct super_block *sb, int needs_recovery) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned i, j; + unsigned offset; + unsigned max; + int ret; + + i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); + + sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); + if (sbi->s_mb_offsets == NULL) { + return -ENOMEM; + } + + i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); + sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); + if (sbi->s_mb_maxs == NULL) { + kfree(sbi->s_mb_maxs); + return -ENOMEM; + } + + /* order 0 is regular bitmap */ + sbi->s_mb_maxs[0] = sb->s_blocksize << 3; + sbi->s_mb_offsets[0] = 0; + + i = 1; + offset = 0; + max = sb->s_blocksize << 2; + do { + sbi->s_mb_offsets[i] = offset; + sbi->s_mb_maxs[i] = max; + offset += 1 << (sb->s_blocksize_bits - i); + max = max >> 1; + i++; + } while (i <= sb->s_blocksize_bits + 1); + + /* init file for buddy data */ + ret = ext4_mb_init_backend(sb); + if (ret != 0) { + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + return ret; + } + + spin_lock_init(&sbi->s_md_lock); + spin_lock_init(&sbi->s_bal_lock); + + sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; + sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; + sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; + + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); + if (sbi->s_locality_groups == NULL) { + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + return -ENOMEM; + } + for_each_possible_cpu(i) { + struct ext4_locality_group *lg; + lg = per_cpu_ptr(sbi->s_locality_groups, i); + mutex_init(&lg->lg_mutex); + for (j = 0; j < PREALLOC_TB_SIZE; j++) + INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); + spin_lock_init(&lg->lg_prealloc_lock); + } + + ext4_mb_init_per_dev_proc(sb); + ext4_mb_history_init(sb); + + sbi->s_journal->j_commit_callback = release_blocks_on_commit; + + printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); + return 0; +} + +/* need to called with ext4 group lock (ext4_lock_group) */ +static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) +{ + struct ext4_prealloc_space *pa; + struct list_head *cur, *tmp; + int count = 0; + + list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + list_del(&pa->pa_group_list); + count++; + kmem_cache_free(ext4_pspace_cachep, pa); + } + if (count) + mb_debug("mballoc: %u PAs left\n", count); + +} + +int ext4_mb_release(struct super_block *sb) +{ + ext4_group_t i; + int num_meta_group_infos; + struct ext4_group_info *grinfo; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sbi->s_group_info) { + for (i = 0; i < sbi->s_groups_count; i++) { + grinfo = ext4_get_group_info(sb, i); +#ifdef DOUBLE_CHECK + kfree(grinfo->bb_bitmap); +#endif + ext4_lock_group(sb, i); + ext4_mb_cleanup_pa(grinfo); + ext4_unlock_group(sb, i); + kfree(grinfo); + } + num_meta_group_infos = (sbi->s_groups_count + + EXT4_DESC_PER_BLOCK(sb) - 1) >> + EXT4_DESC_PER_BLOCK_BITS(sb); + for (i = 0; i < num_meta_group_infos; i++) + kfree(sbi->s_group_info[i]); + kfree(sbi->s_group_info); + } + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + if (sbi->s_buddy_cache) + iput(sbi->s_buddy_cache); + if (sbi->s_mb_stats) { + printk(KERN_INFO + "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", + atomic_read(&sbi->s_bal_allocated), + atomic_read(&sbi->s_bal_reqs), + atomic_read(&sbi->s_bal_success)); + printk(KERN_INFO + "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " + "%u 2^N hits, %u breaks, %u lost\n", + atomic_read(&sbi->s_bal_ex_scanned), + atomic_read(&sbi->s_bal_goals), + atomic_read(&sbi->s_bal_2orders), + atomic_read(&sbi->s_bal_breaks), + atomic_read(&sbi->s_mb_lost_chunks)); + printk(KERN_INFO + "EXT4-fs: mballoc: %lu generated and it took %Lu\n", + sbi->s_mb_buddies_generated++, + sbi->s_mb_generation_time); + printk(KERN_INFO + "EXT4-fs: mballoc: %u preallocated, %u discarded\n", + atomic_read(&sbi->s_mb_preallocated), + atomic_read(&sbi->s_mb_discarded)); + } + + free_percpu(sbi->s_locality_groups); + ext4_mb_history_release(sb); + ext4_mb_destroy_per_dev_proc(sb); + + return 0; +} + +/* + * This function is called by the jbd2 layer once the commit has finished, + * so we know we can free the blocks that were released with that commit. + */ +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) +{ + struct super_block *sb = journal->j_private; + struct ext4_buddy e4b; + struct ext4_group_info *db; + int err, count = 0, count2 = 0; + struct ext4_free_data *entry; + ext4_fsblk_t discard_block; + struct list_head *l, *ltmp; + + list_for_each_safe(l, ltmp, &txn->t_private_list) { + entry = list_entry(l, struct ext4_free_data, list); + + mb_debug("gonna free %u blocks in group %lu (0x%p):", + entry->count, entry->group, entry); + + err = ext4_mb_load_buddy(sb, entry->group, &e4b); + /* we expect to find existing buddy because it's pinned */ + BUG_ON(err != 0); + + db = e4b.bd_info; + /* there are blocks to put in buddy to make them really free */ + count += entry->count; + count2++; + ext4_lock_group(sb, entry->group); + /* Take it out of per group rb tree */ + rb_erase(&entry->node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); + + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + page_cache_release(e4b.bd_buddy_page); + page_cache_release(e4b.bd_bitmap_page); + } + ext4_unlock_group(sb, entry->group); + discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) + + entry->start_blk + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id, + (unsigned long long) discard_block, entry->count); + sb_issue_discard(sb, discard_block, entry->count); + + kmem_cache_free(ext4_free_ext_cachep, entry); + ext4_mb_release_desc(&e4b); + } + + mb_debug("freed %u blocks in %u structures\n", count, count2); +} + +#define EXT4_MB_STATS_NAME "stats" +#define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan" +#define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan" +#define EXT4_MB_ORDER2_REQ "order2_req" +#define EXT4_MB_STREAM_REQ "stream_req" +#define EXT4_MB_GROUP_PREALLOC "group_prealloc" + +static int ext4_mb_init_per_dev_proc(struct super_block *sb) +{ +#ifdef CONFIG_PROC_FS + mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct proc_dir_entry *proc; + + if (sbi->s_proc == NULL) + return -EINVAL; + + EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats); + EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan); + EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan); + EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs); + EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request); + EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc); + return 0; + +err_out: + remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc); + remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); + return -ENOMEM; +#else + return 0; +#endif +} + +static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) +{ +#ifdef CONFIG_PROC_FS + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sbi->s_proc == NULL) + return -EINVAL; + + remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc); + remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); +#endif + return 0; +} + +int __init init_ext4_mballoc(void) +{ + ext4_pspace_cachep = + kmem_cache_create("ext4_prealloc_space", + sizeof(struct ext4_prealloc_space), + 0, SLAB_RECLAIM_ACCOUNT, NULL); + if (ext4_pspace_cachep == NULL) + return -ENOMEM; + + ext4_ac_cachep = + kmem_cache_create("ext4_alloc_context", + sizeof(struct ext4_allocation_context), + 0, SLAB_RECLAIM_ACCOUNT, NULL); + if (ext4_ac_cachep == NULL) { + kmem_cache_destroy(ext4_pspace_cachep); + return -ENOMEM; + } + + ext4_free_ext_cachep = + kmem_cache_create("ext4_free_block_extents", + sizeof(struct ext4_free_data), + 0, SLAB_RECLAIM_ACCOUNT, NULL); + if (ext4_free_ext_cachep == NULL) { + kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + return -ENOMEM; + } + return 0; +} + +void exit_ext4_mballoc(void) +{ + /* XXX: synchronize_rcu(); */ + kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + kmem_cache_destroy(ext4_free_ext_cachep); +} + + +/* + * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps + * Returns 0 if success or error code + */ +static noinline_for_stack int +ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, + handle_t *handle, unsigned long reserv_blks) +{ + struct buffer_head *bitmap_bh = NULL; + struct ext4_super_block *es; + struct ext4_group_desc *gdp; + struct buffer_head *gdp_bh; + struct ext4_sb_info *sbi; + struct super_block *sb; + ext4_fsblk_t block; + int err, len; + + BUG_ON(ac->ac_status != AC_STATUS_FOUND); + BUG_ON(ac->ac_b_ex.fe_len <= 0); + + sb = ac->ac_sb; + sbi = EXT4_SB(sb); + es = sbi->s_es; + + + err = -EIO; + bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); + if (!bitmap_bh) + goto out_err; + + err = ext4_journal_get_write_access(handle, bitmap_bh); + if (err) + goto out_err; + + err = -EIO; + gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); + if (!gdp) + goto out_err; + + ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group, + gdp->bg_free_blocks_count); + + err = ext4_journal_get_write_access(handle, gdp_bh); + if (err) + goto out_err; + + block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb) + + ac->ac_b_ex.fe_start + + le32_to_cpu(es->s_first_data_block); + + len = ac->ac_b_ex.fe_len; + if (in_range(ext4_block_bitmap(sb, gdp), block, len) || + in_range(ext4_inode_bitmap(sb, gdp), block, len) || + in_range(block, ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group) || + in_range(block + len - 1, ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group)) { + ext4_error(sb, __func__, + "Allocating block %llu in system zone of %d group\n", + block, ac->ac_b_ex.fe_group); + /* File system mounted not to panic on error + * Fix the bitmap and repeat the block allocation + * We leak some of the blocks here. + */ + mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), + bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); + err = ext4_journal_dirty_metadata(handle, bitmap_bh); + if (!err) + err = -EAGAIN; + goto out_err; + } +#ifdef AGGRESSIVE_CHECK + { + int i; + for (i = 0; i < ac->ac_b_ex.fe_len; i++) { + BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, + bitmap_bh->b_data)); + } + } +#endif + spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); + mb_set_bits(NULL, bitmap_bh->b_data, + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + gdp->bg_free_blocks_count = + cpu_to_le16(ext4_free_blocks_after_init(sb, + ac->ac_b_ex.fe_group, + gdp)); + } + le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); + gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); + spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); + percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); + /* + * Now reduce the dirty block count also. Should not go negative + */ + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); + else + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + ac->ac_b_ex.fe_len); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, + ac->ac_b_ex.fe_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + + err = ext4_journal_dirty_metadata(handle, bitmap_bh); + if (err) + goto out_err; + err = ext4_journal_dirty_metadata(handle, gdp_bh); + +out_err: + sb->s_dirt = 1; + brelse(bitmap_bh); + return err; +} + +/* + * here we normalize request for locality group + * Group request are normalized to s_strip size if we set the same via mount + * option. If not we set it to s_mb_group_prealloc which can be configured via + * /proc/fs/ext4/<partition>/group_prealloc + * + * XXX: should we try to preallocate more than the group has now? + */ +static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg = ac->ac_lg; + + BUG_ON(lg == NULL); + if (EXT4_SB(sb)->s_stripe) + ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; + else + ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; + mb_debug("#%u: goal %u blocks for locality group\n", + current->pid, ac->ac_g_ex.fe_len); +} + +/* + * Normalization means making request better in terms of + * size and alignment + */ +static noinline_for_stack void +ext4_mb_normalize_request(struct ext4_allocation_context *ac, + struct ext4_allocation_request *ar) +{ + int bsbits, max; + ext4_lblk_t end; + loff_t size, orig_size, start_off; + ext4_lblk_t start, orig_start; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_prealloc_space *pa; + + /* do normalize only data requests, metadata requests + do not need preallocation */ + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return; + + /* sometime caller may want exact blocks */ + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + + /* caller may indicate that preallocation isn't + * required (it's a tail, for example) */ + if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) + return; + + if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { + ext4_mb_normalize_group_request(ac); + return ; + } + + bsbits = ac->ac_sb->s_blocksize_bits; + + /* first, let's learn actual file size + * given current request is allocated */ + size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + size = size << bsbits; + if (size < i_size_read(ac->ac_inode)) + size = i_size_read(ac->ac_inode); + + /* max size of free chunks */ + max = 2 << bsbits; + +#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ + (req <= (size) || max <= (chunk_size)) + + /* first, try to predict filesize */ + /* XXX: should this table be tunable? */ + start_off = 0; + if (size <= 16 * 1024) { + size = 16 * 1024; + } else if (size <= 32 * 1024) { + size = 32 * 1024; + } else if (size <= 64 * 1024) { + size = 64 * 1024; + } else if (size <= 128 * 1024) { + size = 128 * 1024; + } else if (size <= 256 * 1024) { + size = 256 * 1024; + } else if (size <= 512 * 1024) { + size = 512 * 1024; + } else if (size <= 1024 * 1024) { + size = 1024 * 1024; + } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { + start_off = ((loff_t)ac->ac_o_ex.fe_logical >> + (21 - bsbits)) << 21; + size = 2 * 1024 * 1024; + } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { + start_off = ((loff_t)ac->ac_o_ex.fe_logical >> + (22 - bsbits)) << 22; + size = 4 * 1024 * 1024; + } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, + (8<<20)>>bsbits, max, 8 * 1024)) { + start_off = ((loff_t)ac->ac_o_ex.fe_logical >> + (23 - bsbits)) << 23; + size = 8 * 1024 * 1024; + } else { + start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; + size = ac->ac_o_ex.fe_len << bsbits; + } + orig_size = size = size >> bsbits; + orig_start = start = start_off >> bsbits; + + /* don't cover already allocated blocks in selected range */ + if (ar->pleft && start <= ar->lleft) { + size -= ar->lleft + 1 - start; + start = ar->lleft + 1; + } + if (ar->pright && start + size - 1 >= ar->lright) + size -= start + size - ar->lright; + + end = start + size; + + /* check we don't cross already preallocated blocks */ + rcu_read_lock(); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + unsigned long pa_end; + + if (pa->pa_deleted) + continue; + spin_lock(&pa->pa_lock); + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + + pa_end = pa->pa_lstart + pa->pa_len; + + /* PA must not overlap original request */ + BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || + ac->ac_o_ex.fe_logical < pa->pa_lstart)); + + /* skip PA normalized request doesn't overlap with */ + if (pa->pa_lstart >= end) { + spin_unlock(&pa->pa_lock); + continue; + } + if (pa_end <= start) { + spin_unlock(&pa->pa_lock); + continue; + } + BUG_ON(pa->pa_lstart <= start && pa_end >= end); + + if (pa_end <= ac->ac_o_ex.fe_logical) { + BUG_ON(pa_end < start); + start = pa_end; + } + + if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { + BUG_ON(pa->pa_lstart > end); + end = pa->pa_lstart; + } + spin_unlock(&pa->pa_lock); + } + rcu_read_unlock(); + size = end - start; + + /* XXX: extra loop to check we really don't overlap preallocations */ + rcu_read_lock(); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + unsigned long pa_end; + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 0) { + pa_end = pa->pa_lstart + pa->pa_len; + BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); + } + spin_unlock(&pa->pa_lock); + } + rcu_read_unlock(); + + if (start + size <= ac->ac_o_ex.fe_logical && + start > ac->ac_o_ex.fe_logical) { + printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", + (unsigned long) start, (unsigned long) size, + (unsigned long) ac->ac_o_ex.fe_logical); + } + BUG_ON(start + size <= ac->ac_o_ex.fe_logical && + start > ac->ac_o_ex.fe_logical); + BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + + /* now prepare goal request */ + + /* XXX: is it better to align blocks WRT to logical + * placement or satisfy big request as is */ + ac->ac_g_ex.fe_logical = start; + ac->ac_g_ex.fe_len = size; + + /* define goal start in order to merge */ + if (ar->pright && (ar->lright == (start + size))) { + /* merge to the right */ + ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, + &ac->ac_f_ex.fe_group, + &ac->ac_f_ex.fe_start); + ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; + } + if (ar->pleft && (ar->lleft + 1 == start)) { + /* merge to the left */ + ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, + &ac->ac_f_ex.fe_group, + &ac->ac_f_ex.fe_start); + ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; + } + + mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size, + (unsigned) orig_size, (unsigned) start); +} + +static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + + if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { + atomic_inc(&sbi->s_bal_reqs); + atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); + if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len) + atomic_inc(&sbi->s_bal_success); + atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); + if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && + ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) + atomic_inc(&sbi->s_bal_goals); + if (ac->ac_found > sbi->s_mb_max_to_scan) + atomic_inc(&sbi->s_bal_breaks); + } + + ext4_mb_store_history(ac); +} + +/* + * use blocks preallocated to inode + */ +static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa) +{ + ext4_fsblk_t start; + ext4_fsblk_t end; + int len; + + /* found preallocated blocks, use them */ + start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); + end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len); + len = end - start; + ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, + &ac->ac_b_ex.fe_start); + ac->ac_b_ex.fe_len = len; + ac->ac_status = AC_STATUS_FOUND; + ac->ac_pa = pa; + + BUG_ON(start < pa->pa_pstart); + BUG_ON(start + len > pa->pa_pstart + pa->pa_len); + BUG_ON(pa->pa_free < len); + pa->pa_free -= len; + + mb_debug("use %llu/%u from inode pa %p\n", start, len, pa); +} + +/* + * use blocks preallocated to locality group + */ +static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa) +{ + unsigned int len = ac->ac_o_ex.fe_len; + + ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, + &ac->ac_b_ex.fe_group, + &ac->ac_b_ex.fe_start); + ac->ac_b_ex.fe_len = len; + ac->ac_status = AC_STATUS_FOUND; + ac->ac_pa = pa; + + /* we don't correct pa_pstart or pa_plen here to avoid + * possible race when the group is being loaded concurrently + * instead we correct pa later, after blocks are marked + * in on-disk bitmap -- see ext4_mb_release_context() + * Other CPUs are prevented from allocating from this pa by lg_mutex + */ + mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); +} + +/* + * Return the prealloc space that have minimal distance + * from the goal block. @cpa is the prealloc + * space that is having currently known minimal distance + * from the goal block. + */ +static struct ext4_prealloc_space * +ext4_mb_check_group_pa(ext4_fsblk_t goal_block, + struct ext4_prealloc_space *pa, + struct ext4_prealloc_space *cpa) +{ + ext4_fsblk_t cur_distance, new_distance; + + if (cpa == NULL) { + atomic_inc(&pa->pa_count); + return pa; + } + cur_distance = abs(goal_block - cpa->pa_pstart); + new_distance = abs(goal_block - pa->pa_pstart); + + if (cur_distance < new_distance) + return cpa; + + /* drop the previous reference */ + atomic_dec(&cpa->pa_count); + atomic_inc(&pa->pa_count); + return pa; +} + +/* + * search goal blocks in preallocated space + */ +static noinline_for_stack int +ext4_mb_use_preallocated(struct ext4_allocation_context *ac) +{ + int order, i; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_locality_group *lg; + struct ext4_prealloc_space *pa, *cpa = NULL; + ext4_fsblk_t goal_block; + + /* only data can be preallocated */ + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return 0; + + /* first, try per-file preallocation */ + rcu_read_lock(); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + + /* all fields in this condition don't change, + * so we can skip locking for them */ + if (ac->ac_o_ex.fe_logical < pa->pa_lstart || + ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) + continue; + + /* found preallocated blocks, use them */ + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 0 && pa->pa_free) { + atomic_inc(&pa->pa_count); + ext4_mb_use_inode_pa(ac, pa); + spin_unlock(&pa->pa_lock); + ac->ac_criteria = 10; + rcu_read_unlock(); + return 1; + } + spin_unlock(&pa->pa_lock); + } + rcu_read_unlock(); + + /* can we use group allocation? */ + if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) + return 0; + + /* inode may have no locality group for some reason */ + lg = ac->ac_lg; + if (lg == NULL) + return 0; + order = fls(ac->ac_o_ex.fe_len) - 1; + if (order > PREALLOC_TB_SIZE - 1) + /* The max size of hash table is PREALLOC_TB_SIZE */ + order = PREALLOC_TB_SIZE - 1; + + goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + + ac->ac_g_ex.fe_start + + le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block); + /* + * search for the prealloc space that is having + * minimal distance from the goal block. + */ + for (i = order; i < PREALLOC_TB_SIZE; i++) { + rcu_read_lock(); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], + pa_inode_list) { + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 0 && + pa->pa_free >= ac->ac_o_ex.fe_len) { + + cpa = ext4_mb_check_group_pa(goal_block, + pa, cpa); + } + spin_unlock(&pa->pa_lock); + } + rcu_read_unlock(); + } + if (cpa) { + ext4_mb_use_group_pa(ac, cpa); + ac->ac_criteria = 20; + return 1; + } + return 0; +} + +/* + * the function goes through all block freed in the group + * but not yet committed and marks them used in in-core bitmap. + * buddy must be generated from this bitmap + * Need to be called with ext4 group lock (ext4_lock_group) + */ +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group) +{ + struct rb_node *n; + struct ext4_group_info *grp; + struct ext4_free_data *entry; + + grp = ext4_get_group_info(sb, group); + n = rb_first(&(grp->bb_free_root)); + + while (n) { + entry = rb_entry(n, struct ext4_free_data, node); + mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), + bitmap, entry->start_blk, + entry->count); + n = rb_next(n); + } + return; +} + +/* + * the function goes through all preallocation in this group and marks them + * used in in-core bitmap. buddy must be generated from this bitmap + * Need to be called with ext4 group lock (ext4_lock_group) + */ +static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_prealloc_space *pa; + struct list_head *cur; + ext4_group_t groupnr; + ext4_grpblk_t start; + int preallocated = 0; + int count = 0; + int len; + + /* all form of preallocation discards first load group, + * so the only competing code is preallocation use. + * we don't need any locking here + * notice we do NOT ignore preallocations with pa_deleted + * otherwise we could leave used blocks available for + * allocation in buddy when concurrent ext4_mb_put_pa() + * is dropping preallocation + */ + list_for_each(cur, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + spin_lock(&pa->pa_lock); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, + &groupnr, &start); + len = pa->pa_len; + spin_unlock(&pa->pa_lock); + if (unlikely(len == 0)) + continue; + BUG_ON(groupnr != group); + mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), + bitmap, start, len); + preallocated += len; + count++; + } + mb_debug("prellocated %u for group %lu\n", preallocated, group); +} + +static void ext4_mb_pa_callback(struct rcu_head *head) +{ + struct ext4_prealloc_space *pa; + pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); + kmem_cache_free(ext4_pspace_cachep, pa); +} + +/* + * drops a reference to preallocated space descriptor + * if this was the last reference and the space is consumed + */ +static void ext4_mb_put_pa(struct ext4_allocation_context *ac, + struct super_block *sb, struct ext4_prealloc_space *pa) +{ + unsigned long grp; + + if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) + return; + + /* in this short window concurrent discard can set pa_deleted */ + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 1) { + spin_unlock(&pa->pa_lock); + return; + } + + pa->pa_deleted = 1; + spin_unlock(&pa->pa_lock); + + /* -1 is to protect from crossing allocation group */ + ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL); + + /* + * possible race: + * + * P1 (buddy init) P2 (regular allocation) + * find block B in PA + * copy on-disk bitmap to buddy + * mark B in on-disk bitmap + * drop PA from group + * mark all PAs in buddy + * + * thus, P1 initializes buddy with B available. to prevent this + * we make "copy" and "mark all PAs" atomic and serialize "drop PA" + * against that pair + */ + ext4_lock_group(sb, grp); + list_del(&pa->pa_group_list); + ext4_unlock_group(sb, grp); + + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); +} + +/* + * creates new preallocated space for given inode + */ +static noinline_for_stack int +ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_prealloc_space *pa; + struct ext4_group_info *grp; + struct ext4_inode_info *ei; + + /* preallocate only when found space is larger then requested */ + BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); + BUG_ON(ac->ac_status != AC_STATUS_FOUND); + BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); + + pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); + if (pa == NULL) + return -ENOMEM; + + if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { + int winl; + int wins; + int win; + int offs; + + /* we can't allocate as much as normalizer wants. + * so, found space must get proper lstart + * to cover original request */ + BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); + BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); + + /* we're limited by original request in that + * logical block must be covered any way + * winl is window we can move our chunk within */ + winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; + + /* also, we should cover whole original request */ + wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len; + + /* the smallest one defines real window */ + win = min(winl, wins); + + offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len; + if (offs && offs < win) + win = offs; + + ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win; + BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); + BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); + } + + /* preallocation can change ac_b_ex, thus we store actually + * allocated blocks for history */ + ac->ac_f_ex = ac->ac_b_ex; + + pa->pa_lstart = ac->ac_b_ex.fe_logical; + pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + pa->pa_len = ac->ac_b_ex.fe_len; + pa->pa_free = pa->pa_len; + atomic_set(&pa->pa_count, 1); + spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); + pa->pa_deleted = 0; + pa->pa_linear = 0; + + mb_debug("new inode pa %p: %llu/%u for %u\n", pa, + pa->pa_pstart, pa->pa_len, pa->pa_lstart); + + ext4_mb_use_inode_pa(ac, pa); + atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); + + ei = EXT4_I(ac->ac_inode); + grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); + + pa->pa_obj_lock = &ei->i_prealloc_lock; + pa->pa_inode = ac->ac_inode; + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + + spin_lock(pa->pa_obj_lock); + list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); + spin_unlock(pa->pa_obj_lock); + + return 0; +} + +/* + * creates new preallocated space for locality group inodes belongs to + */ +static noinline_for_stack int +ext4_mb_new_group_pa(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg; + struct ext4_prealloc_space *pa; + struct ext4_group_info *grp; + + /* preallocate only when found space is larger then requested */ + BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); + BUG_ON(ac->ac_status != AC_STATUS_FOUND); + BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); + + BUG_ON(ext4_pspace_cachep == NULL); + pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); + if (pa == NULL) + return -ENOMEM; + + /* preallocation can change ac_b_ex, thus we store actually + * allocated blocks for history */ + ac->ac_f_ex = ac->ac_b_ex; + + pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + pa->pa_lstart = pa->pa_pstart; + pa->pa_len = ac->ac_b_ex.fe_len; + pa->pa_free = pa->pa_len; + atomic_set(&pa->pa_count, 1); + spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); + pa->pa_deleted = 0; + pa->pa_linear = 1; + + mb_debug("new group pa %p: %llu/%u for %u\n", pa, + pa->pa_pstart, pa->pa_len, pa->pa_lstart); + + ext4_mb_use_group_pa(ac, pa); + atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); + + grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); + lg = ac->ac_lg; + BUG_ON(lg == NULL); + + pa->pa_obj_lock = &lg->lg_prealloc_lock; + pa->pa_inode = NULL; + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + + /* + * We will later add the new pa to the right bucket + * after updating the pa_free in ext4_mb_release_context + */ + return 0; +} + +static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) +{ + int err; + + if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) + err = ext4_mb_new_group_pa(ac); + else + err = ext4_mb_new_inode_pa(ac); + return err; +} + +/* + * finds all unused blocks in on-disk bitmap, frees them in + * in-core bitmap and buddy. + * @pa must be unlinked from inode and group lists, so that + * nobody else can find/use it. + * the caller MUST hold group/inode locks. + * TODO: optimize the case when there are no in-core structures yet + */ +static noinline_for_stack int +ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, + struct ext4_prealloc_space *pa, + struct ext4_allocation_context *ac) +{ + struct super_block *sb = e4b->bd_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned long end; + unsigned long next; + ext4_group_t group; + ext4_grpblk_t bit; + sector_t start; + int err = 0; + int free = 0; + + BUG_ON(pa->pa_deleted == 0); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + BUG_ON(group != e4b->bd_group && pa->pa_len != 0); + end = bit + pa->pa_len; + + if (ac) { + ac->ac_sb = sb; + ac->ac_inode = pa->pa_inode; + ac->ac_op = EXT4_MB_HISTORY_DISCARD; + } + + while (bit < end) { + bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); + if (bit >= end) + break; + next = mb_find_next_bit(bitmap_bh->b_data, end, bit); + start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + + le32_to_cpu(sbi->s_es->s_first_data_block); + mb_debug(" free preallocated %u/%u in group %u\n", + (unsigned) start, (unsigned) next - bit, + (unsigned) group); + free += next - bit; + + if (ac) { + ac->ac_b_ex.fe_group = group; + ac->ac_b_ex.fe_start = bit; + ac->ac_b_ex.fe_len = next - bit; + ac->ac_b_ex.fe_logical = 0; + ext4_mb_store_history(ac); + } + + mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); + bit = next + 1; + } + if (free != pa->pa_free) { + printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", + pa, (unsigned long) pa->pa_lstart, + (unsigned long) pa->pa_pstart, + (unsigned long) pa->pa_len); + ext4_error(sb, __func__, "free %u, pa_free %u\n", + free, pa->pa_free); + /* + * pa is already deleted so we use the value obtained + * from the bitmap and continue. + */ + } + atomic_add(free, &sbi->s_mb_discarded); + + return err; +} + +static noinline_for_stack int +ext4_mb_release_group_pa(struct ext4_buddy *e4b, + struct ext4_prealloc_space *pa, + struct ext4_allocation_context *ac) +{ + struct super_block *sb = e4b->bd_sb; + ext4_group_t group; + ext4_grpblk_t bit; + + if (ac) + ac->ac_op = EXT4_MB_HISTORY_DISCARD; + + BUG_ON(pa->pa_deleted == 0); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + BUG_ON(group != e4b->bd_group && pa->pa_len != 0); + mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); + atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); + + if (ac) { + ac->ac_sb = sb; + ac->ac_inode = NULL; + ac->ac_b_ex.fe_group = group; + ac->ac_b_ex.fe_start = bit; + ac->ac_b_ex.fe_len = pa->pa_len; + ac->ac_b_ex.fe_logical = 0; + ext4_mb_store_history(ac); + } + + return 0; +} + +/* + * releases all preallocations in given group + * + * first, we need to decide discard policy: + * - when do we discard + * 1) ENOSPC + * - how many do we discard + * 1) how many requested + */ +static noinline_for_stack int +ext4_mb_discard_group_preallocations(struct super_block *sb, + ext4_group_t group, int needed) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct buffer_head *bitmap_bh = NULL; + struct ext4_prealloc_space *pa, *tmp; + struct ext4_allocation_context *ac; + struct list_head list; + struct ext4_buddy e4b; + int err; + int busy = 0; + int free = 0; + + mb_debug("discard preallocation for group %lu\n", group); + + if (list_empty(&grp->bb_prealloc_list)) + return 0; + + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (bitmap_bh == NULL) { + ext4_error(sb, __func__, "Error in reading block " + "bitmap for %lu\n", group); + return 0; + } + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + ext4_error(sb, __func__, "Error in loading buddy " + "information for %lu\n", group); + put_bh(bitmap_bh); + return 0; + } + + if (needed == 0) + needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; + + INIT_LIST_HEAD(&list); + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); +repeat: + ext4_lock_group(sb, group); + list_for_each_entry_safe(pa, tmp, + &grp->bb_prealloc_list, pa_group_list) { + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + spin_unlock(&pa->pa_lock); + busy = 1; + continue; + } + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + + /* seems this one can be freed ... */ + pa->pa_deleted = 1; + + /* we can trust pa_free ... */ + free += pa->pa_free; + + spin_unlock(&pa->pa_lock); + + list_del(&pa->pa_group_list); + list_add(&pa->u.pa_tmp_list, &list); + } + + /* if we still need more blocks and some PAs were used, try again */ + if (free < needed && busy) { + busy = 0; + ext4_unlock_group(sb, group); + /* + * Yield the CPU here so that we don't get soft lockup + * in non preempt case. + */ + yield(); + goto repeat; + } + + /* found anything to free? */ + if (list_empty(&list)) { + BUG_ON(free != 0); + goto out; + } + + /* now free all selected PAs */ + list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { + + /* remove from object (inode or locality group) */ + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + + if (pa->pa_linear) + ext4_mb_release_group_pa(&e4b, pa, ac); + else + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); + + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } + +out: + ext4_unlock_group(sb, group); + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); + ext4_mb_release_desc(&e4b); + put_bh(bitmap_bh); + return free; +} + +/* + * releases all non-used preallocated blocks for given inode + * + * It's important to discard preallocations under i_data_sem + * We don't want another block to be served from the prealloc + * space when we are discarding the inode prealloc space. + * + * FIXME!! Make sure it is valid at all the call sites + */ +void ext4_discard_preallocations(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct super_block *sb = inode->i_sb; + struct buffer_head *bitmap_bh = NULL; + struct ext4_prealloc_space *pa, *tmp; + struct ext4_allocation_context *ac; + ext4_group_t group = 0; + struct list_head list; + struct ext4_buddy e4b; + int err; + + if (!S_ISREG(inode->i_mode)) { + /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ + return; + } + + mb_debug("discard preallocation for inode %lu\n", inode->i_ino); + + INIT_LIST_HEAD(&list); + + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); +repeat: + /* first, collect all pa's in the inode */ + spin_lock(&ei->i_prealloc_lock); + while (!list_empty(&ei->i_prealloc_list)) { + pa = list_entry(ei->i_prealloc_list.next, + struct ext4_prealloc_space, pa_inode_list); + BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + /* this shouldn't happen often - nobody should + * use preallocation while we're discarding it */ + spin_unlock(&pa->pa_lock); + spin_unlock(&ei->i_prealloc_lock); + printk(KERN_ERR "uh-oh! used pa while discarding\n"); + WARN_ON(1); + schedule_timeout_uninterruptible(HZ); + goto repeat; + + } + if (pa->pa_deleted == 0) { + pa->pa_deleted = 1; + spin_unlock(&pa->pa_lock); + list_del_rcu(&pa->pa_inode_list); + list_add(&pa->u.pa_tmp_list, &list); + continue; + } + + /* someone is deleting pa right now */ + spin_unlock(&pa->pa_lock); + spin_unlock(&ei->i_prealloc_lock); + + /* we have to wait here because pa_deleted + * doesn't mean pa is already unlinked from + * the list. as we might be called from + * ->clear_inode() the inode will get freed + * and concurrent thread which is unlinking + * pa from inode's list may access already + * freed memory, bad-bad-bad */ + + /* XXX: if this happens too often, we can + * add a flag to force wait only in case + * of ->clear_inode(), but not in case of + * regular truncate */ + schedule_timeout_uninterruptible(HZ); + goto repeat; + } + spin_unlock(&ei->i_prealloc_lock); + + list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { + BUG_ON(pa->pa_linear != 0); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + ext4_error(sb, __func__, "Error in loading buddy " + "information for %lu\n", group); + continue; + } + + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (bitmap_bh == NULL) { + ext4_error(sb, __func__, "Error in reading block " + "bitmap for %lu\n", group); + ext4_mb_release_desc(&e4b); + continue; + } + + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); + ext4_unlock_group(sb, group); + + ext4_mb_release_desc(&e4b); + put_bh(bitmap_bh); + + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); +} + +/* + * finds all preallocated spaces and return blocks being freed to them + * if preallocated space becomes full (no block is used from the space) + * then the function frees space in buddy + * XXX: at the moment, truncate (which is the only way to free blocks) + * discards all preallocations + */ +static void ext4_mb_return_to_preallocation(struct inode *inode, + struct ext4_buddy *e4b, + sector_t block, int count) +{ + BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); +} +#ifdef MB_DEBUG +static void ext4_mb_show_ac(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + ext4_group_t i; + + printk(KERN_ERR "EXT4-fs: Can't allocate:" + " Allocation context details:\n"); + printk(KERN_ERR "EXT4-fs: status %d flags %d\n", + ac->ac_status, ac->ac_flags); + printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " + "best %lu/%lu/%lu@%lu cr %d\n", + (unsigned long)ac->ac_o_ex.fe_group, + (unsigned long)ac->ac_o_ex.fe_start, + (unsigned long)ac->ac_o_ex.fe_len, + (unsigned long)ac->ac_o_ex.fe_logical, + (unsigned long)ac->ac_g_ex.fe_group, + (unsigned long)ac->ac_g_ex.fe_start, + (unsigned long)ac->ac_g_ex.fe_len, + (unsigned long)ac->ac_g_ex.fe_logical, + (unsigned long)ac->ac_b_ex.fe_group, + (unsigned long)ac->ac_b_ex.fe_start, + (unsigned long)ac->ac_b_ex.fe_len, + (unsigned long)ac->ac_b_ex.fe_logical, + (int)ac->ac_criteria); + printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, + ac->ac_found); + printk(KERN_ERR "EXT4-fs: groups: \n"); + for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + struct ext4_group_info *grp = ext4_get_group_info(sb, i); + struct ext4_prealloc_space *pa; + ext4_grpblk_t start; + struct list_head *cur; + ext4_lock_group(sb, i); + list_for_each(cur, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, + pa_group_list); + spin_lock(&pa->pa_lock); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, + NULL, &start); + spin_unlock(&pa->pa_lock); + printk(KERN_ERR "PA:%lu:%d:%u \n", i, + start, pa->pa_len); + } + ext4_unlock_group(sb, i); + + if (grp->bb_free == 0) + continue; + printk(KERN_ERR "%lu: %d/%d \n", + i, grp->bb_free, grp->bb_fragments); + } + printk(KERN_ERR "\n"); +} +#else +static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) +{ + return; +} +#endif + +/* + * We use locality group preallocation for small size file. The size of the + * file is determined by the current size or the resulting size after + * allocation which ever is larger + * + * One can tune this size via /proc/fs/ext4/<partition>/stream_req + */ +static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int bsbits = ac->ac_sb->s_blocksize_bits; + loff_t size, isize; + + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return; + + size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + isize = i_size_read(ac->ac_inode) >> bsbits; + size = max(size, isize); + + /* don't use group allocation for large files */ + if (size >= sbi->s_mb_stream_request) + return; + + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + + BUG_ON(ac->ac_lg != NULL); + /* + * locality group prealloc space are per cpu. The reason for having + * per cpu locality group is to reduce the contention between block + * request from multiple CPUs. + */ + ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id()); + + /* we're going to use group allocation */ + ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; + + /* serialize all allocations in the group */ + mutex_lock(&ac->ac_lg->lg_mutex); +} + +static noinline_for_stack int +ext4_mb_initialize_context(struct ext4_allocation_context *ac, + struct ext4_allocation_request *ar) +{ + struct super_block *sb = ar->inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_group_t group; + unsigned long len; + unsigned long goal; + ext4_grpblk_t block; + + /* we can't allocate > group size */ + len = ar->len; + + /* just a dirty hack to filter too big requests */ + if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10) + len = EXT4_BLOCKS_PER_GROUP(sb) - 10; + + /* start searching from the goal */ + goal = ar->goal; + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= ext4_blocks_count(es)) + goal = le32_to_cpu(es->s_first_data_block); + ext4_get_group_no_and_offset(sb, goal, &group, &block); + + /* set up allocation goals */ + ac->ac_b_ex.fe_logical = ar->logical; + ac->ac_b_ex.fe_group = 0; + ac->ac_b_ex.fe_start = 0; + ac->ac_b_ex.fe_len = 0; + ac->ac_status = AC_STATUS_CONTINUE; + ac->ac_groups_scanned = 0; + ac->ac_ex_scanned = 0; + ac->ac_found = 0; + ac->ac_sb = sb; + ac->ac_inode = ar->inode; + ac->ac_o_ex.fe_logical = ar->logical; + ac->ac_o_ex.fe_group = group; + ac->ac_o_ex.fe_start = block; + ac->ac_o_ex.fe_len = len; + ac->ac_g_ex.fe_logical = ar->logical; + ac->ac_g_ex.fe_group = group; + ac->ac_g_ex.fe_start = block; + ac->ac_g_ex.fe_len = len; + ac->ac_f_ex.fe_len = 0; + ac->ac_flags = ar->flags; + ac->ac_2order = 0; + ac->ac_criteria = 0; + ac->ac_pa = NULL; + ac->ac_bitmap_page = NULL; + ac->ac_buddy_page = NULL; + ac->alloc_semp = NULL; + ac->ac_lg = NULL; + + /* we have to define context: we'll we work with a file or + * locality group. this is a policy, actually */ + ext4_mb_group_or_file(ac); + + mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " + "left: %u/%u, right %u/%u to %swritable\n", + (unsigned) ar->len, (unsigned) ar->logical, + (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, + (unsigned) ar->lleft, (unsigned) ar->pleft, + (unsigned) ar->lright, (unsigned) ar->pright, + atomic_read(&ar->inode->i_writecount) ? "" : "non-"); + return 0; + +} + +static noinline_for_stack void +ext4_mb_discard_lg_preallocations(struct super_block *sb, + struct ext4_locality_group *lg, + int order, int total_entries) +{ + ext4_group_t group = 0; + struct ext4_buddy e4b; + struct list_head discard_list; + struct ext4_prealloc_space *pa, *tmp; + struct ext4_allocation_context *ac; + + mb_debug("discard locality group preallocation\n"); + + INIT_LIST_HEAD(&discard_list); + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + + spin_lock(&lg->lg_prealloc_lock); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], + pa_inode_list) { + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + /* + * This is the pa that we just used + * for block allocation. So don't + * free that + */ + spin_unlock(&pa->pa_lock); + continue; + } + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + /* only lg prealloc space */ + BUG_ON(!pa->pa_linear); + + /* seems this one can be freed ... */ + pa->pa_deleted = 1; + spin_unlock(&pa->pa_lock); + + list_del_rcu(&pa->pa_inode_list); + list_add(&pa->u.pa_tmp_list, &discard_list); + + total_entries--; + if (total_entries <= 5) { + /* + * we want to keep only 5 entries + * allowing it to grow to 8. This + * mak sure we don't call discard + * soon for this list. + */ + break; + } + } + spin_unlock(&lg->lg_prealloc_lock); + + list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { + + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + if (ext4_mb_load_buddy(sb, group, &e4b)) { + ext4_error(sb, __func__, "Error in loading buddy " + "information for %lu\n", group); + continue; + } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_mb_release_group_pa(&e4b, pa, ac); + ext4_unlock_group(sb, group); + + ext4_mb_release_desc(&e4b); + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); +} + +/* + * We have incremented pa_count. So it cannot be freed at this + * point. Also we hold lg_mutex. So no parallel allocation is + * possible from this lg. That means pa_free cannot be updated. + * + * A parallel ext4_mb_discard_group_preallocations is possible. + * which can cause the lg_prealloc_list to be updated. + */ + +static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) +{ + int order, added = 0, lg_prealloc_count = 1; + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg = ac->ac_lg; + struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; + + order = fls(pa->pa_free) - 1; + if (order > PREALLOC_TB_SIZE - 1) + /* The max size of hash table is PREALLOC_TB_SIZE */ + order = PREALLOC_TB_SIZE - 1; + /* Add the prealloc space to lg */ + rcu_read_lock(); + list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], + pa_inode_list) { + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + if (!added && pa->pa_free < tmp_pa->pa_free) { + /* Add to the tail of the previous entry */ + list_add_tail_rcu(&pa->pa_inode_list, + &tmp_pa->pa_inode_list); + added = 1; + /* + * we want to count the total + * number of entries in the list + */ + } + spin_unlock(&tmp_pa->pa_lock); + lg_prealloc_count++; + } + if (!added) + list_add_tail_rcu(&pa->pa_inode_list, + &lg->lg_prealloc_list[order]); + rcu_read_unlock(); + + /* Now trim the list to be not more than 8 elements */ + if (lg_prealloc_count > 8) { + ext4_mb_discard_lg_preallocations(sb, lg, + order, lg_prealloc_count); + return; + } + return ; +} + +/* + * release all resource we used in allocation + */ +static int ext4_mb_release_context(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa = ac->ac_pa; + if (pa) { + if (pa->pa_linear) { + /* see comment in ext4_mb_use_group_pa() */ + spin_lock(&pa->pa_lock); + pa->pa_pstart += ac->ac_b_ex.fe_len; + pa->pa_lstart += ac->ac_b_ex.fe_len; + pa->pa_free -= ac->ac_b_ex.fe_len; + pa->pa_len -= ac->ac_b_ex.fe_len; + spin_unlock(&pa->pa_lock); + } + } + if (ac->alloc_semp) + up_read(ac->alloc_semp); + if (pa) { + /* + * We want to add the pa to the right bucket. + * Remove it from the list and while adding + * make sure the list to which we are adding + * doesn't grow big. We need to release + * alloc_semp before calling ext4_mb_add_n_trim() + */ + if (pa->pa_linear && likely(pa->pa_free)) { + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + ext4_mb_add_n_trim(ac); + } + ext4_mb_put_pa(ac, ac->ac_sb, pa); + } + if (ac->ac_bitmap_page) + page_cache_release(ac->ac_bitmap_page); + if (ac->ac_buddy_page) + page_cache_release(ac->ac_buddy_page); + if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) + mutex_unlock(&ac->ac_lg->lg_mutex); + ext4_mb_collect_stats(ac); + return 0; +} + +static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) +{ + ext4_group_t i; + int ret; + int freed = 0; + + for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { + ret = ext4_mb_discard_group_preallocations(sb, i, needed); + freed += ret; + needed -= ret; + } + + return freed; +} + +/* + * Main entry point into mballoc to allocate blocks + * it tries to use preallocation first, then falls back + * to usual allocation + */ +ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, + struct ext4_allocation_request *ar, int *errp) +{ + int freed; + struct ext4_allocation_context *ac = NULL; + struct ext4_sb_info *sbi; + struct super_block *sb; + ext4_fsblk_t block = 0; + unsigned long inquota; + unsigned long reserv_blks = 0; + + sb = ar->inode->i_sb; + sbi = EXT4_SB(sb); + + if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { + /* + * With delalloc we already reserved the blocks + */ + while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { + /* let others to free the space */ + yield(); + ar->len = ar->len >> 1; + } + if (!ar->len) { + *errp = -ENOSPC; + return 0; + } + reserv_blks = ar->len; + } + while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { + ar->flags |= EXT4_MB_HINT_NOPREALLOC; + ar->len--; + } + if (ar->len == 0) { + *errp = -EDQUOT; + goto out3; + } + inquota = ar->len; + + if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) + ar->flags |= EXT4_MB_DELALLOC_RESERVED; + + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + if (!ac) { + ar->len = 0; + *errp = -ENOMEM; + goto out1; + } + + *errp = ext4_mb_initialize_context(ac, ar); + if (*errp) { + ar->len = 0; + goto out2; + } + + ac->ac_op = EXT4_MB_HISTORY_PREALLOC; + if (!ext4_mb_use_preallocated(ac)) { + ac->ac_op = EXT4_MB_HISTORY_ALLOC; + ext4_mb_normalize_request(ac, ar); +repeat: + /* allocate space in core */ + ext4_mb_regular_allocator(ac); + + /* as we've just preallocated more space than + * user requested orinally, we store allocated + * space in a special descriptor */ + if (ac->ac_status == AC_STATUS_FOUND && + ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) + ext4_mb_new_preallocation(ac); + } + if (likely(ac->ac_status == AC_STATUS_FOUND)) { + *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); + if (*errp == -EAGAIN) { + /* + * drop the reference that we took + * in ext4_mb_use_best_found + */ + ext4_mb_release_context(ac); + ac->ac_b_ex.fe_group = 0; + ac->ac_b_ex.fe_start = 0; + ac->ac_b_ex.fe_len = 0; + ac->ac_status = AC_STATUS_CONTINUE; + goto repeat; + } else if (*errp) { + ac->ac_b_ex.fe_len = 0; + ar->len = 0; + ext4_mb_show_ac(ac); + } else { + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + ar->len = ac->ac_b_ex.fe_len; + } + } else { + freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); + if (freed) + goto repeat; + *errp = -ENOSPC; + ac->ac_b_ex.fe_len = 0; + ar->len = 0; + ext4_mb_show_ac(ac); + } + + ext4_mb_release_context(ac); + +out2: + kmem_cache_free(ext4_ac_cachep, ac); +out1: + if (ar->len < inquota) + DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); +out3: + if (!ar->len) { + if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + reserv_blks); + } + + return block; +} + +/* + * We can merge two free data extents only if the physical blocks + * are contiguous, AND the extents were freed by the same transaction, + * AND the blocks are associated with the same group. + */ +static int can_merge(struct ext4_free_data *entry1, + struct ext4_free_data *entry2) +{ + if ((entry1->t_tid == entry2->t_tid) && + (entry1->group == entry2->group) && + ((entry1->start_blk + entry1->count) == entry2->start_blk)) + return 1; + return 0; +} + +static noinline_for_stack int +ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, + struct ext4_free_data *new_entry) +{ + ext4_grpblk_t block; + struct ext4_free_data *entry; + struct ext4_group_info *db = e4b->bd_info; + struct super_block *sb = e4b->bd_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct rb_node **n = &db->bb_free_root.rb_node, *node; + struct rb_node *parent = NULL, *new_node; + + + BUG_ON(e4b->bd_bitmap_page == NULL); + BUG_ON(e4b->bd_buddy_page == NULL); + + new_node = &new_entry->node; + block = new_entry->start_blk; + + if (!*n) { + /* first free block exent. We need to + protect buddy cache from being freed, + * otherwise we'll refresh it from + * on-disk bitmap and lose not-yet-available + * blocks */ + page_cache_get(e4b->bd_buddy_page); + page_cache_get(e4b->bd_bitmap_page); + } + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_free_data, node); + if (block < entry->start_blk) + n = &(*n)->rb_left; + else if (block >= (entry->start_blk + entry->count)) + n = &(*n)->rb_right; + else { + ext4_error(sb, __func__, + "Double free of blocks %d (%d %d)\n", + block, entry->start_blk, entry->count); + return 0; + } + } + + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &db->bb_free_root); + + /* Now try to see the extent can be merged to left and right */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, node); + if (can_merge(entry, new_entry)) { + new_entry->start_blk = entry->start_blk; + new_entry->count += entry->count; + rb_erase(node, &(db->bb_free_root)); + spin_lock(&sbi->s_md_lock); + list_del(&entry->list); + spin_unlock(&sbi->s_md_lock); + kmem_cache_free(ext4_free_ext_cachep, entry); + } + } + + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, node); + if (can_merge(new_entry, entry)) { + new_entry->count += entry->count; + rb_erase(node, &(db->bb_free_root)); + spin_lock(&sbi->s_md_lock); + list_del(&entry->list); + spin_unlock(&sbi->s_md_lock); + kmem_cache_free(ext4_free_ext_cachep, entry); + } + } + /* Add the extent to transaction's private list */ + spin_lock(&sbi->s_md_lock); + list_add(&new_entry->list, &handle->h_transaction->t_private_list); + spin_unlock(&sbi->s_md_lock); + return 0; +} + +/* + * Main entry point into mballoc to free blocks + */ +void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, + unsigned long block, unsigned long count, + int metadata, unsigned long *freed) +{ + struct buffer_head *bitmap_bh = NULL; + struct super_block *sb = inode->i_sb; + struct ext4_allocation_context *ac = NULL; + struct ext4_group_desc *gdp; + struct ext4_super_block *es; + unsigned long overflow; + ext4_grpblk_t bit; + struct buffer_head *gd_bh; + ext4_group_t block_group; + struct ext4_sb_info *sbi; + struct ext4_buddy e4b; + int err = 0; + int ret; + + *freed = 0; + + sbi = EXT4_SB(sb); + es = EXT4_SB(sb)->s_es; + if (block < le32_to_cpu(es->s_first_data_block) || + block + count < block || + block + count > ext4_blocks_count(es)) { + ext4_error(sb, __func__, + "Freeing blocks not in datazone - " + "block = %lu, count = %lu", block, count); + goto error_return; + } + + ext4_debug("freeing block %lu\n", block); + + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + if (ac) { + ac->ac_op = EXT4_MB_HISTORY_FREE; + ac->ac_inode = inode; + ac->ac_sb = sb; + } + +do_more: + overflow = 0; + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { + overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); + count -= overflow; + } + bitmap_bh = ext4_read_block_bitmap(sb, block_group); + if (!bitmap_bh) { + err = -EIO; + goto error_return; + } + gdp = ext4_get_group_desc(sb, block_group, &gd_bh); + if (!gdp) { + err = -EIO; + goto error_return; + } + + if (in_range(ext4_block_bitmap(sb, gdp), block, count) || + in_range(ext4_inode_bitmap(sb, gdp), block, count) || + in_range(block, ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group) || + in_range(block + count - 1, ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group)) { + + ext4_error(sb, __func__, + "Freeing blocks in system zone - " + "Block = %lu, count = %lu", block, count); + /* err = 0. ext4_std_error should be a no op */ + goto error_return; + } + + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext4_journal_get_write_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; +#ifdef AGGRESSIVE_CHECK + { + int i; + for (i = 0; i < count; i++) + BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); + } +#endif + if (ac) { + ac->ac_b_ex.fe_group = block_group; + ac->ac_b_ex.fe_start = bit; + ac->ac_b_ex.fe_len = count; + ext4_mb_store_history(ac); + } + + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_return; + if (metadata) { + struct ext4_free_data *new_entry; + /* + * blocks being freed are metadata. these blocks shouldn't + * be used until this transaction is committed + */ + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + new_entry->start_blk = bit; + new_entry->group = block_group; + new_entry->count = count; + new_entry->t_tid = handle->h_transaction->t_tid; + ext4_lock_group(sb, block_group); + mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, + bit, count); + ext4_mb_free_metadata(handle, &e4b, new_entry); + ext4_unlock_group(sb, block_group); + } else { + ext4_lock_group(sb, block_group); + /* need to update group_info->bb_free and bitmap + * with group lock held. generate_buddy look at + * them with group lock_held + */ + mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, + bit, count); + mb_free_blocks(inode, &e4b, bit, count); + ext4_mb_return_to_preallocation(inode, &e4b, block, count); + ext4_unlock_group(sb, block_group); + } + + spin_lock(sb_bgl_lock(sbi, block_group)); + le16_add_cpu(&gdp->bg_free_blocks_count, count); + gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_add(&sbi->s_freeblocks_counter, count); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_blocks += count; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + + ext4_mb_release_desc(&e4b); + + *freed += count; + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_journal_dirty_metadata(handle, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext4_journal_dirty_metadata(handle, gd_bh); + if (!err) + err = ret; + + if (overflow && !err) { + block += count; + count = overflow; + put_bh(bitmap_bh); + goto do_more; + } + sb->s_dirt = 1; +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, err); + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); + return; +} diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h new file mode 100644 index 0000000..85eb45c --- /dev/null +++ b/fs/ext4/mballoc.h @@ -0,0 +1,308 @@ +/* + * fs/ext4/mballoc.h + * + * Written by: Alex Tomas <alex@clusterfs.com> + * + */ +#ifndef _EXT4_MBALLOC_H +#define _EXT4_MBALLOC_H + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/proc_fs.h> +#include <linux/pagemap.h> +#include <linux/seq_file.h> +#include <linux/version.h> +#include <linux/blkdev.h> +#include <linux/marker.h> +#include <linux/mutex.h> +#include "ext4_jbd2.h" +#include "ext4.h" +#include "group.h" + +/* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with DOUBLE_CHECK defined mballoc creates persistent in-core + * bitmaps, maintains and uses them to check for double allocations + */ +#define DOUBLE_CHECK__ + +/* + */ +#define MB_DEBUG__ +#ifdef MB_DEBUG +#define mb_debug(fmt, a...) printk(fmt, ##a) +#else +#define mb_debug(fmt, a...) +#endif + +/* + * with EXT4_MB_HISTORY mballoc stores last N allocations in memory + * and you can monitor it in /proc/fs/ext4/<dev>/mb_history + */ +#define EXT4_MB_HISTORY +#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ +#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ +#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */ +#define EXT4_MB_HISTORY_FREE 8 /* free */ + +#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \ + EXT4_MB_HISTORY_PREALLOC) + +/* + * How long mballoc can look for a best extent (in found extents) + */ +#define MB_DEFAULT_MAX_TO_SCAN 200 + +/* + * How long mballoc must look for a best extent + */ +#define MB_DEFAULT_MIN_TO_SCAN 10 + +/* + * How many groups mballoc will scan looking for the best chunk + */ +#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5 + +/* + * with 'ext4_mb_stats' allocator will collect stats that will be + * shown at umount. The collecting costs though! + */ +#define MB_DEFAULT_STATS 1 + +/* + * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served + * by the stream allocator, which purpose is to pack requests + * as close each to other as possible to produce smooth I/O traffic + * We use locality group prealloc space for stream request. + * We can tune the same via /proc/fs/ext4/<parition>/stream_req + */ +#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ + +/* + * for which requests use 2^N search using buddies + */ +#define MB_DEFAULT_ORDER2_REQS 2 + +/* + * default group prealloc size 512 blocks + */ +#define MB_DEFAULT_GROUP_PREALLOC 512 + + +struct ext4_free_data { + /* this links the free block information from group_info */ + struct rb_node node; + + /* this links the free block information from ext4_sb_info */ + struct list_head list; + + /* group which free block extent belongs */ + ext4_group_t group; + + /* free block extent */ + ext4_grpblk_t start_blk; + ext4_grpblk_t count; + + /* transaction which freed this extent */ + tid_t t_tid; +}; + +struct ext4_group_info { + unsigned long bb_state; + struct rb_root bb_free_root; + unsigned short bb_first_free; + unsigned short bb_free; + unsigned short bb_fragments; + struct list_head bb_prealloc_list; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif + struct rw_semaphore alloc_sem; + unsigned short bb_counters[]; +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_LOCKED_BIT 1 + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) + + +struct ext4_prealloc_space { + struct list_head pa_inode_list; + struct list_head pa_group_list; + union { + struct list_head pa_tmp_list; + struct rcu_head pa_rcu; + } u; + spinlock_t pa_lock; + atomic_t pa_count; + unsigned pa_deleted; + ext4_fsblk_t pa_pstart; /* phys. block */ + ext4_lblk_t pa_lstart; /* log. block */ + unsigned short pa_len; /* len of preallocated chunk */ + unsigned short pa_free; /* how many blocks are free */ + unsigned short pa_linear; /* consumed in one direction + * strictly, for grp prealloc */ + spinlock_t *pa_obj_lock; + struct inode *pa_inode; /* hack, for history only */ +}; + + +struct ext4_free_extent { + ext4_lblk_t fe_logical; + ext4_grpblk_t fe_start; + ext4_group_t fe_group; + int fe_len; +}; + +/* + * Locality group: + * we try to group all related changes together + * so that writeback can flush/allocate them together as well + * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC + * (512). We store prealloc space into the hash based on the pa_free blocks + * order value.ie, fls(pa_free)-1; + */ +#define PREALLOC_TB_SIZE 10 +struct ext4_locality_group { + /* for allocator */ + /* to serialize allocates */ + struct mutex lg_mutex; + /* list of preallocations */ + struct list_head lg_prealloc_list[PREALLOC_TB_SIZE]; + spinlock_t lg_prealloc_lock; +}; + +struct ext4_allocation_context { + struct inode *ac_inode; + struct super_block *ac_sb; + + /* original request */ + struct ext4_free_extent ac_o_ex; + + /* goal request (after normalization) */ + struct ext4_free_extent ac_g_ex; + + /* the best found extent */ + struct ext4_free_extent ac_b_ex; + + /* copy of the bext found extent taken before preallocation efforts */ + struct ext4_free_extent ac_f_ex; + + /* number of iterations done. we have to track to limit searching */ + unsigned long ac_ex_scanned; + __u16 ac_groups_scanned; + __u16 ac_found; + __u16 ac_tail; + __u16 ac_buddy; + __u16 ac_flags; /* allocation hints */ + __u8 ac_status; + __u8 ac_criteria; + __u8 ac_repeats; + __u8 ac_2order; /* if request is to allocate 2^N blocks and + * N > 0, the field stores N, otherwise 0 */ + __u8 ac_op; /* operation, for history only */ + struct page *ac_bitmap_page; + struct page *ac_buddy_page; + /* + * pointer to the held semaphore upon successful + * block allocation + */ + struct rw_semaphore *alloc_semp; + struct ext4_prealloc_space *ac_pa; + struct ext4_locality_group *ac_lg; +}; + +#define AC_STATUS_CONTINUE 1 +#define AC_STATUS_FOUND 2 +#define AC_STATUS_BREAK 3 + +struct ext4_mb_history { + struct ext4_free_extent orig; /* orig allocation */ + struct ext4_free_extent goal; /* goal allocation */ + struct ext4_free_extent result; /* result allocation */ + unsigned pid; + unsigned ino; + __u16 found; /* how many extents have been found */ + __u16 groups; /* how many groups have been scanned */ + __u16 tail; /* what tail broke some buddy */ + __u16 buddy; /* buddy the tail ^^^ broke */ + __u16 flags; + __u8 cr:3; /* which phase the result extent was found at */ + __u8 op:4; + __u8 merged:1; +}; + +struct ext4_buddy { + struct page *bd_buddy_page; + void *bd_buddy; + struct page *bd_bitmap_page; + void *bd_bitmap; + struct ext4_group_info *bd_info; + struct super_block *bd_sb; + __u16 bd_blkbits; + ext4_group_t bd_group; + struct rw_semaphore *alloc_semp; +}; +#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) +#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) + +#ifndef EXT4_MB_HISTORY +static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) +{ + return; +} +#endif + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); + + +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + + bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); +} + +static inline void ext4_unlock_group(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + + bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); +} + +static inline int ext4_is_group_locked(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + + return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, + &(grinfo->bb_state)); +} + +static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, + struct ext4_free_extent *fex) +{ + ext4_fsblk_t block; + + block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb) + + fex->fe_start + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + return block; +} +#endif diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c new file mode 100644 index 0000000..9aa0fbe --- /dev/null +++ b/fs/ext4/migrate.c @@ -0,0 +1,622 @@ +/* + * Copyright IBM Corporation, 2007 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include <linux/module.h> +#include "ext4_jbd2.h" +#include "ext4_extents.h" + +/* + * The contiguous blocks details which can be + * represented by a single extent + */ +struct list_blocks_struct { + ext4_lblk_t first_block, last_block; + ext4_fsblk_t first_pblock, last_pblock; +}; + +static int finish_range(handle_t *handle, struct inode *inode, + struct list_blocks_struct *lb) + +{ + int retval = 0, needed; + struct ext4_extent newext; + struct ext4_ext_path *path; + if (lb->first_pblock == 0) + return 0; + + /* Add the extent to temp inode*/ + newext.ee_block = cpu_to_le32(lb->first_block); + newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); + ext4_ext_store_pblock(&newext, lb->first_pblock); + path = ext4_ext_find_extent(inode, lb->first_block, NULL); + + if (IS_ERR(path)) { + retval = PTR_ERR(path); + path = NULL; + goto err_out; + } + + /* + * Calculate the credit needed to inserting this extent + * Since we are doing this in loop we may accumalate extra + * credit. But below we try to not accumalate too much + * of them by restarting the journal. + */ + needed = ext4_ext_calc_credits_for_single_extent(inode, + lb->last_block - lb->first_block + 1, path); + + /* + * Make sure the credit we accumalated is not really high + */ + if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) { + retval = ext4_journal_restart(handle, needed); + if (retval) + goto err_out; + } else if (needed) { + retval = ext4_journal_extend(handle, needed); + if (retval) { + /* + * IF not able to extend the journal restart the journal + */ + retval = ext4_journal_restart(handle, needed); + if (retval) + goto err_out; + } + } + retval = ext4_ext_insert_extent(handle, inode, path, &newext); +err_out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + lb->first_pblock = 0; + return retval; +} + +static int update_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, ext4_lblk_t blk_num, + struct list_blocks_struct *lb) +{ + int retval; + /* + * See if we can add on to the existing range (if it exists) + */ + if (lb->first_pblock && + (lb->last_pblock+1 == pblock) && + (lb->last_block+1 == blk_num)) { + lb->last_pblock = pblock; + lb->last_block = blk_num; + return 0; + } + /* + * Start a new range. + */ + retval = finish_range(handle, inode, lb); + lb->first_pblock = lb->last_pblock = pblock; + lb->first_block = lb->last_block = blk_num; + + return retval; +} + +static int update_ind_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, + struct list_blocks_struct *lb) +{ + struct buffer_head *bh; + __le32 *i_data; + int i, retval = 0; + ext4_lblk_t blk_count = *blk_nump; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + if (!pblock) { + /* Only update the file block number */ + *blk_nump += max_entries; + return 0; + } + + bh = sb_bread(inode->i_sb, pblock); + if (!bh) + return -EIO; + + i_data = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++, blk_count++) { + if (i_data[i]) { + retval = update_extent_range(handle, inode, + le32_to_cpu(i_data[i]), + blk_count, lb); + if (retval) + break; + } + } + + /* Update the file block number */ + *blk_nump = blk_count; + put_bh(bh); + return retval; + +} + +static int update_dind_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, + struct list_blocks_struct *lb) +{ + struct buffer_head *bh; + __le32 *i_data; + int i, retval = 0; + ext4_lblk_t blk_count = *blk_nump; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + if (!pblock) { + /* Only update the file block number */ + *blk_nump += max_entries * max_entries; + return 0; + } + bh = sb_bread(inode->i_sb, pblock); + if (!bh) + return -EIO; + + i_data = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (i_data[i]) { + retval = update_ind_extent_range(handle, inode, + le32_to_cpu(i_data[i]), + &blk_count, lb); + if (retval) + break; + } else { + /* Only update the file block number */ + blk_count += max_entries; + } + } + + /* Update the file block number */ + *blk_nump = blk_count; + put_bh(bh); + return retval; + +} + +static int update_tind_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, + struct list_blocks_struct *lb) +{ + struct buffer_head *bh; + __le32 *i_data; + int i, retval = 0; + ext4_lblk_t blk_count = *blk_nump; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + if (!pblock) { + /* Only update the file block number */ + *blk_nump += max_entries * max_entries * max_entries; + return 0; + } + bh = sb_bread(inode->i_sb, pblock); + if (!bh) + return -EIO; + + i_data = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (i_data[i]) { + retval = update_dind_extent_range(handle, inode, + le32_to_cpu(i_data[i]), + &blk_count, lb); + if (retval) + break; + } else + /* Only update the file block number */ + blk_count += max_entries * max_entries; + } + /* Update the file block number */ + *blk_nump = blk_count; + put_bh(bh); + return retval; + +} + +static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) +{ + int retval = 0, needed; + + if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) + return 0; + /* + * We are freeing a blocks. During this we touch + * superblock, group descriptor and block bitmap. + * So allocate a credit of 3. We may update + * quota (user and group). + */ + needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); + + if (ext4_journal_extend(handle, needed) != 0) + retval = ext4_journal_restart(handle, needed); + + return retval; +} + +static int free_dind_blocks(handle_t *handle, + struct inode *inode, __le32 i_data) +{ + int i; + __le32 *tmp_idata; + struct buffer_head *bh; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); + if (!bh) + return -EIO; + + tmp_idata = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (tmp_idata[i]) { + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, + le32_to_cpu(tmp_idata[i]), 1, 1); + } + } + put_bh(bh); + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); + return 0; +} + +static int free_tind_blocks(handle_t *handle, + struct inode *inode, __le32 i_data) +{ + int i, retval = 0; + __le32 *tmp_idata; + struct buffer_head *bh; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); + if (!bh) + return -EIO; + + tmp_idata = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (tmp_idata[i]) { + retval = free_dind_blocks(handle, + inode, tmp_idata[i]); + if (retval) { + put_bh(bh); + return retval; + } + } + } + put_bh(bh); + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); + return 0; +} + +static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) +{ + int retval; + + /* ei->i_data[EXT4_IND_BLOCK] */ + if (i_data[0]) { + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, + le32_to_cpu(i_data[0]), 1, 1); + } + + /* ei->i_data[EXT4_DIND_BLOCK] */ + if (i_data[1]) { + retval = free_dind_blocks(handle, inode, i_data[1]); + if (retval) + return retval; + } + + /* ei->i_data[EXT4_TIND_BLOCK] */ + if (i_data[2]) { + retval = free_tind_blocks(handle, inode, i_data[2]); + if (retval) + return retval; + } + return 0; +} + +static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, + struct inode *tmp_inode) +{ + int retval; + __le32 i_data[3]; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode); + + /* + * One credit accounted for writing the + * i_data field of the original inode + */ + retval = ext4_journal_extend(handle, 1); + if (retval) { + retval = ext4_journal_restart(handle, 1); + if (retval) + goto err_out; + } + + i_data[0] = ei->i_data[EXT4_IND_BLOCK]; + i_data[1] = ei->i_data[EXT4_DIND_BLOCK]; + i_data[2] = ei->i_data[EXT4_TIND_BLOCK]; + + down_write(&EXT4_I(inode)->i_data_sem); + /* + * if EXT4_EXT_MIGRATE is cleared a block allocation + * happened after we started the migrate. We need to + * fail the migrate + */ + if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) { + retval = -EAGAIN; + up_write(&EXT4_I(inode)->i_data_sem); + goto err_out; + } else + EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & + ~EXT4_EXT_MIGRATE; + /* + * We have the extent map build with the tmp inode. + * Now copy the i_data across + */ + ei->i_flags |= EXT4_EXTENTS_FL; + memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); + + /* + * Update i_blocks with the new blocks that got + * allocated while adding extents for extent index + * blocks. + * + * While converting to extents we need not + * update the orignal inode i_blocks for extent blocks + * via quota APIs. The quota update happened via tmp_inode already. + */ + spin_lock(&inode->i_lock); + inode->i_blocks += tmp_inode->i_blocks; + spin_unlock(&inode->i_lock); + up_write(&EXT4_I(inode)->i_data_sem); + + /* + * We mark the inode dirty after, because we decrement the + * i_blocks when freeing the indirect meta-data blocks + */ + retval = free_ind_block(handle, inode, i_data); + ext4_mark_inode_dirty(handle, inode); + +err_out: + return retval; +} + +static int free_ext_idx(handle_t *handle, struct inode *inode, + struct ext4_extent_idx *ix) +{ + int i, retval = 0; + ext4_fsblk_t block; + struct buffer_head *bh; + struct ext4_extent_header *eh; + + block = idx_pblock(ix); + bh = sb_bread(inode->i_sb, block); + if (!bh) + return -EIO; + + eh = (struct ext4_extent_header *)bh->b_data; + if (eh->eh_depth != 0) { + ix = EXT_FIRST_INDEX(eh); + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { + retval = free_ext_idx(handle, inode, ix); + if (retval) + break; + } + } + put_bh(bh); + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, block, 1, 1); + return retval; +} + +/* + * Free the extent meta data blocks only + */ +static int free_ext_block(handle_t *handle, struct inode *inode) +{ + int i, retval = 0; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_extent_header *eh = (struct ext4_extent_header *)ei->i_data; + struct ext4_extent_idx *ix; + if (eh->eh_depth == 0) + /* + * No extra blocks allocated for extent meta data + */ + return 0; + ix = EXT_FIRST_INDEX(eh); + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { + retval = free_ext_idx(handle, inode, ix); + if (retval) + return retval; + } + return retval; + +} + +int ext4_ext_migrate(struct inode *inode) +{ + handle_t *handle; + int retval = 0, i; + __le32 *i_data; + ext4_lblk_t blk_count = 0; + struct ext4_inode_info *ei; + struct inode *tmp_inode = NULL; + struct list_blocks_struct lb; + unsigned long max_entries; + + if (!test_opt(inode->i_sb, EXTENTS)) + /* + * if mounted with noextents we don't allow the migrate + */ + return -EINVAL; + + if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return -EINVAL; + + if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) + /* + * don't migrate fast symlink + */ + return retval; + + handle = ext4_journal_start(inode, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb) + + 1); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + return retval; + } + tmp_inode = ext4_new_inode(handle, + inode->i_sb->s_root->d_inode, + S_IFREG); + if (IS_ERR(tmp_inode)) { + retval = -ENOMEM; + ext4_journal_stop(handle); + return retval; + } + i_size_write(tmp_inode, i_size_read(inode)); + /* + * We don't want the inode to be reclaimed + * if we got interrupted in between. We have + * this tmp inode carrying reference to the + * data blocks of the original file. We set + * the i_nlink to zero at the last stage after + * switching the original file to extent format + */ + tmp_inode->i_nlink = 1; + + ext4_ext_tree_init(handle, tmp_inode); + ext4_orphan_add(handle, tmp_inode); + ext4_journal_stop(handle); + + /* + * start with one credit accounted for + * superblock modification. + * + * For the tmp_inode we already have commited the + * trascation that created the inode. Later as and + * when we add extents we extent the journal + */ + /* + * Even though we take i_mutex we can still cause block allocation + * via mmap write to holes. If we have allocated new blocks we fail + * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. + * The flag is updated with i_data_sem held to prevent racing with + * block allocation. + */ + down_read((&EXT4_I(inode)->i_data_sem)); + EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE; + up_read((&EXT4_I(inode)->i_data_sem)); + + handle = ext4_journal_start(inode, 1); + + ei = EXT4_I(inode); + i_data = ei->i_data; + memset(&lb, 0, sizeof(lb)); + + /* 32 bit block address 4 bytes */ + max_entries = inode->i_sb->s_blocksize >> 2; + for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) { + if (i_data[i]) { + retval = update_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[i]), + blk_count, &lb); + if (retval) + goto err_out; + } + } + if (i_data[EXT4_IND_BLOCK]) { + retval = update_ind_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[EXT4_IND_BLOCK]), + &blk_count, &lb); + if (retval) + goto err_out; + } else + blk_count += max_entries; + if (i_data[EXT4_DIND_BLOCK]) { + retval = update_dind_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[EXT4_DIND_BLOCK]), + &blk_count, &lb); + if (retval) + goto err_out; + } else + blk_count += max_entries * max_entries; + if (i_data[EXT4_TIND_BLOCK]) { + retval = update_tind_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[EXT4_TIND_BLOCK]), + &blk_count, &lb); + if (retval) + goto err_out; + } + /* + * Build the last extent + */ + retval = finish_range(handle, tmp_inode, &lb); +err_out: + if (retval) + /* + * Failure case delete the extent information with the + * tmp_inode + */ + free_ext_block(handle, tmp_inode); + else { + retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode); + if (retval) + /* + * if we fail to swap inode data free the extent + * details of the tmp inode + */ + free_ext_block(handle, tmp_inode); + } + + /* We mark the tmp_inode dirty via ext4_ext_tree_init. */ + if (ext4_journal_extend(handle, 1) != 0) + ext4_journal_restart(handle, 1); + + /* + * Mark the tmp_inode as of size zero + */ + i_size_write(tmp_inode, 0); + + /* + * set the i_blocks count to zero + * so that the ext4_delete_inode does the + * right job + * + * We don't need to take the i_lock because + * the inode is not visible to user space. + */ + tmp_inode->i_blocks = 0; + + /* Reset the extent details */ + ext4_ext_tree_init(handle, tmp_inode); + + /* + * Set the i_nlink to zero so that + * generic_drop_inode really deletes the + * inode + */ + tmp_inode->i_nlink = 0; + + ext4_journal_stop(handle); + + iput(tmp_inode); + + return retval; +} diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c new file mode 100644 index 0000000..da94b20 --- /dev/null +++ b/fs/ext4/namei.c @@ -0,0 +1,2482 @@ +/* + * linux/fs/ext4/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * Directory entry file type support and forward compatibility hooks + * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 + * Hash Tree Directory indexing (c) + * Daniel Phillips, 2001 + * Hash Tree Directory indexing porting + * Christopher Li, 2002 + * Hash Tree Directory indexing cleanup + * Theodore Ts'o, 2002 + */ + +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/jbd2.h> +#include <linux/time.h> +#include <linux/fcntl.h> +#include <linux/stat.h> +#include <linux/string.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> +#include <linux/bio.h> +#include "ext4.h" +#include "ext4_jbd2.h" + +#include "namei.h" +#include "xattr.h" +#include "acl.h" + +/* + * define how far ahead to read directories while searching them. + */ +#define NAMEI_RA_CHUNKS 2 +#define NAMEI_RA_BLOCKS 4 +#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) +#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + +static struct buffer_head *ext4_append(handle_t *handle, + struct inode *inode, + ext4_lblk_t *block, int *err) +{ + struct buffer_head *bh; + + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + + bh = ext4_bread(handle, inode, *block, 1, err); + if (bh) { + inode->i_size += inode->i_sb->s_blocksize; + EXT4_I(inode)->i_disksize = inode->i_size; + *err = ext4_journal_get_write_access(handle, bh); + if (*err) { + brelse(bh); + bh = NULL; + } + } + return bh; +} + +#ifndef assert +#define assert(test) J_ASSERT(test) +#endif + +#ifndef swap +#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) +#endif + +#ifdef DX_DEBUG +#define dxtrace(command) command +#else +#define dxtrace(command) +#endif + +struct fake_dirent +{ + __le32 inode; + __le16 rec_len; + u8 name_len; + u8 file_type; +}; + +struct dx_countlimit +{ + __le16 limit; + __le16 count; +}; + +struct dx_entry +{ + __le32 hash; + __le32 block; +}; + +/* + * dx_root_info is laid out so that if it should somehow get overlaid by a + * dirent the two low bits of the hash version will be zero. Therefore, the + * hash version mod 4 should never be 0. Sincerely, the paranoia department. + */ + +struct dx_root +{ + struct fake_dirent dot; + char dot_name[4]; + struct fake_dirent dotdot; + char dotdot_name[4]; + struct dx_root_info + { + __le32 reserved_zero; + u8 hash_version; + u8 info_length; /* 8 */ + u8 indirect_levels; + u8 unused_flags; + } + info; + struct dx_entry entries[0]; +}; + +struct dx_node +{ + struct fake_dirent fake; + struct dx_entry entries[0]; +}; + + +struct dx_frame +{ + struct buffer_head *bh; + struct dx_entry *entries; + struct dx_entry *at; +}; + +struct dx_map_entry +{ + u32 hash; + u16 offs; + u16 size; +}; + +static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); +static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); +static inline unsigned dx_get_hash(struct dx_entry *entry); +static void dx_set_hash(struct dx_entry *entry, unsigned value); +static unsigned dx_get_count(struct dx_entry *entries); +static unsigned dx_get_limit(struct dx_entry *entries); +static void dx_set_count(struct dx_entry *entries, unsigned value); +static void dx_set_limit(struct dx_entry *entries, unsigned value); +static unsigned dx_root_limit(struct inode *dir, unsigned infosize); +static unsigned dx_node_limit(struct inode *dir); +static struct dx_frame *dx_probe(const struct qstr *d_name, + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame, + int *err); +static void dx_release(struct dx_frame *frames); +static int dx_make_map(struct ext4_dir_entry_2 *de, int size, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); +static void dx_sort_map(struct dx_map_entry *map, unsigned count); +static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, + struct dx_map_entry *offsets, int count); +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size); +static void dx_insert_block(struct dx_frame *frame, + u32 hash, ext4_lblk_t block); +static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash); +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *err); +static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); + +/* + * p is at least 6 bytes before the end of page + */ +static inline struct ext4_dir_entry_2 * +ext4_next_entry(struct ext4_dir_entry_2 *p) +{ + return (struct ext4_dir_entry_2 *)((char *)p + + ext4_rec_len_from_disk(p->rec_len)); +} + +/* + * Future: use high four bits of block for coalesce-on-delete flags + * Mask them off for now. + */ + +static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) +{ + return le32_to_cpu(entry->block) & 0x00ffffff; +} + +static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) +{ + entry->block = cpu_to_le32(value); +} + +static inline unsigned dx_get_hash(struct dx_entry *entry) +{ + return le32_to_cpu(entry->hash); +} + +static inline void dx_set_hash(struct dx_entry *entry, unsigned value) +{ + entry->hash = cpu_to_le32(value); +} + +static inline unsigned dx_get_count(struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->count); +} + +static inline unsigned dx_get_limit(struct dx_entry *entries) +{ + return le16_to_cpu(((struct dx_countlimit *) entries)->limit); +} + +static inline void dx_set_count(struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); +} + +static inline void dx_set_limit(struct dx_entry *entries, unsigned value) +{ + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); +} + +static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) +{ + unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - + EXT4_DIR_REC_LEN(2) - infosize; + return entry_space / sizeof(struct dx_entry); +} + +static inline unsigned dx_node_limit(struct inode *dir) +{ + unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); + return entry_space / sizeof(struct dx_entry); +} + +/* + * Debug + */ +#ifdef DX_DEBUG +static void dx_show_index(char * label, struct dx_entry *entries) +{ + int i, n = dx_get_count (entries); + printk(KERN_DEBUG "%s index ", label); + for (i = 0; i < n; i++) { + printk("%x->%lu ", i ? dx_get_hash(entries + i) : + 0, (unsigned long)dx_get_block(entries + i)); + } + printk("\n"); +} + +struct stats +{ + unsigned names; + unsigned space; + unsigned bcount; +}; + +static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de, + int size, int show_names) +{ + unsigned names = 0, space = 0; + char *base = (char *) de; + struct dx_hash_info h = *hinfo; + + printk("names: "); + while ((char *) de < base + size) + { + if (de->inode) + { + if (show_names) + { + int len = de->name_len; + char *name = de->name; + while (len--) printk("%c", *name++); + ext4fs_dirhash(de->name, de->name_len, &h); + printk(":%x.%u ", h.hash, + ((char *) de - base)); + } + space += EXT4_DIR_REC_LEN(de->name_len); + names++; + } + de = ext4_next_entry(de); + } + printk("(%i)\n", names); + return (struct stats) { names, space, 1 }; +} + +struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, + struct dx_entry *entries, int levels) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count = dx_get_count(entries), names = 0, space = 0, i; + unsigned bcount = 0; + struct buffer_head *bh; + int err; + printk("%i indexed blocks...\n", count); + for (i = 0; i < count; i++, entries++) + { + ext4_lblk_t block = dx_get_block(entries); + ext4_lblk_t hash = i ? dx_get_hash(entries): 0; + u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; + struct stats stats; + printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); + if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue; + stats = levels? + dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): + dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); + names += stats.names; + space += stats.space; + bcount += stats.bcount; + brelse(bh); + } + if (bcount) + printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", + levels ? "" : " ", names, space/bcount, + (space/bcount)*100/blocksize); + return (struct stats) { names, space, bcount}; +} +#endif /* DX_DEBUG */ + +/* + * Probe for a directory leaf block to search. + * + * dx_probe can return ERR_BAD_DX_DIR, which means there was a format + * error in the directory index, and the caller should fall back to + * searching the directory normally. The callers of dx_probe **MUST** + * check for this error code, and make sure it never gets reflected + * back to userspace. + */ +static struct dx_frame * +dx_probe(const struct qstr *d_name, struct inode *dir, + struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) +{ + unsigned count, indirect; + struct dx_entry *at, *entries, *p, *q, *m; + struct dx_root *root; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; + u32 hash; + + frame->bh = NULL; + if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) + goto fail; + root = (struct dx_root *) bh->b_data; + if (root->info.hash_version != DX_HASH_TEA && + root->info.hash_version != DX_HASH_HALF_MD4 && + root->info.hash_version != DX_HASH_LEGACY) { + ext4_warning(dir->i_sb, __func__, + "Unrecognised inode hash code %d", + root->info.hash_version); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + hinfo->hash_version = root->info.hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; + if (d_name) + ext4fs_dirhash(d_name->name, d_name->len, hinfo); + hash = hinfo->hash; + + if (root->info.unused_flags & 1) { + ext4_warning(dir->i_sb, __func__, + "Unimplemented inode hash flags: %#06x", + root->info.unused_flags); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + if ((indirect = root->info.indirect_levels) > 1) { + ext4_warning(dir->i_sb, __func__, + "Unimplemented inode hash depth: %#06x", + root->info.indirect_levels); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + entries = (struct dx_entry *) (((char *)&root->info) + + root->info.info_length); + + if (dx_get_limit(entries) != dx_root_limit(dir, + root->info.info_length)) { + ext4_warning(dir->i_sb, __func__, + "dx entry: limit != root limit"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + + dxtrace(printk("Look up %x", hash)); + while (1) + { + count = dx_get_count(entries); + if (!count || count > dx_get_limit(entries)) { + ext4_warning(dir->i_sb, __func__, + "dx entry: no count or count > limit"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail2; + } + + p = entries + 1; + q = entries + count - 1; + while (p <= q) + { + m = p + (q - p)/2; + dxtrace(printk(".")); + if (dx_get_hash(m) > hash) + q = m - 1; + else + p = m + 1; + } + + if (0) // linear search cross check + { + unsigned n = count - 1; + at = entries; + while (n--) + { + dxtrace(printk(",")); + if (dx_get_hash(++at) > hash) + { + at--; + break; + } + } + assert (at == p - 1); + } + + at = p - 1; + dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); + frame->bh = bh; + frame->entries = entries; + frame->at = at; + if (!indirect--) return frame; + if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err))) + goto fail2; + at = entries = ((struct dx_node *) bh->b_data)->entries; + if (dx_get_limit(entries) != dx_node_limit (dir)) { + ext4_warning(dir->i_sb, __func__, + "dx entry: limit != node limit"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail2; + } + frame++; + frame->bh = NULL; + } +fail2: + while (frame >= frame_in) { + brelse(frame->bh); + frame--; + } +fail: + if (*err == ERR_BAD_DX_DIR) + ext4_warning(dir->i_sb, __func__, + "Corrupt dir inode %ld, running e2fsck is " + "recommended.", dir->i_ino); + return NULL; +} + +static void dx_release (struct dx_frame *frames) +{ + if (frames[0].bh == NULL) + return; + + if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) + brelse(frames[1].bh); + brelse(frames[0].bh); +} + +/* + * This function increments the frame pointer to search the next leaf + * block, and reads in the necessary intervening nodes if the search + * should be necessary. Whether or not the search is necessary is + * controlled by the hash parameter. If the hash value is even, then + * the search is only continued if the next block starts with that + * hash value. This is used if we are searching for a specific file. + * + * If the hash value is HASH_NB_ALWAYS, then always go to the next block. + * + * This function returns 1 if the caller should continue to search, + * or 0 if it should not. If there is an error reading one of the + * index blocks, it will a negative error code. + * + * If start_hash is non-null, it will be filled in with the starting + * hash of the next page. + */ +static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash) +{ + struct dx_frame *p; + struct buffer_head *bh; + int err, num_frames = 0; + __u32 bhash; + + p = frame; + /* + * Find the next leaf page by incrementing the frame pointer. + * If we run out of entries in the interior node, loop around and + * increment pointer in the parent node. When we break out of + * this loop, num_frames indicates the number of interior + * nodes need to be read. + */ + while (1) { + if (++(p->at) < p->entries + dx_get_count(p->entries)) + break; + if (p == frames) + return 0; + num_frames++; + p--; + } + + /* + * If the hash is 1, then continue only if the next page has a + * continuation hash of any value. This is used for readdir + * handling. Otherwise, check to see if the hash matches the + * desired contiuation hash. If it doesn't, return since + * there's no point to read in the successive index pages. + */ + bhash = dx_get_hash(p->at); + if (start_hash) + *start_hash = bhash; + if ((hash & 1) == 0) { + if ((bhash & ~1) != hash) + return 0; + } + /* + * If the hash is HASH_NB_ALWAYS, we always go to the next + * block so no check is necessary + */ + while (num_frames--) { + if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), + 0, &err))) + return err; /* Failure */ + p++; + brelse(p->bh); + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + } + return 1; +} + + +/* + * This function fills a red-black tree with information from a + * directory block. It returns the number directory entries loaded + * into the tree. If there is an error it is returned in err. + */ +static int htree_dirblock_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash) +{ + struct buffer_head *bh; + struct ext4_dir_entry_2 *de, *top; + int err, count = 0; + + dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", + (unsigned long)block)); + if (!(bh = ext4_bread (NULL, dir, block, 0, &err))) + return err; + + de = (struct ext4_dir_entry_2 *) bh->b_data; + top = (struct ext4_dir_entry_2 *) ((char *) de + + dir->i_sb->s_blocksize - + EXT4_DIR_REC_LEN(0)); + for (; de < top; de = ext4_next_entry(de)) { + if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, + (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) + +((char *)de - bh->b_data))) { + /* On error, skip the f_pos to the next block. */ + dir_file->f_pos = (dir_file->f_pos | + (dir->i_sb->s_blocksize - 1)) + 1; + brelse(bh); + return count; + } + ext4fs_dirhash(de->name, de->name_len, hinfo); + if ((hinfo->hash < start_hash) || + ((hinfo->hash == start_hash) && + (hinfo->minor_hash < start_minor_hash))) + continue; + if (de->inode == 0) + continue; + if ((err = ext4_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de)) != 0) { + brelse(bh); + return err; + } + count++; + } + brelse(bh); + return count; +} + + +/* + * This function fills a red-black tree with information from a + * directory. We start scanning the directory in hash order, starting + * at start_hash and start_minor_hash. + * + * This function returns the number of entries inserted into the tree, + * or a negative error code. + */ +int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash) +{ + struct dx_hash_info hinfo; + struct ext4_dir_entry_2 *de; + struct dx_frame frames[2], *frame; + struct inode *dir; + ext4_lblk_t block; + int count = 0; + int ret, err; + __u32 hashval; + + dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", + start_hash, start_minor_hash)); + dir = dir_file->f_path.dentry->d_inode; + if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { + hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += + EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, + start_hash, start_minor_hash); + *next_hash = ~0; + return count; + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; + frame = dx_probe(NULL, dir, &hinfo, frames, &err); + if (!frame) + return err; + + /* Add '.' and '..' from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; + if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0) + goto errout; + count++; + } + if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; + de = ext4_next_entry(de); + if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0) + goto errout; + count++; + } + + while (1) { + block = dx_get_block(frame->at); + ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, + start_hash, start_minor_hash); + if (ret < 0) { + err = ret; + goto errout; + } + count += ret; + hashval = ~0; + ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, + frame, frames, &hashval); + *next_hash = hashval; + if (ret < 0) { + err = ret; + goto errout; + } + /* + * Stop if: (a) there are no more entries, or + * (b) we have inserted at least one entry and the + * next hash value is not a continuation + */ + if ((ret == 0) || + (count && ((hashval & 1) == 0))) + break; + } + dx_release(frames); + dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, " + "next hash: %x\n", count, *next_hash)); + return count; +errout: + dx_release(frames); + return (err); +} + + +/* + * Directory block splitting, compacting + */ + +/* + * Create map of hash values, offsets, and sizes, stored at end of block. + * Returns number of entries mapped. + */ +static int dx_make_map (struct ext4_dir_entry_2 *de, int size, + struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) +{ + int count = 0; + char *base = (char *) de; + struct dx_hash_info h = *hinfo; + + while ((char *) de < base + size) + { + if (de->name_len && de->inode) { + ext4fs_dirhash(de->name, de->name_len, &h); + map_tail--; + map_tail->hash = h.hash; + map_tail->offs = (u16) ((char *) de - base); + map_tail->size = le16_to_cpu(de->rec_len); + count++; + cond_resched(); + } + /* XXX: do we need to check rec_len == 0 case? -Chris */ + de = ext4_next_entry(de); + } + return count; +} + +/* Sort map by hash value */ +static void dx_sort_map (struct dx_map_entry *map, unsigned count) +{ + struct dx_map_entry *p, *q, *top = map + count - 1; + int more; + /* Combsort until bubble sort doesn't suck */ + while (count > 2) { + count = count*10/13; + if (count - 9 < 2) /* 9, 10 -> 11 */ + count = 11; + for (p = top, q = p - count; q >= map; p--, q--) + if (p->hash < q->hash) + swap(*p, *q); + } + /* Garden variety bubble sort */ + do { + more = 0; + q = top; + while (q-- > map) { + if (q[1].hash >= q[0].hash) + continue; + swap(*(q+1), *q); + more = 1; + } + } while(more); +} + +static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) +{ + struct dx_entry *entries = frame->entries; + struct dx_entry *old = frame->at, *new = old + 1; + int count = dx_get_count(entries); + + assert(count < dx_get_limit(entries)); + assert(old < entries + count); + memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); + dx_set_hash(new, hash); + dx_set_block(new, block); + dx_set_count(entries, count + 1); +} + +static void ext4_update_dx_flag(struct inode *inode) +{ + if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX)) + EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL; +} + +/* + * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. + * + * `len <= EXT4_NAME_LEN' is guaranteed by caller. + * `de != NULL' is guaranteed by caller. + */ +static inline int ext4_match (int len, const char * const name, + struct ext4_dir_entry_2 * de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * Returns 0 if not found, -1 on failure, and 1 on success + */ +static inline int search_dirblock(struct buffer_head *bh, + struct inode *dir, + const struct qstr *d_name, + unsigned long offset, + struct ext4_dir_entry_2 ** res_dir) +{ + struct ext4_dir_entry_2 * de; + char * dlimit; + int de_len; + const char *name = d_name->name; + int namelen = d_name->len; + + de = (struct ext4_dir_entry_2 *) bh->b_data; + dlimit = bh->b_data + dir->i_sb->s_blocksize; + while ((char *) de < dlimit) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ + + if ((char *) de + namelen <= dlimit && + ext4_match (namelen, name, de)) { + /* found a match - just to be sure, do a full check */ + if (!ext4_check_dir_entry("ext4_find_entry", + dir, de, bh, offset)) + return -1; + *res_dir = de; + return 1; + } + /* prevent looping on a bad block */ + de_len = ext4_rec_len_from_disk(de->rec_len); + if (de_len <= 0) + return -1; + offset += de_len; + de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); + } + return 0; +} + + +/* + * ext4_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the cache buffer in which the entry was found, and the entry + * itself (as a parameter - res_dir). It does NOT read the inode of the + * entry - you'll have to do that yourself if you want to. + * + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ +static struct buffer_head * ext4_find_entry (struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 ** res_dir) +{ + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; + struct buffer_head *bh, *ret = NULL; + ext4_lblk_t start, block, b; + int ra_max = 0; /* Number of bh's in the readahead + buffer, bh_use[] */ + int ra_ptr = 0; /* Current index into readahead + buffer */ + int num = 0; + ext4_lblk_t nblocks; + int i, err; + int namelen; + + *res_dir = NULL; + sb = dir->i_sb; + namelen = d_name->len; + if (namelen > EXT4_NAME_LEN) + return NULL; + if (is_dx(dir)) { + bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); + /* + * On success, or if the error was file not found, + * return. Otherwise, fall back to doing a search the + * old fashioned way. + */ + if (bh || (err != ERR_BAD_DX_DIR)) + return bh; + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); + } + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + start = EXT4_I(dir)->i_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; +restart: + do { + /* + * We deal with the read-ahead logic here. + */ + if (ra_ptr >= ra_max) { + /* Refill the readahead buffer */ + ra_ptr = 0; + b = block; + for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { + /* + * Terminate if we reach the end of the + * directory and must wrap, or if our + * search has finished at this block. + */ + if (b >= nblocks || (num && block == start)) { + bh_use[ra_max] = NULL; + break; + } + num++; + bh = ext4_getblk(NULL, dir, b++, 0, &err); + bh_use[ra_max] = bh; + if (bh) + ll_rw_block(READ_META, 1, &bh); + } + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + /* read error, skip block & hope for the best */ + ext4_error(sb, __func__, "reading directory #%lu " + "offset %lu", dir->i_ino, + (unsigned long)block); + brelse(bh); + goto next; + } + i = search_dirblock(bh, dir, d_name, + block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); + if (i == 1) { + EXT4_I(dir)->i_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { + brelse(bh); + if (i < 0) + goto cleanup_and_exit; + } + next: + if (++block >= nblocks) + block = 0; + } while (block != start); + + /* + * If the directory has grown while we were searching, then + * search the last part of the directory before giving up. + */ + block = nblocks; + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + if (block < nblocks) { + start = 0; + goto restart; + } + +cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse(bh_use[ra_ptr]); + return ret; +} + +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, int *err) +{ + struct super_block * sb; + struct dx_hash_info hinfo; + u32 hash; + struct dx_frame frames[2], *frame; + struct ext4_dir_entry_2 *de, *top; + struct buffer_head *bh; + ext4_lblk_t block; + int retval; + int namelen = d_name->len; + const u8 *name = d_name->name; + + sb = dir->i_sb; + /* NFS may look up ".." - look at dx_root directory block */ + if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ + if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) + return NULL; + } else { + frame = frames; + frame->bh = NULL; /* for dx_release() */ + frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ + dx_set_block(frame->at, 0); /* dx_root block is 0 */ + } + hash = hinfo.hash; + do { + block = dx_get_block(frame->at); + if (!(bh = ext4_bread (NULL,dir, block, 0, err))) + goto errout; + de = (struct ext4_dir_entry_2 *) bh->b_data; + top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - + EXT4_DIR_REC_LEN(0)); + for (; de < top; de = ext4_next_entry(de)) { + int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) + + ((char *) de - bh->b_data); + + if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) { + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto errout; + } + + if (ext4_match(namelen, name, de)) { + *res_dir = de; + dx_release(frames); + return bh; + } + } + brelse(bh); + /* Check to see if we should continue to search */ + retval = ext4_htree_next_block(dir, hash, frame, + frames, NULL); + if (retval < 0) { + ext4_warning(sb, __func__, + "error reading index page in directory #%lu", + dir->i_ino); + *err = retval; + goto errout; + } + } while (retval == 1); + + *err = -ENOENT; +errout: + dxtrace(printk(KERN_DEBUG "%s not found\n", name)); + dx_release (frames); + return NULL; +} + +static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode; + struct ext4_dir_entry_2 *de; + struct buffer_head *bh; + + if (dentry->d_name.len > EXT4_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + bh = ext4_find_entry(dir, &dentry->d_name, &de); + inode = NULL; + if (bh) { + unsigned long ino = le32_to_cpu(de->inode); + brelse(bh); + if (!ext4_valid_inum(dir->i_sb, ino)) { + ext4_error(dir->i_sb, "ext4_lookup", + "bad inode number: %lu", ino); + return ERR_PTR(-EIO); + } + inode = ext4_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + return d_splice_alias(inode, dentry); +} + + +struct dentry *ext4_get_parent(struct dentry *child) +{ + unsigned long ino; + struct inode *inode; + static const struct qstr dotdot = { + .name = "..", + .len = 2, + }; + struct ext4_dir_entry_2 * de; + struct buffer_head *bh; + + bh = ext4_find_entry(child->d_inode, &dotdot, &de); + inode = NULL; + if (!bh) + return ERR_PTR(-ENOENT); + ino = le32_to_cpu(de->inode); + brelse(bh); + + if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { + ext4_error(child->d_inode->i_sb, "ext4_get_parent", + "bad inode number: %lu", ino); + return ERR_PTR(-EIO); + } + + return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); +} + +#define S_SHIFT 12 +static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, +}; + +static inline void ext4_set_de_type(struct super_block *sb, + struct ext4_dir_entry_2 *de, + umode_t mode) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +/* + * Move count entries from end of map between two memory locations. + * Returns pointer to last entry moved. + */ +static struct ext4_dir_entry_2 * +dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) +{ + unsigned rec_len = 0; + + while (count--) { + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + map->offs); + rec_len = EXT4_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); + ((struct ext4_dir_entry_2 *) to)->rec_len = + ext4_rec_len_to_disk(rec_len); + de->inode = 0; + map++; + to += rec_len; + } + return (struct ext4_dir_entry_2 *) (to - rec_len); +} + +/* + * Compact each dir entry in the range to the minimal rec_len. + * Returns pointer to last entry in range. + */ +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size) +{ + struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; + unsigned rec_len = 0; + + prev = to = de; + while ((char*)de < base + size) { + next = ext4_next_entry(de); + if (de->inode && de->name_len) { + rec_len = EXT4_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); + to->rec_len = ext4_rec_len_to_disk(rec_len); + prev = to; + to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); + } + de = next; + } + return prev; +} + +/* + * Split a full leaf block to make room for a new dir entry. + * Allocate a new block, and move entries so that they are approx. equally full. + * Returns pointer to de in block into which the new entry will be inserted. + */ +static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, + struct dx_hash_info *hinfo, int *error) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; + struct buffer_head *bh2; + ext4_lblk_t newblock; + u32 hash2; + struct dx_map_entry *map; + char *data1 = (*bh)->b_data, *data2; + unsigned split, move, size, i; + struct ext4_dir_entry_2 *de = NULL, *de2; + int err = 0; + + bh2 = ext4_append (handle, dir, &newblock, &err); + if (!(bh2)) { + brelse(*bh); + *bh = NULL; + goto errout; + } + + BUFFER_TRACE(*bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, *bh); + if (err) + goto journal_error; + + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + + data2 = bh2->b_data; + + /* create map in the end of data2 block */ + map = (struct dx_map_entry *) (data2 + blocksize); + count = dx_make_map((struct ext4_dir_entry_2 *) data1, + blocksize, hinfo, map); + map -= count; + dx_sort_map(map, count); + /* Split the existing block in the middle, size-wise */ + size = 0; + move = 0; + for (i = count-1; i >= 0; i--) { + /* is more than half of this entry in 2nd half of the block? */ + if (size + map[i].size/2 > blocksize/2) + break; + size += map[i].size; + move++; + } + /* map index at which we will split */ + split = count - move; + hash2 = map[split].hash; + continued = hash2 == map[split - 1].hash; + dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", + (unsigned long)dx_get_block(frame->at), + hash2, split, count-split)); + + /* Fancy dance to stay within two buffers */ + de2 = dx_move_dirents(data1, data2, map + split, count - split); + de = dx_pack_dirents(data1, blocksize); + de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); + de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2); + dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); + dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); + + /* Which block gets the new entry? */ + if (hinfo->hash >= hash2) + { + swap(*bh, bh2); + de = de2; + } + dx_insert_block(frame, hash2 + continued, newblock); + err = ext4_journal_dirty_metadata(handle, bh2); + if (err) + goto journal_error; + err = ext4_journal_dirty_metadata(handle, frame->bh); + if (err) + goto journal_error; + brelse(bh2); + dxtrace(dx_show_index("frame", frame->entries)); + return de; + +journal_error: + brelse(*bh); + brelse(bh2); + *bh = NULL; + ext4_std_error(dir->i_sb, err); +errout: + *error = err; + return NULL; +} + +/* + * Add a new entry into a directory (leaf) block. If de is non-NULL, + * it points to a directory entry which is guaranteed to be large + * enough for new directory entry. If de is NULL, then + * add_dirent_to_buf will attempt search the directory block for + * space. It will return -ENOSPC if no space is available, and -EIO + * and -EEXIST if directory entry already exists. + * + * NOTE! bh is NOT released in the case where ENOSPC is returned. In + * all other cases bh is released. + */ +static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct ext4_dir_entry_2 *de, + struct buffer_head *bh) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned long offset = 0; + unsigned short reclen; + int nlen, rlen, err; + char *top; + + reclen = EXT4_DIR_REC_LEN(namelen); + if (!de) { + de = (struct ext4_dir_entry_2 *)bh->b_data; + top = bh->b_data + dir->i_sb->s_blocksize - reclen; + while ((char *) de <= top) { + if (!ext4_check_dir_entry("ext4_add_entry", dir, de, + bh, offset)) { + brelse(bh); + return -EIO; + } + if (ext4_match(namelen, name, de)) { + brelse(bh); + return -EEXIST; + } + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len); + if ((de->inode? rlen - nlen: rlen) >= reclen) + break; + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -ENOSPC; + } + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) { + ext4_std_error(dir->i_sb, err); + brelse(bh); + return err; + } + + /* By now the buffer is marked for journaling */ + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len); + if (de->inode) { + struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = ext4_rec_len_to_disk(rlen - nlen); + de->rec_len = ext4_rec_len_to_disk(nlen); + de = de1; + } + de->file_type = EXT4_FT_UNKNOWN; + if (inode) { + de->inode = cpu_to_le32(inode->i_ino); + ext4_set_de_type(dir->i_sb, de, inode->i_mode); + } else + de->inode = 0; + de->name_len = namelen; + memcpy(de->name, name, namelen); + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend + * on this. + * + * XXX similarly, too many callers depend on + * ext4_new_inode() setting the times, but error + * recovery deletes the inode, so the worst that can + * happen is that the times are slightly out of date + * and/or different from the directory change time. + */ + dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + ext4_update_dx_flag(dir); + dir->i_version++; + ext4_mark_inode_dirty(handle, dir); + BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); + err = ext4_journal_dirty_metadata(handle, bh); + if (err) + ext4_std_error(dir->i_sb, err); + brelse(bh); + return 0; +} + +/* + * This converts a one block unindexed directory to a 3 block indexed + * directory, and adds the dentry to the indexed directory. + */ +static int make_indexed_dir(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct buffer_head *bh) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct buffer_head *bh2; + struct dx_root *root; + struct dx_frame frames[2], *frame; + struct dx_entry *entries; + struct ext4_dir_entry_2 *de, *de2; + char *data1, *top; + unsigned len; + int retval; + unsigned blocksize; + struct dx_hash_info hinfo; + ext4_lblk_t block; + struct fake_dirent *fde; + + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); + retval = ext4_journal_get_write_access(handle, bh); + if (retval) { + ext4_std_error(dir->i_sb, retval); + brelse(bh); + return retval; + } + root = (struct dx_root *) bh->b_data; + + /* The 0th block becomes the root, move the dirents out */ + fde = &root->dotdot; + de = (struct ext4_dir_entry_2 *)((char *)fde + + ext4_rec_len_from_disk(fde->rec_len)); + if ((char *) de >= (((char *) root) + blocksize)) { + ext4_error(dir->i_sb, __func__, + "invalid rec_len for '..' in inode %lu", + dir->i_ino); + brelse(bh); + return -EIO; + } + len = ((char *) root) + blocksize - (char *) de; + + /* Allocate new block for the 0th block's dirents */ + bh2 = ext4_append(handle, dir, &block, &retval); + if (!(bh2)) { + brelse(bh); + return retval; + } + EXT4_I(dir)->i_flags |= EXT4_INDEX_FL; + data1 = bh2->b_data; + + memcpy (data1, de, len); + de = (struct ext4_dir_entry_2 *) data1; + top = data1 + len; + while ((char *)(de2 = ext4_next_entry(de)) < top) + de = de2; + de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); + /* Initialize the root; the dot dirents already exist */ + de = (struct ext4_dir_entry_2 *) (&root->dotdot); + de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2)); + memset (&root->info, 0, sizeof(root->info)); + root->info.info_length = sizeof(root->info); + root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; + entries = root->entries; + dx_set_block(entries, 1); + dx_set_count(entries, 1); + dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); + + /* Initialize as for dx_probe */ + hinfo.hash_version = root->info.hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + ext4fs_dirhash(name, namelen, &hinfo); + frame = frames; + frame->entries = entries; + frame->at = entries; + frame->bh = bh; + bh = bh2; + de = do_split(handle,dir, &bh, frame, &hinfo, &retval); + dx_release (frames); + if (!(de)) + return retval; + + return add_dirent_to_buf(handle, dentry, inode, de, bh); +} + +/* + * ext4_add_entry() + * + * adds a file entry to the specified directory, using the same + * semantics as ext4_find_entry(). It returns NULL if it failed. + * + * NOTE!! The inode part of 'de' is left at 0 - which means you + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. + */ +static int ext4_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + unsigned long offset; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + struct super_block *sb; + int retval; + int dx_fallback=0; + unsigned blocksize; + ext4_lblk_t block, blocks; + + sb = dir->i_sb; + blocksize = sb->s_blocksize; + if (!dentry->d_name.len) + return -EINVAL; + if (is_dx(dir)) { + retval = ext4_dx_add_entry(handle, dentry, inode); + if (!retval || (retval != ERR_BAD_DX_DIR)) + return retval; + EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL; + dx_fallback++; + ext4_mark_inode_dirty(handle, dir); + } + blocks = dir->i_size >> sb->s_blocksize_bits; + for (block = 0, offset = 0; block < blocks; block++) { + bh = ext4_bread(handle, dir, block, 0, &retval); + if(!bh) + return retval; + retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (retval != -ENOSPC) + return retval; + + if (blocks == 1 && !dx_fallback && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) + return make_indexed_dir(handle, dentry, inode, bh); + brelse(bh); + } + bh = ext4_append(handle, dir, &block, &retval); + if (!bh) + return retval; + de = (struct ext4_dir_entry_2 *) bh->b_data; + de->inode = 0; + de->rec_len = ext4_rec_len_to_disk(blocksize); + return add_dirent_to_buf(handle, dentry, inode, de, bh); +} + +/* + * Returns 0 for success, or a negative error value + */ +static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct dx_frame frames[2], *frame; + struct dx_entry *entries, *at; + struct dx_hash_info hinfo; + struct buffer_head *bh; + struct inode *dir = dentry->d_parent->d_inode; + struct super_block *sb = dir->i_sb; + struct ext4_dir_entry_2 *de; + int err; + + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); + if (!frame) + return err; + entries = frame->entries; + at = frame->at; + + if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + goto cleanup; + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto journal_error; + + err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (err != -ENOSPC) { + bh = NULL; + goto cleanup; + } + + /* Block full, should compress but for now just split */ + dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); + /* Need to split index? */ + if (dx_get_count(entries) == dx_get_limit(entries)) { + ext4_lblk_t newblock; + unsigned icount = dx_get_count(entries); + int levels = frame - frames; + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; + + if (levels && (dx_get_count(frames->entries) == + dx_get_limit(frames->entries))) { + ext4_warning(sb, __func__, + "Directory index full!"); + err = -ENOSPC; + goto cleanup; + } + bh2 = ext4_append (handle, dir, &newblock, &err); + if (!(bh2)) + goto cleanup; + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; + node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize); + node2->fake.inode = 0; + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + if (levels) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); + dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", + icount1, icount2)); + + BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ + err = ext4_journal_get_write_access(handle, + frames[0].bh); + if (err) + goto journal_error; + + memcpy((char *) entries2, (char *) (entries + icount1), + icount2 * sizeof(struct dx_entry)); + dx_set_count(entries, icount1); + dx_set_count(entries2, icount2); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Which index block gets the new entry? */ + if (at - entries >= icount1) { + frame->at = at = at - entries - icount1 + entries2; + frame->entries = entries = entries2; + swap(frame->bh, bh2); + } + dx_insert_block(frames + 0, hash2, newblock); + dxtrace(dx_show_index("node", frames[1].entries)); + dxtrace(dx_show_index("node", + ((struct dx_node *) bh2->b_data)->entries)); + err = ext4_journal_dirty_metadata(handle, bh2); + if (err) + goto journal_error; + brelse (bh2); + } else { + dxtrace(printk(KERN_DEBUG + "Creating second level index...\n")); + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); + + /* Set up root */ + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); + ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; + + /* Add new access path frame */ + frame = frames + 1; + frame->at = at = at - entries + entries2; + frame->entries = entries = entries2; + frame->bh = bh2; + err = ext4_journal_get_write_access(handle, + frame->bh); + if (err) + goto journal_error; + } + ext4_journal_dirty_metadata(handle, frames[0].bh); + } + de = do_split(handle, dir, &bh, frame, &hinfo, &err); + if (!de) + goto cleanup; + err = add_dirent_to_buf(handle, dentry, inode, de, bh); + bh = NULL; + goto cleanup; + +journal_error: + ext4_std_error(dir->i_sb, err); +cleanup: + if (bh) + brelse(bh); + dx_release(frames); + return err; +} + +/* + * ext4_delete_entry deletes a directory entry by merging it with the + * previous entry + */ +static int ext4_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh) +{ + struct ext4_dir_entry_2 *de, *pde; + int i; + + i = 0; + pde = NULL; + de = (struct ext4_dir_entry_2 *) bh->b_data; + while (i < bh->b_size) { + if (!ext4_check_dir_entry("ext4_delete_entry", dir, de, bh, i)) + return -EIO; + if (de == de_del) { + BUFFER_TRACE(bh, "get_write_access"); + ext4_journal_get_write_access(handle, bh); + if (pde) + pde->rec_len = ext4_rec_len_to_disk( + ext4_rec_len_from_disk(pde->rec_len) + + ext4_rec_len_from_disk(de->rec_len)); + else + de->inode = 0; + dir->i_version++; + BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); + ext4_journal_dirty_metadata(handle, bh); + return 0; + } + i += ext4_rec_len_from_disk(de->rec_len); + pde = de; + de = ext4_next_entry(de); + } + return -ENOENT; +} + +/* + * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, + * since this indicates that nlinks count was previously 1. + */ +static void ext4_inc_count(handle_t *handle, struct inode *inode) +{ + inc_nlink(inode); + if (is_dx(inode) && inode->i_nlink > 1) { + /* limit is 16-bit i_links_count */ + if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) { + inode->i_nlink = 1; + EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_DIR_NLINK); + } + } +} + +/* + * If a directory had nlink == 1, then we should let it be 1. This indicates + * directory has >EXT4_LINK_MAX subdirs. + */ +static void ext4_dec_count(handle_t *handle, struct inode *inode) +{ + drop_nlink(inode); + if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0) + inc_nlink(inode); +} + + +static int ext4_add_nondir(handle_t *handle, + struct dentry *dentry, struct inode *inode) +{ + int err = ext4_add_entry(handle, dentry, inode); + if (!err) { + ext4_mark_inode_dirty(handle, inode); + d_instantiate(dentry, inode); + return 0; + } + drop_nlink(inode); + iput(inode); + return err; +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). + */ +static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + +retry: + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext4_new_inode (handle, dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext4_file_inode_operations; + inode->i_fop = &ext4_file_operations; + ext4_set_aops(inode); + err = ext4_add_nondir(handle, dentry, inode); + } + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext4_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + + if (!new_valid_dev(rdev)) + return -EINVAL; + +retry: + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext4_new_inode(handle, dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); +#ifdef CONFIG_EXT4_FS_XATTR + inode->i_op = &ext4_special_inode_operations; +#endif + err = ext4_add_nondir(handle, dentry, inode); + } + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + handle_t *handle; + struct inode *inode; + struct buffer_head *dir_block; + struct ext4_dir_entry_2 *de; + int err, retries = 0; + + if (EXT4_DIR_LINK_MAX(dir)) + return -EMLINK; + +retry: + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext4_new_inode(handle, dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + inode->i_op = &ext4_dir_inode_operations; + inode->i_fop = &ext4_dir_operations; + inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; + dir_block = ext4_bread(handle, inode, 0, 1, &err); + if (!dir_block) + goto out_clear_inode; + BUFFER_TRACE(dir_block, "get_write_access"); + ext4_journal_get_write_access(handle, dir_block); + de = (struct ext4_dir_entry_2 *) dir_block->b_data; + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; + de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len)); + strcpy(de->name, "."); + ext4_set_de_type(dir->i_sb, de, S_IFDIR); + de = ext4_next_entry(de); + de->inode = cpu_to_le32(dir->i_ino); + de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize - + EXT4_DIR_REC_LEN(1)); + de->name_len = 2; + strcpy(de->name, ".."); + ext4_set_de_type(dir->i_sb, de, S_IFDIR); + inode->i_nlink = 2; + BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata"); + ext4_journal_dirty_metadata(handle, dir_block); + brelse(dir_block); + ext4_mark_inode_dirty(handle, inode); + err = ext4_add_entry(handle, dentry, inode); + if (err) { +out_clear_inode: + clear_nlink(inode); + ext4_mark_inode_dirty(handle, inode); + iput(inode); + goto out_stop; + } + ext4_inc_count(handle, dir); + ext4_update_dx_flag(dir); + ext4_mark_inode_dirty(handle, dir); + d_instantiate(dentry, inode); +out_stop: + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +static int empty_dir(struct inode *inode) +{ + unsigned long offset; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de, *de1; + struct super_block *sb; + int err = 0; + + sb = inode->i_sb; + if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || + !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { + if (err) + ext4_error(inode->i_sb, __func__, + "error %d reading directory #%lu offset 0", + err, inode->i_ino); + else + ext4_warning(inode->i_sb, __func__, + "bad directory (dir #%lu) - no data block", + inode->i_ino); + return 1; + } + de = (struct ext4_dir_entry_2 *) bh->b_data; + de1 = ext4_next_entry(de); + if (le32_to_cpu(de->inode) != inode->i_ino || + !le32_to_cpu(de1->inode) || + strcmp(".", de->name) || + strcmp("..", de1->name)) { + ext4_warning(inode->i_sb, "empty_dir", + "bad directory (dir #%lu) - no `.' or `..'", + inode->i_ino); + brelse(bh); + return 1; + } + offset = ext4_rec_len_from_disk(de->rec_len) + + ext4_rec_len_from_disk(de1->rec_len); + de = ext4_next_entry(de1); + while (offset < inode->i_size) { + if (!bh || + (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + err = 0; + brelse(bh); + bh = ext4_bread(NULL, inode, + offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); + if (!bh) { + if (err) + ext4_error(sb, __func__, + "error %d reading directory" + " #%lu offset %lu", + err, inode->i_ino, offset); + offset += sb->s_blocksize; + continue; + } + de = (struct ext4_dir_entry_2 *) bh->b_data; + } + if (!ext4_check_dir_entry("empty_dir", inode, de, bh, offset)) { + de = (struct ext4_dir_entry_2 *)(bh->b_data + + sb->s_blocksize); + offset = (offset | (sb->s_blocksize - 1)) + 1; + continue; + } + if (le32_to_cpu(de->inode)) { + brelse(bh); + return 0; + } + offset += ext4_rec_len_from_disk(de->rec_len); + de = ext4_next_entry(de); + } + brelse(bh); + return 1; +} + +/* ext4_orphan_add() links an unlinked or truncated inode into a list of + * such inodes, starting at the superblock, in case we crash before the + * file is closed/deleted, or in case the inode truncate spans multiple + * transactions and the last transaction is not recovered after a crash. + * + * At filesystem recovery time, we walk this list deleting unlinked + * inodes and truncating linked inodes in ext4_orphan_cleanup(). + */ +int ext4_orphan_add(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct ext4_iloc iloc; + int err = 0, rc; + + lock_super(sb); + if (!list_empty(&EXT4_I(inode)->i_orphan)) + goto out_unlock; + + /* Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. */ + + /* @@@ FIXME: Observation from aviro: + * I think I can trigger J_ASSERT in ext4_orphan_add(). We block + * here (on lock_super()), so race with ext4_link() which might bump + * ->i_nlink. For, say it, character device. Not a regular file, + * not a directory, not a symlink and ->i_nlink > 0. + */ + J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (err) + goto out_unlock; + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_unlock; + + /* Insert this inode at the head of the on-disk orphan list... */ + NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); + EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; + + /* Only add to the head of the in-memory list if all the + * previous operations succeeded. If the orphan_add is going to + * fail (possibly taking the journal offline), we can't risk + * leaving the inode on the orphan list: stray orphan-list + * entries can cause panics at unmount time. + * + * This is safe: on error we're going to ignore the orphan list + * anyway on the next recovery. */ + if (!err) + list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); + + jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); + jbd_debug(4, "orphan inode %lu will point to %d\n", + inode->i_ino, NEXT_ORPHAN(inode)); +out_unlock: + unlock_super(sb); + ext4_std_error(inode->i_sb, err); + return err; +} + +/* + * ext4_orphan_del() removes an unlinked or truncated inode from the list + * of such inodes stored on disk, because it is finally being cleaned up. + */ +int ext4_orphan_del(handle_t *handle, struct inode *inode) +{ + struct list_head *prev; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi; + unsigned long ino_next; + struct ext4_iloc iloc; + int err = 0; + + lock_super(inode->i_sb); + if (list_empty(&ei->i_orphan)) { + unlock_super(inode->i_sb); + return 0; + } + + ino_next = NEXT_ORPHAN(inode); + prev = ei->i_orphan.prev; + sbi = EXT4_SB(inode->i_sb); + + jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + + list_del_init(&ei->i_orphan); + + /* If we're on an error path, we may not have a valid + * transaction handle with which to update the orphan list on + * disk, but we still need to remove the inode from the linked + * list in memory. */ + if (!handle) + goto out; + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_err; + + if (prev == &sbi->s_orphan) { + jbd_debug(4, "superblock will point to %lu\n", ino_next); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto out_brelse; + sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); + err = ext4_journal_dirty_metadata(handle, sbi->s_sbh); + } else { + struct ext4_iloc iloc2; + struct inode *i_prev = + &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; + + jbd_debug(4, "orphan inode %lu will point to %lu\n", + i_prev->i_ino, ino_next); + err = ext4_reserve_inode_write(handle, i_prev, &iloc2); + if (err) + goto out_brelse; + NEXT_ORPHAN(i_prev) = ino_next; + err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); + } + if (err) + goto out_brelse; + NEXT_ORPHAN(inode) = 0; + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + +out_err: + ext4_std_error(inode->i_sb, err); +out: + unlock_super(inode->i_sb); + return err; + +out_brelse: + brelse(iloc.bh); + goto out_err; +} + +static int ext4_rmdir(struct inode *dir, struct dentry *dentry) +{ + int retval; + struct inode *inode; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + handle_t *handle; + + /* Initialize quotas before so that eventual writes go in + * separate transaction */ + DQUOT_INIT(dentry->d_inode); + handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + retval = -ENOENT; + bh = ext4_find_entry(dir, &dentry->d_name, &de); + if (!bh) + goto end_rmdir; + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = dentry->d_inode; + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_rmdir; + + retval = -ENOTEMPTY; + if (!empty_dir(inode)) + goto end_rmdir; + + retval = ext4_delete_entry(handle, dir, de, bh); + if (retval) + goto end_rmdir; + if (!EXT4_DIR_LINK_EMPTY(inode)) + ext4_warning(inode->i_sb, "ext4_rmdir", + "empty directory has too many links (%d)", + inode->i_nlink); + inode->i_version++; + clear_nlink(inode); + /* There's no need to set i_disksize: the fact that i_nlink is + * zero will ensure that the right thing happens during any + * recovery. */ + inode->i_size = 0; + ext4_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_dec_count(handle, dir); + ext4_update_dx_flag(dir); + ext4_mark_inode_dirty(handle, dir); + +end_rmdir: + ext4_journal_stop(handle); + brelse(bh); + return retval; +} + +static int ext4_unlink(struct inode *dir, struct dentry *dentry) +{ + int retval; + struct inode *inode; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + handle_t *handle; + + /* Initialize quotas before so that eventual writes go + * in separate transaction */ + DQUOT_INIT(dentry->d_inode); + handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + retval = -ENOENT; + bh = ext4_find_entry(dir, &dentry->d_name, &de); + if (!bh) + goto end_unlink; + + inode = dentry->d_inode; + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_unlink; + + if (!inode->i_nlink) { + ext4_warning(inode->i_sb, "ext4_unlink", + "Deleting nonexistent file (%lu), %d", + inode->i_ino, inode->i_nlink); + inode->i_nlink = 1; + } + retval = ext4_delete_entry(handle, dir, de, bh); + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = ext4_current_time(dir); + ext4_update_dx_flag(dir); + ext4_mark_inode_dirty(handle, dir); + drop_nlink(inode); + if (!inode->i_nlink) + ext4_orphan_add(handle, inode); + inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + retval = 0; + +end_unlink: + ext4_journal_stop(handle); + brelse(bh); + return retval; +} + +static int ext4_symlink(struct inode *dir, + struct dentry *dentry, const char *symname) +{ + handle_t *handle; + struct inode *inode; + int l, err, retries = 0; + + l = strlen(symname)+1; + if (l > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + +retry: + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + + 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + if (l > sizeof(EXT4_I(inode)->i_data)) { + inode->i_op = &ext4_symlink_inode_operations; + ext4_set_aops(inode); + /* + * page_symlink() calls into ext4_prepare/commit_write. + * We have a transaction open. All is sweetness. It also sets + * i_size in generic_commit_write(). + */ + err = __page_symlink(inode, symname, l, 1); + if (err) { + clear_nlink(inode); + ext4_mark_inode_dirty(handle, inode); + iput(inode); + goto out_stop; + } + } else { + /* clear the extent format for fast symlink */ + EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL; + inode->i_op = &ext4_fast_symlink_inode_operations; + memcpy((char *)&EXT4_I(inode)->i_data, symname, l); + inode->i_size = l-1; + } + EXT4_I(inode)->i_disksize = inode->i_size; + err = ext4_add_nondir(handle, dentry, inode); +out_stop: + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +static int ext4_link(struct dentry *old_dentry, + struct inode *dir, struct dentry *dentry) +{ + handle_t *handle; + struct inode *inode = old_dentry->d_inode; + int err, retries = 0; + + if (EXT4_DIR_LINK_MAX(inode)) + return -EMLINK; + + /* + * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing + * otherwise has the potential to corrupt the orphan inode list. + */ + if (inode->i_nlink == 0) + return -ENOENT; + +retry: + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(dir)) + handle->h_sync = 1; + + inode->i_ctime = ext4_current_time(inode); + ext4_inc_count(handle, inode); + atomic_inc(&inode->i_count); + + err = ext4_add_nondir(handle, dentry, inode); + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +} + +#define PARENT_INO(buffer) \ + (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode) + +/* + * Anybody can rename anything with this: the permission checks are left to the + * higher-level routines. + */ +static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + handle_t *handle; + struct inode *old_inode, *new_inode; + struct buffer_head *old_bh, *new_bh, *dir_bh; + struct ext4_dir_entry_2 *old_de, *new_de; + int retval; + + old_bh = new_bh = dir_bh = NULL; + + /* Initialize quotas before so that eventual writes go + * in separate transaction */ + if (new_dentry->d_inode) + DQUOT_INIT(new_dentry->d_inode); + handle = ext4_journal_start(old_dir, 2 * + EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + handle->h_sync = 1; + + old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. Goodbye sticky bit ;-< + */ + old_inode = old_dentry->d_inode; + retval = -ENOENT; + if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) + goto end_rename; + + new_inode = new_dentry->d_inode; + new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de); + if (new_bh) { + if (!new_inode) { + brelse(new_bh); + new_bh = NULL; + } + } + if (S_ISDIR(old_inode->i_mode)) { + if (new_inode) { + retval = -ENOTEMPTY; + if (!empty_dir(new_inode)) + goto end_rename; + } + retval = -EIO; + dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); + if (!dir_bh) + goto end_rename; + if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) + goto end_rename; + retval = -EMLINK; + if (!new_inode && new_dir != old_dir && + new_dir->i_nlink >= EXT4_LINK_MAX) + goto end_rename; + } + if (!new_bh) { + retval = ext4_add_entry(handle, new_dentry, old_inode); + if (retval) + goto end_rename; + } else { + BUFFER_TRACE(new_bh, "get write access"); + ext4_journal_get_write_access(handle, new_bh); + new_de->inode = cpu_to_le32(old_inode->i_ino); + if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb, + EXT4_FEATURE_INCOMPAT_FILETYPE)) + new_de->file_type = old_de->file_type; + new_dir->i_version++; + new_dir->i_ctime = new_dir->i_mtime = + ext4_current_time(new_dir); + ext4_mark_inode_dirty(handle, new_dir); + BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata"); + ext4_journal_dirty_metadata(handle, new_bh); + brelse(new_bh); + new_bh = NULL; + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old_inode->i_ctime = ext4_current_time(old_inode); + ext4_mark_inode_dirty(handle, old_inode); + + /* + * ok, that's it + */ + if (le32_to_cpu(old_de->inode) != old_inode->i_ino || + old_de->name_len != old_dentry->d_name.len || + strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || + (retval = ext4_delete_entry(handle, old_dir, + old_de, old_bh)) == -ENOENT) { + /* old_de could have moved from under us during htree split, so + * make sure that we are deleting the right entry. We might + * also be pointing to a stale entry in the unused part of + * old_bh so just checking inum and the name isn't enough. */ + struct buffer_head *old_bh2; + struct ext4_dir_entry_2 *old_de2; + + old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2); + if (old_bh2) { + retval = ext4_delete_entry(handle, old_dir, + old_de2, old_bh2); + brelse(old_bh2); + } + } + if (retval) { + ext4_warning(old_dir->i_sb, "ext4_rename", + "Deleting old file (%lu), %d, error=%d", + old_dir->i_ino, old_dir->i_nlink, retval); + } + + if (new_inode) { + ext4_dec_count(handle, new_inode); + new_inode->i_ctime = ext4_current_time(new_inode); + } + old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); + ext4_update_dx_flag(old_dir); + if (dir_bh) { + BUFFER_TRACE(dir_bh, "get_write_access"); + ext4_journal_get_write_access(handle, dir_bh); + PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); + BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata"); + ext4_journal_dirty_metadata(handle, dir_bh); + ext4_dec_count(handle, old_dir); + if (new_inode) { + /* checked empty_dir above, can't have another parent, + * ext4_dec_count() won't work for many-linked dirs */ + new_inode->i_nlink = 0; + } else { + ext4_inc_count(handle, new_dir); + ext4_update_dx_flag(new_dir); + ext4_mark_inode_dirty(handle, new_dir); + } + } + ext4_mark_inode_dirty(handle, old_dir); + if (new_inode) { + ext4_mark_inode_dirty(handle, new_inode); + if (!new_inode->i_nlink) + ext4_orphan_add(handle, new_inode); + } + retval = 0; + +end_rename: + brelse(dir_bh); + brelse(old_bh); + brelse(new_bh); + ext4_journal_stop(handle); + return retval; +} + +/* + * directories can handle most operations... + */ +const struct inode_operations ext4_dir_inode_operations = { + .create = ext4_create, + .lookup = ext4_lookup, + .link = ext4_link, + .unlink = ext4_unlink, + .symlink = ext4_symlink, + .mkdir = ext4_mkdir, + .rmdir = ext4_rmdir, + .mknod = ext4_mknod, + .rename = ext4_rename, + .setattr = ext4_setattr, +#ifdef CONFIG_EXT4_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +#endif + .permission = ext4_permission, +}; + +const struct inode_operations ext4_special_inode_operations = { + .setattr = ext4_setattr, +#ifdef CONFIG_EXT4_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +#endif + .permission = ext4_permission, +}; diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h new file mode 100644 index 0000000..5e4dfff --- /dev/null +++ b/fs/ext4/namei.h @@ -0,0 +1,8 @@ +/* linux/fs/ext4/namei.h + * + * Copyright (C) 2005 Simtec Electronics + * Ben Dooks <ben@simtec.co.uk> + * +*/ + +extern struct dentry *ext4_get_parent(struct dentry *child); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c new file mode 100644 index 0000000..a027691 --- /dev/null +++ b/fs/ext4/resize.c @@ -0,0 +1,1097 @@ +/* + * linux/fs/ext4/resize.c + * + * Support for resizing an ext4 filesystem while it is mounted. + * + * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com> + * + * This could probably be made into a module, because it is not often in use. + */ + + +#define EXT4FS_DEBUG + +#include <linux/errno.h> +#include <linux/slab.h> + +#include "ext4_jbd2.h" +#include "group.h" + +#define outside(b, first, last) ((b) < (first) || (b) >= (last)) +#define inside(b, first, last) ((b) >= (first) && (b) < (last)) + +static int verify_group_input(struct super_block *sb, + struct ext4_new_group_data *input) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_fsblk_t start = ext4_blocks_count(es); + ext4_fsblk_t end = start + input->blocks_count; + ext4_group_t group = input->group; + ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; + unsigned overhead = ext4_bg_has_super(sb, group) ? + (1 + ext4_bg_num_gdb(sb, group) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + ext4_fsblk_t metaend = start + overhead; + struct buffer_head *bh = NULL; + ext4_grpblk_t free_blocks_count, offset; + int err = -EINVAL; + + input->free_blocks_count = free_blocks_count = + input->blocks_count - 2 - overhead - sbi->s_itb_per_group; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks " + "(%d free, %u reserved)\n", + ext4_bg_has_super(sb, input->group) ? "normal" : + "no-super", input->group, input->blocks_count, + free_blocks_count, input->reserved_blocks); + + ext4_get_group_no_and_offset(sb, start, NULL, &offset); + if (group != sbi->s_groups_count) + ext4_warning(sb, __func__, + "Cannot add at group %u (only %lu groups)", + input->group, sbi->s_groups_count); + else if (offset != 0) + ext4_warning(sb, __func__, "Last group not full"); + else if (input->reserved_blocks > input->blocks_count / 5) + ext4_warning(sb, __func__, "Reserved blocks too high (%u)", + input->reserved_blocks); + else if (free_blocks_count < 0) + ext4_warning(sb, __func__, "Bad blocks count %u", + input->blocks_count); + else if (!(bh = sb_bread(sb, end - 1))) + ext4_warning(sb, __func__, + "Cannot read last block (%llu)", + end - 1); + else if (outside(input->block_bitmap, start, end)) + ext4_warning(sb, __func__, + "Block bitmap not in group (block %llu)", + (unsigned long long)input->block_bitmap); + else if (outside(input->inode_bitmap, start, end)) + ext4_warning(sb, __func__, + "Inode bitmap not in group (block %llu)", + (unsigned long long)input->inode_bitmap); + else if (outside(input->inode_table, start, end) || + outside(itend - 1, start, end)) + ext4_warning(sb, __func__, + "Inode table not in group (blocks %llu-%llu)", + (unsigned long long)input->inode_table, itend - 1); + else if (input->inode_bitmap == input->block_bitmap) + ext4_warning(sb, __func__, + "Block bitmap same as inode bitmap (%llu)", + (unsigned long long)input->block_bitmap); + else if (inside(input->block_bitmap, input->inode_table, itend)) + ext4_warning(sb, __func__, + "Block bitmap (%llu) in inode table (%llu-%llu)", + (unsigned long long)input->block_bitmap, + (unsigned long long)input->inode_table, itend - 1); + else if (inside(input->inode_bitmap, input->inode_table, itend)) + ext4_warning(sb, __func__, + "Inode bitmap (%llu) in inode table (%llu-%llu)", + (unsigned long long)input->inode_bitmap, + (unsigned long long)input->inode_table, itend - 1); + else if (inside(input->block_bitmap, start, metaend)) + ext4_warning(sb, __func__, + "Block bitmap (%llu) in GDT table" + " (%llu-%llu)", + (unsigned long long)input->block_bitmap, + start, metaend - 1); + else if (inside(input->inode_bitmap, start, metaend)) + ext4_warning(sb, __func__, + "Inode bitmap (%llu) in GDT table" + " (%llu-%llu)", + (unsigned long long)input->inode_bitmap, + start, metaend - 1); + else if (inside(input->inode_table, start, metaend) || + inside(itend - 1, start, metaend)) + ext4_warning(sb, __func__, + "Inode table (%llu-%llu) overlaps" + "GDT table (%llu-%llu)", + (unsigned long long)input->inode_table, + itend - 1, start, metaend - 1); + else + err = 0; + brelse(bh); + + return err; +} + +static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, + ext4_fsblk_t blk) +{ + struct buffer_head *bh; + int err; + + bh = sb_getblk(sb, blk); + if (!bh) + return ERR_PTR(-EIO); + if ((err = ext4_journal_get_write_access(handle, bh))) { + brelse(bh); + bh = ERR_PTR(err); + } else { + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + } + + return bh; +} + +/* + * If we have fewer than thresh credits, extend by EXT4_MAX_TRANS_DATA. + * If that fails, restart the transaction & regain write access for the + * buffer head which is used for block_bitmap modifications. + */ +static int extend_or_restart_transaction(handle_t *handle, int thresh, + struct buffer_head *bh) +{ + int err; + + if (handle->h_buffer_credits >= thresh) + return 0; + + err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA); + if (err < 0) + return err; + if (err) { + if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) + return err; + if ((err = ext4_journal_get_write_access(handle, bh))) + return err; + } + + return 0; +} + +/* + * Set up the block and inode bitmaps, and the inode table for the new group. + * This doesn't need to be part of the main transaction, since we are only + * changing blocks outside the actual filesystem. We still do journaling to + * ensure the recovery is correct in case of a failure just after resize. + * If any part of this fails, we simply abort the resize. + */ +static int setup_new_group_blocks(struct super_block *sb, + struct ext4_new_group_data *input) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group); + int reserved_gdb = ext4_bg_has_super(sb, input->group) ? + le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; + unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group); + struct buffer_head *bh; + handle_t *handle; + ext4_fsblk_t block; + ext4_grpblk_t bit; + int i; + int err = 0, err2; + + /* This transaction may be extended/restarted along the way */ + handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); + + if (IS_ERR(handle)) + return PTR_ERR(handle); + + lock_super(sb); + if (input->group != sbi->s_groups_count) { + err = -EBUSY; + goto exit_journal; + } + + if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { + err = PTR_ERR(bh); + goto exit_journal; + } + + if (ext4_bg_has_super(sb, input->group)) { + ext4_debug("mark backup superblock %#04llx (+0)\n", start); + ext4_set_bit(0, bh->b_data); + } + + /* Copy all of the GDT blocks into the backup in this group */ + for (i = 0, bit = 1, block = start + 1; + i < gdblocks; i++, block++, bit++) { + struct buffer_head *gdb; + + ext4_debug("update backup group %#04llx (+%d)\n", block, bit); + + if ((err = extend_or_restart_transaction(handle, 1, bh))) + goto exit_bh; + + gdb = sb_getblk(sb, block); + if (!gdb) { + err = -EIO; + goto exit_bh; + } + if ((err = ext4_journal_get_write_access(handle, gdb))) { + brelse(gdb); + goto exit_bh; + } + lock_buffer(gdb); + memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); + set_buffer_uptodate(gdb); + unlock_buffer(gdb); + ext4_journal_dirty_metadata(handle, gdb); + ext4_set_bit(bit, bh->b_data); + brelse(gdb); + } + + /* Zero out all of the reserved backup group descriptor table blocks */ + for (i = 0, bit = gdblocks + 1, block = start + bit; + i < reserved_gdb; i++, block++, bit++) { + struct buffer_head *gdb; + + ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit); + + if ((err = extend_or_restart_transaction(handle, 1, bh))) + goto exit_bh; + + if (IS_ERR(gdb = bclean(handle, sb, block))) { + err = PTR_ERR(bh); + goto exit_bh; + } + ext4_journal_dirty_metadata(handle, gdb); + ext4_set_bit(bit, bh->b_data); + brelse(gdb); + } + ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, + input->block_bitmap - start); + ext4_set_bit(input->block_bitmap - start, bh->b_data); + ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap, + input->inode_bitmap - start); + ext4_set_bit(input->inode_bitmap - start, bh->b_data); + + /* Zero out all of the inode table blocks */ + for (i = 0, block = input->inode_table, bit = block - start; + i < sbi->s_itb_per_group; i++, bit++, block++) { + struct buffer_head *it; + + ext4_debug("clear inode block %#04llx (+%d)\n", block, bit); + + if ((err = extend_or_restart_transaction(handle, 1, bh))) + goto exit_bh; + + if (IS_ERR(it = bclean(handle, sb, block))) { + err = PTR_ERR(it); + goto exit_bh; + } + ext4_journal_dirty_metadata(handle, it); + brelse(it); + ext4_set_bit(bit, bh->b_data); + } + + if ((err = extend_or_restart_transaction(handle, 2, bh))) + goto exit_bh; + + mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); + ext4_journal_dirty_metadata(handle, bh); + brelse(bh); + /* Mark unused entries in inode bitmap used */ + ext4_debug("clear inode bitmap %#04llx (+%llu)\n", + input->inode_bitmap, input->inode_bitmap - start); + if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { + err = PTR_ERR(bh); + goto exit_journal; + } + + mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, + bh->b_data); + ext4_journal_dirty_metadata(handle, bh); +exit_bh: + brelse(bh); + +exit_journal: + unlock_super(sb); + if ((err2 = ext4_journal_stop(handle)) && !err) + err = err2; + + return err; +} + +/* + * Iterate through the groups which hold BACKUP superblock/GDT copies in an + * ext4 filesystem. The counters should be initialized to 1, 5, and 7 before + * calling this for the first time. In a sparse filesystem it will be the + * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... + * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... + */ +static unsigned ext4_list_backups(struct super_block *sb, unsigned *three, + unsigned *five, unsigned *seven) +{ + unsigned *min = three; + int mult = 3; + unsigned ret; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ret = *min; + *min += 1; + return ret; + } + + if (*five < *min) { + min = five; + mult = 5; + } + if (*seven < *min) { + min = seven; + mult = 7; + } + + ret = *min; + *min *= mult; + + return ret; +} + +/* + * Check that all of the backup GDT blocks are held in the primary GDT block. + * It is assumed that they are stored in group order. Returns the number of + * groups in current filesystem that have BACKUPS, or -ve error code. + */ +static int verify_reserved_gdb(struct super_block *sb, + struct buffer_head *primary) +{ + const ext4_fsblk_t blk = primary->b_blocknr; + const ext4_group_t end = EXT4_SB(sb)->s_groups_count; + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + unsigned grp; + __le32 *p = (__le32 *)primary->b_data; + int gdbackups = 0; + + while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) { + if (le32_to_cpu(*p++) != + grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){ + ext4_warning(sb, __func__, + "reserved GDT %llu" + " missing grp %d (%llu)", + blk, grp, + grp * + (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + blk); + return -EINVAL; + } + if (++gdbackups > EXT4_ADDR_PER_BLOCK(sb)) + return -EFBIG; + } + + return gdbackups; +} + +/* + * Called when we need to bring a reserved group descriptor table block into + * use from the resize inode. The primary copy of the new GDT block currently + * is an indirect block (under the double indirect block in the resize inode). + * The new backup GDT blocks will be stored as leaf blocks in this indirect + * block, in group order. Even though we know all the block numbers we need, + * we check to ensure that the resize inode has actually reserved these blocks. + * + * Don't need to update the block bitmaps because the blocks are still in use. + * + * We get all of the error cases out of the way, so that we are sure to not + * fail once we start modifying the data on disk, because JBD has no rollback. + */ +static int add_new_gdb(handle_t *handle, struct inode *inode, + struct ext4_new_group_data *input, + struct buffer_head **primary) +{ + struct super_block *sb = inode->i_sb; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); + ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; + struct buffer_head **o_group_desc, **n_group_desc; + struct buffer_head *dind; + int gdbackups; + struct ext4_iloc iloc; + __le32 *data; + int err; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG + "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", + gdb_num); + + /* + * If we are not using the primary superblock/GDT copy don't resize, + * because the user tools have no way of handling this. Probably a + * bad time to do it anyways. + */ + if (EXT4_SB(sb)->s_sbh->b_blocknr != + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { + ext4_warning(sb, __func__, + "won't resize using backup superblock at %llu", + (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); + return -EPERM; + } + + *primary = sb_bread(sb, gdblock); + if (!*primary) + return -EIO; + + if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { + err = gdbackups; + goto exit_bh; + } + + data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_bh; + } + + data = (__le32 *)dind->b_data; + if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { + ext4_warning(sb, __func__, + "new group %u GDT block %llu not reserved", + input->group, gdblock); + err = -EINVAL; + goto exit_dind; + } + + if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) + goto exit_dind; + + if ((err = ext4_journal_get_write_access(handle, *primary))) + goto exit_sbh; + + if ((err = ext4_journal_get_write_access(handle, dind))) + goto exit_primary; + + /* ext4_reserve_inode_write() gets a reference on the iloc */ + if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) + goto exit_dindj; + + n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), + GFP_NOFS); + if (!n_group_desc) { + err = -ENOMEM; + ext4_warning(sb, __func__, + "not enough memory for %lu groups", gdb_num + 1); + goto exit_inode; + } + + /* + * Finally, we have all of the possible failures behind us... + * + * Remove new GDT block from inode double-indirect block and clear out + * the new GDT block for use (which also "frees" the backup GDT blocks + * from the reserved inode). We don't need to change the bitmaps for + * these blocks, because they are marked as in-use from being in the + * reserved inode, and will become GDT blocks (primary and backup). + */ + data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; + ext4_journal_dirty_metadata(handle, dind); + brelse(dind); + inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; + ext4_mark_iloc_dirty(handle, inode, &iloc); + memset((*primary)->b_data, 0, sb->s_blocksize); + ext4_journal_dirty_metadata(handle, *primary); + + o_group_desc = EXT4_SB(sb)->s_group_desc; + memcpy(n_group_desc, o_group_desc, + EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); + n_group_desc[gdb_num] = *primary; + EXT4_SB(sb)->s_group_desc = n_group_desc; + EXT4_SB(sb)->s_gdb_count++; + kfree(o_group_desc); + + le16_add_cpu(&es->s_reserved_gdt_blocks, -1); + ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); + + return 0; + +exit_inode: + /* ext4_journal_release_buffer(handle, iloc.bh); */ + brelse(iloc.bh); +exit_dindj: + /* ext4_journal_release_buffer(handle, dind); */ +exit_primary: + /* ext4_journal_release_buffer(handle, *primary); */ +exit_sbh: + /* ext4_journal_release_buffer(handle, *primary); */ +exit_dind: + brelse(dind); +exit_bh: + brelse(*primary); + + ext4_debug("leaving with error %d\n", err); + return err; +} + +/* + * Called when we are adding a new group which has a backup copy of each of + * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. + * We need to add these reserved backup GDT blocks to the resize inode, so + * that they are kept for future resizing and not allocated to files. + * + * Each reserved backup GDT block will go into a different indirect block. + * The indirect blocks are actually the primary reserved GDT blocks, + * so we know in advance what their block numbers are. We only get the + * double-indirect block to verify it is pointing to the primary reserved + * GDT blocks so we don't overwrite a data block by accident. The reserved + * backup GDT blocks are stored in their reserved primary GDT block. + */ +static int reserve_backup_gdb(handle_t *handle, struct inode *inode, + struct ext4_new_group_data *input) +{ + struct super_block *sb = inode->i_sb; + int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); + struct buffer_head **primary; + struct buffer_head *dind; + struct ext4_iloc iloc; + ext4_fsblk_t blk; + __le32 *data, *end; + int gdbackups = 0; + int res, i; + int err; + + primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS); + if (!primary) + return -ENOMEM; + + data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_free; + } + + blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count; + data = (__le32 *)dind->b_data + (EXT4_SB(sb)->s_gdb_count % + EXT4_ADDR_PER_BLOCK(sb)); + end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb); + + /* Get each reserved primary GDT block and verify it holds backups */ + for (res = 0; res < reserved_gdb; res++, blk++) { + if (le32_to_cpu(*data) != blk) { + ext4_warning(sb, __func__, + "reserved block %llu" + " not at offset %ld", + blk, + (long)(data - (__le32 *)dind->b_data)); + err = -EINVAL; + goto exit_bh; + } + primary[res] = sb_bread(sb, blk); + if (!primary[res]) { + err = -EIO; + goto exit_bh; + } + if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) { + brelse(primary[res]); + err = gdbackups; + goto exit_bh; + } + if (++data >= end) + data = (__le32 *)dind->b_data; + } + + for (i = 0; i < reserved_gdb; i++) { + if ((err = ext4_journal_get_write_access(handle, primary[i]))) { + /* + int j; + for (j = 0; j < i; j++) + ext4_journal_release_buffer(handle, primary[j]); + */ + goto exit_bh; + } + } + + if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) + goto exit_bh; + + /* + * Finally we can add each of the reserved backup GDT blocks from + * the new group to its reserved primary GDT block. + */ + blk = input->group * EXT4_BLOCKS_PER_GROUP(sb); + for (i = 0; i < reserved_gdb; i++) { + int err2; + data = (__le32 *)primary[i]->b_data; + /* printk("reserving backup %lu[%u] = %lu\n", + primary[i]->b_blocknr, gdbackups, + blk + primary[i]->b_blocknr); */ + data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); + err2 = ext4_journal_dirty_metadata(handle, primary[i]); + if (!err) + err = err2; + } + inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9; + ext4_mark_iloc_dirty(handle, inode, &iloc); + +exit_bh: + while (--res >= 0) + brelse(primary[res]); + brelse(dind); + +exit_free: + kfree(primary); + + return err; +} + +/* + * Update the backup copies of the ext4 metadata. These don't need to be part + * of the main resize transaction, because e2fsck will re-write them if there + * is a problem (basically only OOM will cause a problem). However, we + * _should_ update the backups if possible, in case the primary gets trashed + * for some reason and we need to run e2fsck from a backup superblock. The + * important part is that the new block and inode counts are in the backup + * superblocks, and the location of the new group metadata in the GDT backups. + * + * We do not need lock_super() for this, because these blocks are not + * otherwise touched by the filesystem code when it is mounted. We don't + * need to worry about last changing from sbi->s_groups_count, because the + * worst that can happen is that we do not copy the full number of backups + * at this time. The resize which changed s_groups_count will backup again. + */ +static void update_backups(struct super_block *sb, + int blk_off, char *data, int size) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + const ext4_group_t last = sbi->s_groups_count; + const int bpg = EXT4_BLOCKS_PER_GROUP(sb); + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + ext4_group_t group; + int rest = sb->s_blocksize - size; + handle_t *handle; + int err = 0, err2; + + handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); + if (IS_ERR(handle)) { + group = 1; + err = PTR_ERR(handle); + goto exit_err; + } + + while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) { + struct buffer_head *bh; + + /* Out of journal space, and can't get more - abort - so sad */ + if (handle->h_buffer_credits == 0 && + ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) && + (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) + break; + + bh = sb_getblk(sb, group * bpg + blk_off); + if (!bh) { + err = -EIO; + break; + } + ext4_debug("update metadata backup %#04lx\n", + (unsigned long)bh->b_blocknr); + if ((err = ext4_journal_get_write_access(handle, bh))) + break; + lock_buffer(bh); + memcpy(bh->b_data, data, size); + if (rest) + memset(bh->b_data + size, 0, rest); + set_buffer_uptodate(bh); + unlock_buffer(bh); + ext4_journal_dirty_metadata(handle, bh); + brelse(bh); + } + if ((err2 = ext4_journal_stop(handle)) && !err) + err = err2; + + /* + * Ugh! Need to have e2fsck write the backup copies. It is too + * late to revert the resize, we shouldn't fail just because of + * the backup copies (they are only needed in case of corruption). + * + * However, if we got here we have a journal problem too, so we + * can't really start a transaction to mark the superblock. + * Chicken out and just set the flag on the hope it will be written + * to disk, and if not - we will simply wait until next fsck. + */ +exit_err: + if (err) { + ext4_warning(sb, __func__, + "can't update backup for group %lu (err %d), " + "forcing fsck on next reboot", group, err); + sbi->s_mount_state &= ~EXT4_VALID_FS; + sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); + mark_buffer_dirty(sbi->s_sbh); + } +} + +/* Add group descriptor data to an existing or new group descriptor block. + * Ensure we handle all possible error conditions _before_ we start modifying + * the filesystem, because we cannot abort the transaction and not have it + * write the data to disk. + * + * If we are on a GDT block boundary, we need to get the reserved GDT block. + * Otherwise, we may need to add backup GDT blocks for a sparse group. + * + * We only need to hold the superblock lock while we are actually adding + * in the new group's counts to the superblock. Prior to that we have + * not really "added" the group at all. We re-check that we are still + * adding in the last group in case things have changed since verifying. + */ +int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int reserved_gdb = ext4_bg_has_super(sb, input->group) ? + le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + struct buffer_head *primary = NULL; + struct ext4_group_desc *gdp; + struct inode *inode = NULL; + handle_t *handle; + int gdb_off, gdb_num; + int num_grp_locked = 0; + int err, err2; + + gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); + gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); + + if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ext4_warning(sb, __func__, + "Can't resize non-sparse filesystem further"); + return -EPERM; + } + + if (ext4_blocks_count(es) + input->blocks_count < + ext4_blocks_count(es)) { + ext4_warning(sb, __func__, "blocks_count overflow\n"); + return -EINVAL; + } + + if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < + le32_to_cpu(es->s_inodes_count)) { + ext4_warning(sb, __func__, "inodes_count overflow\n"); + return -EINVAL; + } + + if (reserved_gdb || gdb_off == 0) { + if (!EXT4_HAS_COMPAT_FEATURE(sb, + EXT4_FEATURE_COMPAT_RESIZE_INODE) + || !le16_to_cpu(es->s_reserved_gdt_blocks)) { + ext4_warning(sb, __func__, + "No reserved GDT blocks, can't resize"); + return -EPERM; + } + inode = ext4_iget(sb, EXT4_RESIZE_INO); + if (IS_ERR(inode)) { + ext4_warning(sb, __func__, + "Error opening resize inode"); + return PTR_ERR(inode); + } + } + + + if ((err = verify_group_input(sb, input))) + goto exit_put; + + if ((err = setup_new_group_blocks(sb, input))) + goto exit_put; + + /* + * We will always be modifying at least the superblock and a GDT + * block. If we are adding a group past the last current GDT block, + * we will also modify the inode and the dindirect block. If we + * are adding a group with superblock/GDT backups we will also + * modify each of the reserved GDT dindirect blocks. + */ + handle = ext4_journal_start_sb(sb, + ext4_bg_has_super(sb, input->group) ? + 3 + reserved_gdb : 4); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto exit_put; + } + + lock_super(sb); + if (input->group != sbi->s_groups_count) { + ext4_warning(sb, __func__, + "multiple resizers run on filesystem!"); + err = -EBUSY; + goto exit_journal; + } + + if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) + goto exit_journal; + + /* + * We will only either add reserved group blocks to a backup group + * or remove reserved blocks for the first group in a new group block. + * Doing both would be mean more complex code, and sane people don't + * use non-sparse filesystems anymore. This is already checked above. + */ + if (gdb_off) { + primary = sbi->s_group_desc[gdb_num]; + if ((err = ext4_journal_get_write_access(handle, primary))) + goto exit_journal; + + if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) && + (err = reserve_backup_gdb(handle, inode, input))) + goto exit_journal; + } else if ((err = add_new_gdb(handle, inode, input, &primary))) + goto exit_journal; + + /* + * OK, now we've set up the new group. Time to make it active. + * + * Current kernels don't lock all allocations via lock_super(), + * so we have to be safe wrt. concurrent accesses the group + * data. So we need to be careful to set all of the relevant + * group descriptor data etc. *before* we enable the group. + * + * The key field here is sbi->s_groups_count: as long as + * that retains its old value, nobody is going to access the new + * group. + * + * So first we update all the descriptor metadata for the new + * group; then we update the total disk blocks count; then we + * update the groups count to enable the group; then finally we + * update the free space counts so that the system can start + * using the new disk blocks. + */ + + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group); + /* Update group descriptor block for new group */ + gdp = (struct ext4_group_desc *)((char *)primary->b_data + + gdb_off * EXT4_DESC_SIZE(sb)); + + memset(gdp, 0, EXT4_DESC_SIZE(sb)); + ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ + ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ + ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ + gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); + gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb)); + gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED); + gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); + + /* + * We can allocate memory for mb_alloc based on the new group + * descriptor + */ + err = ext4_mb_add_groupinfo(sb, input->group, gdp); + if (err) { + ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked); + goto exit_journal; + } + + /* + * Make the new blocks and inodes valid next. We do this before + * increasing the group count so that once the group is enabled, + * all of its blocks and inodes are already valid. + * + * We always allocate group-by-group, then block-by-block or + * inode-by-inode within a group, so enabling these + * blocks/inodes before the group is live won't actually let us + * allocate the new space yet. + */ + ext4_blocks_count_set(es, ext4_blocks_count(es) + + input->blocks_count); + le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb)); + + /* + * We need to protect s_groups_count against other CPUs seeing + * inconsistent state in the superblock. + * + * The precise rules we use are: + * + * * Writers of s_groups_count *must* hold lock_super + * AND + * * Writers must perform a smp_wmb() after updating all dependent + * data and before modifying the groups count + * + * * Readers must hold lock_super() over the access + * OR + * * Readers must perform an smp_rmb() after reading the groups count + * and before reading any dependent data. + * + * NB. These rules can be relaxed when checking the group count + * while freeing data, as we can only allocate from a block + * group after serialising against the group count, and we can + * only then free after serialising in turn against that + * allocation. + */ + smp_wmb(); + + /* Update the global fs size fields */ + sbi->s_groups_count++; + ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked); + + ext4_journal_dirty_metadata(handle, primary); + + /* Update the reserved block counts only once the new group is + * active. */ + ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + + input->reserved_blocks); + + /* Update the free space counts */ + percpu_counter_add(&sbi->s_freeblocks_counter, + input->free_blocks_count); + percpu_counter_add(&sbi->s_freeinodes_counter, + EXT4_INODES_PER_GROUP(sb)); + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { + ext4_group_t flex_group; + flex_group = ext4_flex_group(sbi, input->group); + sbi->s_flex_groups[flex_group].free_blocks += + input->free_blocks_count; + sbi->s_flex_groups[flex_group].free_inodes += + EXT4_INODES_PER_GROUP(sb); + } + + ext4_journal_dirty_metadata(handle, sbi->s_sbh); + sb->s_dirt = 1; + +exit_journal: + unlock_super(sb); + if ((err2 = ext4_journal_stop(handle)) && !err) + err = err2; + if (!err) { + update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext4_super_block)); + update_backups(sb, primary->b_blocknr, primary->b_data, + primary->b_size); + } +exit_put: + iput(inode); + return err; +} /* ext4_group_add */ + +/* + * Extend the filesystem to the new number of blocks specified. This entry + * point is only used to extend the current filesystem to the end of the last + * existing group. It can be accessed via ioctl, or by "remount,resize=<size>" + * for emergencies (because it has no dependencies on reserved blocks). + * + * If we _really_ wanted, we could use default values to call ext4_group_add() + * allow the "remount" trick to work for arbitrary resizing, assuming enough + * GDT blocks are reserved to grow to the desired size. + */ +int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count) +{ + ext4_fsblk_t o_blocks_count; + ext4_group_t o_groups_count; + ext4_grpblk_t last; + ext4_grpblk_t add; + struct buffer_head *bh; + handle_t *handle; + int err; + ext4_group_t group; + + /* We don't need to worry about locking wrt other resizers just + * yet: we're going to revalidate es->s_blocks_count after + * taking lock_super() below. */ + o_blocks_count = ext4_blocks_count(es); + o_groups_count = EXT4_SB(sb)->s_groups_count; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n", + o_blocks_count, n_blocks_count); + + if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) + return 0; + + if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { + printk(KERN_ERR "EXT4-fs: filesystem on %s:" + " too large to resize to %llu blocks safely\n", + sb->s_id, n_blocks_count); + if (sizeof(sector_t) < 8) + ext4_warning(sb, __func__, + "CONFIG_LBD not enabled\n"); + return -EINVAL; + } + + if (n_blocks_count < o_blocks_count) { + ext4_warning(sb, __func__, + "can't shrink FS - resize aborted"); + return -EBUSY; + } + + /* Handle the remaining blocks in the last group only. */ + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + + if (last == 0) { + ext4_warning(sb, __func__, + "need to use ext2online to resize further"); + return -EPERM; + } + + add = EXT4_BLOCKS_PER_GROUP(sb) - last; + + if (o_blocks_count + add < o_blocks_count) { + ext4_warning(sb, __func__, "blocks_count overflow"); + return -EINVAL; + } + + if (o_blocks_count + add > n_blocks_count) + add = n_blocks_count - o_blocks_count; + + if (o_blocks_count + add < n_blocks_count) + ext4_warning(sb, __func__, + "will only finish group (%llu" + " blocks, %u new)", + o_blocks_count + add, add); + + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, o_blocks_count + add - 1); + if (!bh) { + ext4_warning(sb, __func__, + "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + + /* We will update the superblock, one block bitmap, and + * one group descriptor via ext4_free_blocks(). + */ + handle = ext4_journal_start_sb(sb, 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext4_warning(sb, __func__, "error %d on journal start", err); + goto exit_put; + } + + lock_super(sb); + if (o_blocks_count != ext4_blocks_count(es)) { + ext4_warning(sb, __func__, + "multiple resizers run on filesystem!"); + unlock_super(sb); + ext4_journal_stop(handle); + err = -EBUSY; + goto exit_put; + } + + if ((err = ext4_journal_get_write_access(handle, + EXT4_SB(sb)->s_sbh))) { + ext4_warning(sb, __func__, + "error %d on journal write access", err); + unlock_super(sb); + ext4_journal_stop(handle); + goto exit_put; + } + ext4_blocks_count_set(es, o_blocks_count + add); + ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); + sb->s_dirt = 1; + unlock_super(sb); + ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); + /* We add the blocks to the bitmap and set the group need init bit */ + ext4_add_groupblocks(handle, sb, o_blocks_count, add); + ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); + if ((err = ext4_journal_stop(handle))) + goto exit_put; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", + ext4_blocks_count(es)); + update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext4_super_block)); +exit_put: + return err; +} /* ext4_group_extend */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c new file mode 100644 index 0000000..e522b8c --- /dev/null +++ b/fs/ext4/super.c @@ -0,0 +1,3639 @@ +/* + * linux/fs/ext4/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/jbd2.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/parser.h> +#include <linux/smp_lock.h> +#include <linux/buffer_head.h> +#include <linux/exportfs.h> +#include <linux/vfs.h> +#include <linux/random.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/quotaops.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> +#include <linux/marker.h> +#include <linux/log2.h> +#include <linux/crc16.h> +#include <asm/uaccess.h> + +#include "ext4.h" +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" +#include "namei.h" +#include "group.h" + +struct proc_dir_entry *ext4_proc_root; + +static int ext4_load_journal(struct super_block *, struct ext4_super_block *, + unsigned long journal_devnum); +static int ext4_create_journal(struct super_block *, struct ext4_super_block *, + unsigned int); +static void ext4_commit_super(struct super_block *sb, + struct ext4_super_block *es, int sync); +static void ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es); +static void ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es); +static int ext4_sync_fs(struct super_block *sb, int wait); +static const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]); +static int ext4_remount(struct super_block *sb, int *flags, char *data); +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); +static void ext4_unlockfs(struct super_block *sb); +static void ext4_write_super(struct super_block *sb); +static void ext4_write_super_lockfs(struct super_block *sb); + + +ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le32_to_cpu(bg->bg_block_bitmap_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); +} + +ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le32_to_cpu(bg->bg_inode_bitmap_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); +} + +ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le32_to_cpu(bg->bg_inode_table_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); +} + +void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +{ + bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32); +} + +void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +{ + bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32); +} + +void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +{ + bg->bg_inode_table_lo = cpu_to_le32((u32)blk); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); +} + +/* + * Wrappers for jbd2_journal_start/end. + * + * The only special thing we need to do here is to make sure that all + * journal_end calls result in the superblock being marked dirty, so + * that sync() will call the filesystem's write_super callback if + * appropriate. + */ +handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) +{ + journal_t *journal; + + if (sb->s_flags & MS_RDONLY) + return ERR_PTR(-EROFS); + + /* Special case here: if the journal has aborted behind our + * backs (eg. EIO in the commit thread), then we still need to + * take the FS itself readonly cleanly. */ + journal = EXT4_SB(sb)->s_journal; + if (is_journal_aborted(journal)) { + ext4_abort(sb, __func__, + "Detected aborted journal"); + return ERR_PTR(-EROFS); + } + + return jbd2_journal_start(journal, nblocks); +} + +/* + * The only special thing we need to do here is to make sure that all + * jbd2_journal_stop calls result in the superblock being marked dirty, so + * that sync() will call the filesystem's write_super callback if + * appropriate. + */ +int __ext4_journal_stop(const char *where, handle_t *handle) +{ + struct super_block *sb; + int err; + int rc; + + sb = handle->h_transaction->t_journal->j_private; + err = handle->h_err; + rc = jbd2_journal_stop(handle); + + if (!err) + err = rc; + if (err) + __ext4_std_error(sb, where, err); + return err; +} + +void ext4_journal_abort_handle(const char *caller, const char *err_fn, + struct buffer_head *bh, handle_t *handle, int err) +{ + char nbuf[16]; + const char *errstr = ext4_decode_error(NULL, err, nbuf); + + if (bh) + BUFFER_TRACE(bh, "abort"); + + if (!handle->h_err) + handle->h_err = err; + + if (is_handle_aborted(handle)) + return; + + printk(KERN_ERR "%s: aborting transaction: %s in %s\n", + caller, errstr, err_fn); + + jbd2_journal_abort_handle(handle); +} + +/* Deal with the reporting of failure conditions on a filesystem such as + * inconsistencies detected or read IO failures. + * + * On ext2, we can store the error state of the filesystem in the + * superblock. That is not possible on ext4, because we may have other + * write ordering constraints on the superblock which prevent us from + * writing it out straight away; and given that the journal is about to + * be aborted, we can't rely on the current, or future, transactions to + * write out the superblock safely. + * + * We'll just use the jbd2_journal_abort() error code to record an error in + * the journal instead. On recovery, the journal will compain about + * that error until we've noted it down and cleared it. + */ + +static void ext4_handle_error(struct super_block *sb) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + + if (sb->s_flags & MS_RDONLY) + return; + + if (!test_opt(sb, ERRORS_CONT)) { + journal_t *journal = EXT4_SB(sb)->s_journal; + + EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT; + if (journal) + jbd2_journal_abort(journal, -EIO); + } + if (test_opt(sb, ERRORS_RO)) { + printk(KERN_CRIT "Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + } + ext4_commit_super(sb, es, 1); + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT4-fs (device %s): panic forced after error\n", + sb->s_id); +} + +void ext4_error(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + ext4_handle_error(sb); +} + +static const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]) +{ + char *errstr = NULL; + + switch (errno) { + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT) + errstr = "Journal has aborted"; + else + errstr = "Readonly filesystem"; + break; + default: + /* If the caller passed in an extra buffer for unknown + * errors, textualise them now. Else we just return + * NULL. */ + if (nbuf) { + /* Check for truncated error codes... */ + if (snprintf(nbuf, 16, "error %d", -errno) >= 0) + errstr = nbuf; + } + break; + } + + return errstr; +} + +/* __ext4_std_error decodes expected errors from journaling functions + * automatically and invokes the appropriate error response. */ + +void __ext4_std_error(struct super_block *sb, const char *function, int errno) +{ + char nbuf[16]; + const char *errstr; + + /* Special case: if the error is EROFS, and we're not already + * inside a transaction, then there's really no point in logging + * an error. */ + if (errno == -EROFS && journal_current_handle() == NULL && + (sb->s_flags & MS_RDONLY)) + return; + + errstr = ext4_decode_error(sb, errno, nbuf); + printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n", + sb->s_id, function, errstr); + + ext4_handle_error(sb); +} + +/* + * ext4_abort is a much stronger failure handler than ext4_error. The + * abort function may be used to deal with unrecoverable failures such + * as journal IO errors or ENOMEM at a critical moment in log management. + * + * We unconditionally force the filesystem into an ABORT|READONLY state, + * unless the error response on the fs has been set to panic in which + * case we take the easy way out and panic immediately. + */ + +void ext4_abort(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + va_list args; + + printk(KERN_CRIT "ext4_abort called.\n"); + + va_start(args, fmt); + printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT4-fs panic from previous error\n"); + + if (sb->s_flags & MS_RDONLY) + return; + + printk(KERN_CRIT "Remounting filesystem read-only\n"); + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + sb->s_flags |= MS_RDONLY; + EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT; + if (EXT4_SB(sb)->s_journal) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); +} + +void ext4_warning(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + printk(KERN_WARNING "EXT4-fs warning (device %s): %s: ", + sb->s_id, function); + vprintk(fmt, args); + printk("\n"); + va_end(args); +} + +void ext4_update_dynamic_rev(struct super_block *sb) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) + return; + + ext4_warning(sb, __func__, + "updating to rev %d because of new feature flag, " + "running e2fsck is recommended", + EXT4_DYNAMIC_REV); + + es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO); + es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE); + es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV); + /* leave es->s_feature_*compat flags alone */ + /* es->s_uuid will be set by e2fsck if empty */ + + /* + * The rest of the superblock fields should be zero, and if not it + * means they are likely already in use, so leave them alone. We + * can leave it up to e2fsck to clean up any inconsistencies there. + */ +} + +/* + * Open the external journal device + */ +static struct block_device *ext4_blkdev_get(dev_t dev) +{ + struct block_device *bdev; + char b[BDEVNAME_SIZE]; + + bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + if (IS_ERR(bdev)) + goto fail; + return bdev; + +fail: + printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n", + __bdevname(dev, b), PTR_ERR(bdev)); + return NULL; +} + +/* + * Release the journal device + */ +static int ext4_blkdev_put(struct block_device *bdev) +{ + bd_release(bdev); + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE); +} + +static int ext4_blkdev_remove(struct ext4_sb_info *sbi) +{ + struct block_device *bdev; + int ret = -ENODEV; + + bdev = sbi->journal_bdev; + if (bdev) { + ret = ext4_blkdev_put(bdev); + sbi->journal_bdev = NULL; + } + return ret; +} + +static inline struct inode *orphan_list_entry(struct list_head *l) +{ + return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; +} + +static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) +{ + struct list_head *l; + + printk(KERN_ERR "sb orphan head is %d\n", + le32_to_cpu(sbi->s_es->s_last_orphan)); + + printk(KERN_ERR "sb_info orphan list:\n"); + list_for_each(l, &sbi->s_orphan) { + struct inode *inode = orphan_list_entry(l); + printk(KERN_ERR " " + "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", + inode->i_sb->s_id, inode->i_ino, inode, + inode->i_mode, inode->i_nlink, + NEXT_ORPHAN(inode)); + } +} + +static void ext4_put_super(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int i, err; + + ext4_mb_release(sb); + ext4_ext_release(sb); + ext4_xattr_put_super(sb); + err = jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + if (err < 0) + ext4_abort(sb, __func__, "Couldn't clean up the journal"); + + if (!(sb->s_flags & MS_RDONLY)) { + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + es->s_state = cpu_to_le16(sbi->s_mount_state); + ext4_commit_super(sb, es, 1); + } + if (sbi->s_proc) { + remove_proc_entry("inode_readahead_blks", sbi->s_proc); + remove_proc_entry(sb->s_id, ext4_proc_root); + } + + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); + kfree(sbi->s_flex_groups); + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + brelse(sbi->s_sbh); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif + + /* Debugging code just in case the in-memory inode orphan list + * isn't empty. The on-disk one can be non-empty if we've + * detected an error and taken the fs readonly, but the + * in-memory list had better be clean by this point. */ + if (!list_empty(&sbi->s_orphan)) + dump_orphan_list(sb, sbi); + J_ASSERT(list_empty(&sbi->s_orphan)); + + invalidate_bdev(sb->s_bdev); + if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { + /* + * Invalidate the journal device's buffers. We don't want them + * floating about in memory - the physical journal device may + * hotswapped, and it breaks the `ro-after' testing code. + */ + sync_blockdev(sbi->journal_bdev); + invalidate_bdev(sbi->journal_bdev); + ext4_blkdev_remove(sbi); + } + sb->s_fs_info = NULL; + kfree(sbi); + return; +} + +static struct kmem_cache *ext4_inode_cachep; + +/* + * Called inside transaction, so use GFP_NOFS + */ +static struct inode *ext4_alloc_inode(struct super_block *sb) +{ + struct ext4_inode_info *ei; + + ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; +#ifdef CONFIG_EXT4_FS_POSIX_ACL + ei->i_acl = EXT4_ACL_NOT_CACHED; + ei->i_default_acl = EXT4_ACL_NOT_CACHED; +#endif + ei->vfs_inode.i_version = 1; + ei->vfs_inode.i_data.writeback_index = 0; + memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); + INIT_LIST_HEAD(&ei->i_prealloc_list); + spin_lock_init(&ei->i_prealloc_lock); + jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); + ei->i_reserved_data_blocks = 0; + ei->i_reserved_meta_blocks = 0; + ei->i_allocated_meta_blocks = 0; + ei->i_delalloc_reserved_flag = 0; + spin_lock_init(&(ei->i_block_reservation_lock)); + return &ei->vfs_inode; +} + +static void ext4_destroy_inode(struct inode *inode) +{ + if (!list_empty(&(EXT4_I(inode)->i_orphan))) { + printk("EXT4 Inode %p: orphan list check failed!\n", + EXT4_I(inode)); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, + EXT4_I(inode), sizeof(struct ext4_inode_info), + true); + dump_stack(); + } + kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); +} + +static void init_once(void *foo) +{ + struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; + + INIT_LIST_HEAD(&ei->i_orphan); +#ifdef CONFIG_EXT4_FS_XATTR + init_rwsem(&ei->xattr_sem); +#endif + init_rwsem(&ei->i_data_sem); + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", + sizeof(struct ext4_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (ext4_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(ext4_inode_cachep); +} + +static void ext4_clear_inode(struct inode *inode) +{ +#ifdef CONFIG_EXT4_FS_POSIX_ACL + if (EXT4_I(inode)->i_acl && + EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) { + posix_acl_release(EXT4_I(inode)->i_acl); + EXT4_I(inode)->i_acl = EXT4_ACL_NOT_CACHED; + } + if (EXT4_I(inode)->i_default_acl && + EXT4_I(inode)->i_default_acl != EXT4_ACL_NOT_CACHED) { + posix_acl_release(EXT4_I(inode)->i_default_acl); + EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED; + } +#endif + ext4_discard_preallocations(inode); + jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, + &EXT4_I(inode)->jinode); +} + +static inline void ext4_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#if defined(CONFIG_QUOTA) + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sbi->s_jquota_fmt) + seq_printf(seq, ",jqfmt=%s", + (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold" : "vfsv0"); + + if (sbi->s_qf_names[USRQUOTA]) + seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); + + if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) + seq_puts(seq, ",usrquota"); + + if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) + seq_puts(seq, ",grpquota"); +#endif +} + +/* + * Show an option if + * - it's set to a non-default value OR + * - if the per-sb default is different from the global default + */ +static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + int def_errors; + unsigned long def_mount_opts; + struct super_block *sb = vfs->mnt_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + def_errors = le16_to_cpu(es->s_errors); + + if (sbi->s_sb_block != 1) + seq_printf(seq, ",sb=%llu", sbi->s_sb_block); + if (test_opt(sb, MINIX_DF)) + seq_puts(seq, ",minixdf"); + if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS)) + seq_puts(seq, ",grpid"); + if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS)) + seq_puts(seq, ",nogrpid"); + if (sbi->s_resuid != EXT4_DEF_RESUID || + le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) { + seq_printf(seq, ",resuid=%u", sbi->s_resuid); + } + if (sbi->s_resgid != EXT4_DEF_RESGID || + le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) { + seq_printf(seq, ",resgid=%u", sbi->s_resgid); + } + if (test_opt(sb, ERRORS_RO)) { + if (def_errors == EXT4_ERRORS_PANIC || + def_errors == EXT4_ERRORS_CONTINUE) { + seq_puts(seq, ",errors=remount-ro"); + } + } + if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) + seq_puts(seq, ",errors=continue"); + if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) + seq_puts(seq, ",errors=panic"); + if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16)) + seq_puts(seq, ",nouid32"); + if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG)) + seq_puts(seq, ",debug"); + if (test_opt(sb, OLDALLOC)) + seq_puts(seq, ",oldalloc"); +#ifdef CONFIG_EXT4_FS_XATTR + if (test_opt(sb, XATTR_USER) && + !(def_mount_opts & EXT4_DEFM_XATTR_USER)) + seq_puts(seq, ",user_xattr"); + if (!test_opt(sb, XATTR_USER) && + (def_mount_opts & EXT4_DEFM_XATTR_USER)) { + seq_puts(seq, ",nouser_xattr"); + } +#endif +#ifdef CONFIG_EXT4_FS_POSIX_ACL + if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) + seq_puts(seq, ",acl"); + if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) + seq_puts(seq, ",noacl"); +#endif + if (!test_opt(sb, RESERVATION)) + seq_puts(seq, ",noreservation"); + if (sbi->s_commit_interval) { + seq_printf(seq, ",commit=%u", + (unsigned) (sbi->s_commit_interval / HZ)); + } + /* + * We're changing the default of barrier mount option, so + * let's always display its mount state so it's clear what its + * status is. + */ + seq_puts(seq, ",barrier="); + seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) + seq_puts(seq, ",journal_async_commit"); + if (test_opt(sb, NOBH)) + seq_puts(seq, ",nobh"); + if (!test_opt(sb, EXTENTS)) + seq_puts(seq, ",noextents"); + if (test_opt(sb, I_VERSION)) + seq_puts(seq, ",i_version"); + if (!test_opt(sb, DELALLOC)) + seq_puts(seq, ",nodelalloc"); + + + if (sbi->s_stripe) + seq_printf(seq, ",stripe=%lu", sbi->s_stripe); + /* + * journal mode get enabled in different ways + * So just print the value even if we didn't specify it + */ + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + seq_puts(seq, ",data=journal"); + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + seq_puts(seq, ",data=ordered"); + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + seq_puts(seq, ",data=writeback"); + + if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) + seq_printf(seq, ",inode_readahead_blks=%u", + sbi->s_inode_readahead_blks); + + if (test_opt(sb, DATA_ERR_ABORT)) + seq_puts(seq, ",data_err=abort"); + + ext4_show_quota_options(seq, sb); + return 0; +} + + +static struct inode *ext4_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) + return ERR_PTR(-ESTALE); + if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) + return ERR_PTR(-ESTALE); + + /* iget isn't really right if the inode is currently unallocated!! + * + * ext4_read_inode will return a bad_inode if the inode had been + * deleted, so we should be safe. + * + * Currently we don't know the generation for parent directory, so + * a generation of 0 means "accept any" + */ + inode = ext4_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + ext4_nfs_get_inode); +} + +static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + ext4_nfs_get_inode); +} + +#ifdef CONFIG_QUOTA +#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") +#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) + +static int ext4_dquot_initialize(struct inode *inode, int type); +static int ext4_dquot_drop(struct inode *inode); +static int ext4_write_dquot(struct dquot *dquot); +static int ext4_acquire_dquot(struct dquot *dquot); +static int ext4_release_dquot(struct dquot *dquot); +static int ext4_mark_dquot_dirty(struct dquot *dquot); +static int ext4_write_info(struct super_block *sb, int type); +static int ext4_quota_on(struct super_block *sb, int type, int format_id, + char *path, int remount); +static int ext4_quota_on_mount(struct super_block *sb, int type); +static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off); +static ssize_t ext4_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); + +static struct dquot_operations ext4_quota_operations = { + .initialize = ext4_dquot_initialize, + .drop = ext4_dquot_drop, + .alloc_space = dquot_alloc_space, + .alloc_inode = dquot_alloc_inode, + .free_space = dquot_free_space, + .free_inode = dquot_free_inode, + .transfer = dquot_transfer, + .write_dquot = ext4_write_dquot, + .acquire_dquot = ext4_acquire_dquot, + .release_dquot = ext4_release_dquot, + .mark_dirty = ext4_mark_dquot_dirty, + .write_info = ext4_write_info +}; + +static struct quotactl_ops ext4_qctl_operations = { + .quota_on = ext4_quota_on, + .quota_off = vfs_quota_off, + .quota_sync = vfs_quota_sync, + .get_info = vfs_get_dqinfo, + .set_info = vfs_set_dqinfo, + .get_dqblk = vfs_get_dqblk, + .set_dqblk = vfs_set_dqblk +}; +#endif + +static const struct super_operations ext4_sops = { + .alloc_inode = ext4_alloc_inode, + .destroy_inode = ext4_destroy_inode, + .write_inode = ext4_write_inode, + .dirty_inode = ext4_dirty_inode, + .delete_inode = ext4_delete_inode, + .put_super = ext4_put_super, + .write_super = ext4_write_super, + .sync_fs = ext4_sync_fs, + .write_super_lockfs = ext4_write_super_lockfs, + .unlockfs = ext4_unlockfs, + .statfs = ext4_statfs, + .remount_fs = ext4_remount, + .clear_inode = ext4_clear_inode, + .show_options = ext4_show_options, +#ifdef CONFIG_QUOTA + .quota_read = ext4_quota_read, + .quota_write = ext4_quota_write, +#endif +}; + +static const struct export_operations ext4_export_ops = { + .fh_to_dentry = ext4_fh_to_dentry, + .fh_to_parent = ext4_fh_to_parent, + .get_parent = ext4_get_parent, +}; + +enum { + Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, + Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, + Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, + Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_journal_checksum, Opt_journal_async_commit, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_data_err_abort, Opt_data_err_ignore, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, + Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, + Opt_inode_readahead_blks +}; + +static const match_table_t tokens = { + {Opt_bsd_df, "bsddf"}, + {Opt_minix_df, "minixdf"}, + {Opt_grpid, "grpid"}, + {Opt_grpid, "bsdgroups"}, + {Opt_nogrpid, "nogrpid"}, + {Opt_nogrpid, "sysvgroups"}, + {Opt_resgid, "resgid=%u"}, + {Opt_resuid, "resuid=%u"}, + {Opt_sb, "sb=%u"}, + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_nouid32, "nouid32"}, + {Opt_debug, "debug"}, + {Opt_oldalloc, "oldalloc"}, + {Opt_orlov, "orlov"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_reservation, "reservation"}, + {Opt_noreservation, "noreservation"}, + {Opt_noload, "noload"}, + {Opt_nobh, "nobh"}, + {Opt_bh, "bh"}, + {Opt_commit, "commit=%u"}, + {Opt_journal_update, "journal=update"}, + {Opt_journal_inum, "journal=%u"}, + {Opt_journal_dev, "journal_dev=%u"}, + {Opt_journal_checksum, "journal_checksum"}, + {Opt_journal_async_commit, "journal_async_commit"}, + {Opt_abort, "abort"}, + {Opt_data_journal, "data=journal"}, + {Opt_data_ordered, "data=ordered"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_data_err_abort, "data_err=abort"}, + {Opt_data_err_ignore, "data_err=ignore"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_grpquota, "grpquota"}, + {Opt_noquota, "noquota"}, + {Opt_quota, "quota"}, + {Opt_usrquota, "usrquota"}, + {Opt_barrier, "barrier=%u"}, + {Opt_extents, "extents"}, + {Opt_noextents, "noextents"}, + {Opt_i_version, "i_version"}, + {Opt_stripe, "stripe=%u"}, + {Opt_resize, "resize"}, + {Opt_delalloc, "delalloc"}, + {Opt_nodelalloc, "nodelalloc"}, + {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, + {Opt_err, NULL}, +}; + +static ext4_fsblk_t get_sb_block(void **data) +{ + ext4_fsblk_t sb_block; + char *options = (char *) *data; + + if (!options || strncmp(options, "sb=", 3) != 0) + return 1; /* Default location */ + options += 3; + /*todo: use simple_strtoll with >32bit ext4 */ + sb_block = simple_strtoul(options, &options, 0); + if (*options && *options != ',') { + printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n", + (char *) *data); + return 1; + } + if (*options == ',') + options++; + *data = (void *) options; + return sb_block; +} + +static int parse_options(char *options, struct super_block *sb, + unsigned int *inum, unsigned long *journal_devnum, + ext4_fsblk_t *n_blocks_count, int is_remount) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + char *p; + substring_t args[MAX_OPT_ARGS]; + int data_opt = 0; + int option; +#ifdef CONFIG_QUOTA + int qtype, qfmt; + char *qname; +#endif + ext4_fsblk_t last_block; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_bsd_df: + clear_opt(sbi->s_mount_opt, MINIX_DF); + break; + case Opt_minix_df: + set_opt(sbi->s_mount_opt, MINIX_DF); + break; + case Opt_grpid: + set_opt(sbi->s_mount_opt, GRPID); + break; + case Opt_nogrpid: + clear_opt(sbi->s_mount_opt, GRPID); + break; + case Opt_resuid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_resuid = option; + break; + case Opt_resgid: + if (match_int(&args[0], &option)) + return 0; + sbi->s_resgid = option; + break; + case Opt_sb: + /* handled by get_sb_block() instead of here */ + /* *sb_block = match_int(&args[0]); */ + break; + case Opt_err_panic: + clear_opt(sbi->s_mount_opt, ERRORS_CONT); + clear_opt(sbi->s_mount_opt, ERRORS_RO); + set_opt(sbi->s_mount_opt, ERRORS_PANIC); + break; + case Opt_err_ro: + clear_opt(sbi->s_mount_opt, ERRORS_CONT); + clear_opt(sbi->s_mount_opt, ERRORS_PANIC); + set_opt(sbi->s_mount_opt, ERRORS_RO); + break; + case Opt_err_cont: + clear_opt(sbi->s_mount_opt, ERRORS_RO); + clear_opt(sbi->s_mount_opt, ERRORS_PANIC); + set_opt(sbi->s_mount_opt, ERRORS_CONT); + break; + case Opt_nouid32: + set_opt(sbi->s_mount_opt, NO_UID32); + break; + case Opt_debug: + set_opt(sbi->s_mount_opt, DEBUG); + break; + case Opt_oldalloc: + set_opt(sbi->s_mount_opt, OLDALLOC); + break; + case Opt_orlov: + clear_opt(sbi->s_mount_opt, OLDALLOC); + break; +#ifdef CONFIG_EXT4_FS_XATTR + case Opt_user_xattr: + set_opt(sbi->s_mount_opt, XATTR_USER); + break; + case Opt_nouser_xattr: + clear_opt(sbi->s_mount_opt, XATTR_USER); + break; +#else + case Opt_user_xattr: + case Opt_nouser_xattr: + printk(KERN_ERR "EXT4 (no)user_xattr options " + "not supported\n"); + break; +#endif +#ifdef CONFIG_EXT4_FS_POSIX_ACL + case Opt_acl: + set_opt(sbi->s_mount_opt, POSIX_ACL); + break; + case Opt_noacl: + clear_opt(sbi->s_mount_opt, POSIX_ACL); + break; +#else + case Opt_acl: + case Opt_noacl: + printk(KERN_ERR "EXT4 (no)acl options " + "not supported\n"); + break; +#endif + case Opt_reservation: + set_opt(sbi->s_mount_opt, RESERVATION); + break; + case Opt_noreservation: + clear_opt(sbi->s_mount_opt, RESERVATION); + break; + case Opt_journal_update: + /* @@@ FIXME */ + /* Eventually we will want to be able to create + a journal file here. For now, only allow the + user to specify an existing inode to be the + journal file. */ + if (is_remount) { + printk(KERN_ERR "EXT4-fs: cannot specify " + "journal on remount\n"); + return 0; + } + set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); + break; + case Opt_journal_inum: + if (is_remount) { + printk(KERN_ERR "EXT4-fs: cannot specify " + "journal on remount\n"); + return 0; + } + if (match_int(&args[0], &option)) + return 0; + *inum = option; + break; + case Opt_journal_dev: + if (is_remount) { + printk(KERN_ERR "EXT4-fs: cannot specify " + "journal on remount\n"); + return 0; + } + if (match_int(&args[0], &option)) + return 0; + *journal_devnum = option; + break; + case Opt_journal_checksum: + set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); + break; + case Opt_journal_async_commit: + set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); + set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); + break; + case Opt_noload: + set_opt(sbi->s_mount_opt, NOLOAD); + break; + case Opt_commit: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + if (option == 0) + option = JBD2_DEFAULT_MAX_COMMIT_AGE; + sbi->s_commit_interval = HZ * option; + break; + case Opt_data_journal: + data_opt = EXT4_MOUNT_JOURNAL_DATA; + goto datacheck; + case Opt_data_ordered: + data_opt = EXT4_MOUNT_ORDERED_DATA; + goto datacheck; + case Opt_data_writeback: + data_opt = EXT4_MOUNT_WRITEBACK_DATA; + datacheck: + if (is_remount) { + if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS) + != data_opt) { + printk(KERN_ERR + "EXT4-fs: cannot change data " + "mode on remount\n"); + return 0; + } + } else { + sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS; + sbi->s_mount_opt |= data_opt; + } + break; + case Opt_data_err_abort: + set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; + case Opt_data_err_ignore: + clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; +#ifdef CONFIG_QUOTA + case Opt_usrjquota: + qtype = USRQUOTA; + goto set_qf_name; + case Opt_grpjquota: + qtype = GRPQUOTA; +set_qf_name: + if ((sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) && + !sbi->s_qf_names[qtype]) { + printk(KERN_ERR + "EXT4-fs: Cannot change journaled " + "quota options when quota turned on.\n"); + return 0; + } + qname = match_strdup(&args[0]); + if (!qname) { + printk(KERN_ERR + "EXT4-fs: not enough memory for " + "storing quotafile name.\n"); + return 0; + } + if (sbi->s_qf_names[qtype] && + strcmp(sbi->s_qf_names[qtype], qname)) { + printk(KERN_ERR + "EXT4-fs: %s quota file already " + "specified.\n", QTYPE2NAME(qtype)); + kfree(qname); + return 0; + } + sbi->s_qf_names[qtype] = qname; + if (strchr(sbi->s_qf_names[qtype], '/')) { + printk(KERN_ERR + "EXT4-fs: quotafile must be on " + "filesystem root.\n"); + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; + } + set_opt(sbi->s_mount_opt, QUOTA); + break; + case Opt_offusrjquota: + qtype = USRQUOTA; + goto clear_qf_name; + case Opt_offgrpjquota: + qtype = GRPQUOTA; +clear_qf_name: + if ((sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) && + sbi->s_qf_names[qtype]) { + printk(KERN_ERR "EXT4-fs: Cannot change " + "journaled quota options when " + "quota turned on.\n"); + return 0; + } + /* + * The space will be released later when all options + * are confirmed to be correct + */ + sbi->s_qf_names[qtype] = NULL; + break; + case Opt_jqfmt_vfsold: + qfmt = QFMT_VFS_OLD; + goto set_qf_format; + case Opt_jqfmt_vfsv0: + qfmt = QFMT_VFS_V0; +set_qf_format: + if ((sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) && + sbi->s_jquota_fmt != qfmt) { + printk(KERN_ERR "EXT4-fs: Cannot change " + "journaled quota options when " + "quota turned on.\n"); + return 0; + } + sbi->s_jquota_fmt = qfmt; + break; + case Opt_quota: + case Opt_usrquota: + set_opt(sbi->s_mount_opt, QUOTA); + set_opt(sbi->s_mount_opt, USRQUOTA); + break; + case Opt_grpquota: + set_opt(sbi->s_mount_opt, QUOTA); + set_opt(sbi->s_mount_opt, GRPQUOTA); + break; + case Opt_noquota: + if (sb_any_quota_enabled(sb)) { + printk(KERN_ERR "EXT4-fs: Cannot change quota " + "options when quota turned on.\n"); + return 0; + } + clear_opt(sbi->s_mount_opt, QUOTA); + clear_opt(sbi->s_mount_opt, USRQUOTA); + clear_opt(sbi->s_mount_opt, GRPQUOTA); + break; +#else + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + printk(KERN_ERR + "EXT4-fs: quota options not supported.\n"); + break; + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_offusrjquota: + case Opt_offgrpjquota: + case Opt_jqfmt_vfsold: + case Opt_jqfmt_vfsv0: + printk(KERN_ERR + "EXT4-fs: journaled quota options not " + "supported.\n"); + break; + case Opt_noquota: + break; +#endif + case Opt_abort: + set_opt(sbi->s_mount_opt, ABORT); + break; + case Opt_barrier: + if (match_int(&args[0], &option)) + return 0; + if (option) + set_opt(sbi->s_mount_opt, BARRIER); + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; + case Opt_ignore: + break; + case Opt_resize: + if (!is_remount) { + printk("EXT4-fs: resize option only available " + "for remount\n"); + return 0; + } + if (match_int(&args[0], &option) != 0) + return 0; + *n_blocks_count = option; + break; + case Opt_nobh: + set_opt(sbi->s_mount_opt, NOBH); + break; + case Opt_bh: + clear_opt(sbi->s_mount_opt, NOBH); + break; + case Opt_extents: + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_EXTENTS)) { + ext4_warning(sb, __func__, + "extents feature not enabled " + "on this filesystem, use tune2fs\n"); + return 0; + } + set_opt(sbi->s_mount_opt, EXTENTS); + break; + case Opt_noextents: + /* + * When e2fsprogs support resizing an already existing + * ext3 file system to greater than 2**32 we need to + * add support to block allocator to handle growing + * already existing block mapped inode so that blocks + * allocated for them fall within 2**32 + */ + last_block = ext4_blocks_count(sbi->s_es) - 1; + if (last_block > 0xffffffffULL) { + printk(KERN_ERR "EXT4-fs: Filesystem too " + "large to mount with " + "-o noextents options\n"); + return 0; + } + clear_opt(sbi->s_mount_opt, EXTENTS); + break; + case Opt_i_version: + set_opt(sbi->s_mount_opt, I_VERSION); + sb->s_flags |= MS_I_VERSION; + break; + case Opt_nodelalloc: + clear_opt(sbi->s_mount_opt, DELALLOC); + break; + case Opt_stripe: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + sbi->s_stripe = option; + break; + case Opt_delalloc: + set_opt(sbi->s_mount_opt, DELALLOC); + break; + case Opt_inode_readahead_blks: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > (1 << 30)) + return 0; + sbi->s_inode_readahead_blks = option; + break; + default: + printk(KERN_ERR + "EXT4-fs: Unrecognized mount option \"%s\" " + "or missing value\n", p); + return 0; + } + } +#ifdef CONFIG_QUOTA + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { + if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) && + sbi->s_qf_names[USRQUOTA]) + clear_opt(sbi->s_mount_opt, USRQUOTA); + + if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) && + sbi->s_qf_names[GRPQUOTA]) + clear_opt(sbi->s_mount_opt, GRPQUOTA); + + if ((sbi->s_qf_names[USRQUOTA] && + (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) || + (sbi->s_qf_names[GRPQUOTA] && + (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) { + printk(KERN_ERR "EXT4-fs: old and new quota " + "format mixing.\n"); + return 0; + } + + if (!sbi->s_jquota_fmt) { + printk(KERN_ERR "EXT4-fs: journaled quota format " + "not specified.\n"); + return 0; + } + } else { + if (sbi->s_jquota_fmt) { + printk(KERN_ERR "EXT4-fs: journaled quota format " + "specified with no journaling " + "enabled.\n"); + return 0; + } + } +#endif + return 1; +} + +static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, + int read_only) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int res = 0; + + if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { + printk(KERN_ERR "EXT4-fs warning: revision level too high, " + "forcing read-only mode\n"); + res = MS_RDONLY; + } + if (read_only) + return res; + if (!(sbi->s_mount_state & EXT4_VALID_FS)) + printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, " + "running e2fsck is recommended\n"); + else if ((sbi->s_mount_state & EXT4_ERROR_FS)) + printk(KERN_WARNING + "EXT4-fs warning: mounting fs with errors, " + "running e2fsck is recommended\n"); + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && + le16_to_cpu(es->s_mnt_count) >= + (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) + printk(KERN_WARNING + "EXT4-fs warning: maximal mount count reached, " + "running e2fsck is recommended\n"); + else if (le32_to_cpu(es->s_checkinterval) && + (le32_to_cpu(es->s_lastcheck) + + le32_to_cpu(es->s_checkinterval) <= get_seconds())) + printk(KERN_WARNING + "EXT4-fs warning: checktime reached, " + "running e2fsck is recommended\n"); +#if 0 + /* @@@ We _will_ want to clear the valid bit if we find + * inconsistencies, to force a fsck at reboot. But for + * a plain journaled filesystem we can keep it set as + * valid forever! :) + */ + es->s_state &= cpu_to_le16(~EXT4_VALID_FS); +#endif + if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) + es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); + le16_add_cpu(&es->s_mnt_count, 1); + es->s_mtime = cpu_to_le32(get_seconds()); + ext4_update_dynamic_rev(sb); + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + + ext4_commit_super(sb, es, 1); + if (test_opt(sb, DEBUG)) + printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, " + "bpg=%lu, ipg=%lu, mo=%04lx]\n", + sb->s_blocksize, + sbi->s_groups_count, + EXT4_BLOCKS_PER_GROUP(sb), + EXT4_INODES_PER_GROUP(sb), + sbi->s_mount_opt); + + printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n", + sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" : + "external", EXT4_SB(sb)->s_journal->j_devname); + return res; +} + +static int ext4_fill_flex_info(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + struct buffer_head *bh; + ext4_group_t flex_group_count; + ext4_group_t flex_group; + int groups_per_flex = 0; + int i; + + if (!sbi->s_es->s_log_groups_per_flex) { + sbi->s_log_groups_per_flex = 0; + return 1; + } + + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; + groups_per_flex = 1 << sbi->s_log_groups_per_flex; + + /* We allocate both existing and potentially added groups */ + flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + + ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << + EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; + sbi->s_flex_groups = kzalloc(flex_group_count * + sizeof(struct flex_groups), GFP_KERNEL); + if (sbi->s_flex_groups == NULL) { + printk(KERN_ERR "EXT4-fs: not enough memory for " + "%lu flex groups\n", flex_group_count); + goto failed; + } + + for (i = 0; i < sbi->s_groups_count; i++) { + gdp = ext4_get_group_desc(sb, i, &bh); + + flex_group = ext4_flex_group(sbi, i); + sbi->s_flex_groups[flex_group].free_inodes += + le16_to_cpu(gdp->bg_free_inodes_count); + sbi->s_flex_groups[flex_group].free_blocks += + le16_to_cpu(gdp->bg_free_blocks_count); + } + + return 1; +failed: + return 0; +} + +__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, + struct ext4_group_desc *gdp) +{ + __u16 crc = 0; + + if (sbi->s_es->s_feature_ro_compat & + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + int offset = offsetof(struct ext4_group_desc, bg_checksum); + __le32 le_group = cpu_to_le32(block_group); + + crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); + crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); + crc = crc16(crc, (__u8 *)gdp, offset); + offset += sizeof(gdp->bg_checksum); /* skip checksum */ + /* for checksum of struct ext4_group_desc do the rest...*/ + if ((sbi->s_es->s_feature_incompat & + cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) && + offset < le16_to_cpu(sbi->s_es->s_desc_size)) + crc = crc16(crc, (__u8 *)gdp + offset, + le16_to_cpu(sbi->s_es->s_desc_size) - + offset); + } + + return cpu_to_le16(crc); +} + +int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group, + struct ext4_group_desc *gdp) +{ + if ((sbi->s_es->s_feature_ro_compat & + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) && + (gdp->bg_checksum != ext4_group_desc_csum(sbi, block_group, gdp))) + return 0; + + return 1; +} + +/* Called at mount-time, super-block is locked */ +static int ext4_check_descriptors(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); + ext4_fsblk_t last_block; + ext4_fsblk_t block_bitmap; + ext4_fsblk_t inode_bitmap; + ext4_fsblk_t inode_table; + int flexbg_flag = 0; + ext4_group_t i; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + flexbg_flag = 1; + + ext4_debug("Checking group descriptors"); + + for (i = 0; i < sbi->s_groups_count; i++) { + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); + + if (i == sbi->s_groups_count - 1 || flexbg_flag) + last_block = ext4_blocks_count(sbi->s_es) - 1; + else + last_block = first_block + + (EXT4_BLOCKS_PER_GROUP(sb) - 1); + + block_bitmap = ext4_block_bitmap(sb, gdp); + if (block_bitmap < first_block || block_bitmap > last_block) { + printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " + "Block bitmap for group %lu not in group " + "(block %llu)!\n", i, block_bitmap); + return 0; + } + inode_bitmap = ext4_inode_bitmap(sb, gdp); + if (inode_bitmap < first_block || inode_bitmap > last_block) { + printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " + "Inode bitmap for group %lu not in group " + "(block %llu)!\n", i, inode_bitmap); + return 0; + } + inode_table = ext4_inode_table(sb, gdp); + if (inode_table < first_block || + inode_table + sbi->s_itb_per_group - 1 > last_block) { + printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " + "Inode table for group %lu not in group " + "(block %llu)!\n", i, inode_table); + return 0; + } + spin_lock(sb_bgl_lock(sbi, i)); + if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { + printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " + "Checksum for group %lu failed (%u!=%u)\n", + i, le16_to_cpu(ext4_group_desc_csum(sbi, i, + gdp)), le16_to_cpu(gdp->bg_checksum)); + if (!(sb->s_flags & MS_RDONLY)) { + spin_unlock(sb_bgl_lock(sbi, i)); + return 0; + } + } + spin_unlock(sb_bgl_lock(sbi, i)); + if (!flexbg_flag) + first_block += EXT4_BLOCKS_PER_GROUP(sb); + } + + ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); + sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); + return 1; +} + +/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at + * the superblock) which were deleted from all directories, but held open by + * a process at the time of a crash. We walk the list and try to delete these + * inodes at recovery time (only with a read-write filesystem). + * + * In order to keep the orphan inode chain consistent during traversal (in + * case of crash during recovery), we link each inode into the superblock + * orphan list_head and handle it the same way as an inode deletion during + * normal operation (which journals the operations for us). + * + * We only do an iget() and an iput() on each inode, which is very safe if we + * accidentally point at an in-use or already deleted inode. The worst that + * can happen in this case is that we get a "bit already cleared" message from + * ext4_free_inode(). The only reason we would point at a wrong inode is if + * e2fsck was run on this filesystem, and it must have already done the orphan + * inode cleanup for us, so we can safely abort without any further action. + */ +static void ext4_orphan_cleanup(struct super_block *sb, + struct ext4_super_block *es) +{ + unsigned int s_flags = sb->s_flags; + int nr_orphans = 0, nr_truncates = 0; +#ifdef CONFIG_QUOTA + int i; +#endif + if (!es->s_last_orphan) { + jbd_debug(4, "no orphan inodes to clean up\n"); + return; + } + + if (bdev_read_only(sb->s_bdev)) { + printk(KERN_ERR "EXT4-fs: write access " + "unavailable, skipping orphan cleanup.\n"); + return; + } + + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + if (es->s_last_orphan) + jbd_debug(1, "Errors on filesystem, " + "clearing orphan list.\n"); + es->s_last_orphan = 0; + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + return; + } + + if (s_flags & MS_RDONLY) { + printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n", + sb->s_id); + sb->s_flags &= ~MS_RDONLY; + } +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + for (i = 0; i < MAXQUOTAS; i++) { + if (EXT4_SB(sb)->s_qf_names[i]) { + int ret = ext4_quota_on_mount(sb, i); + if (ret < 0) + printk(KERN_ERR + "EXT4-fs: Cannot turn on journaled " + "quota: error %d\n", ret); + } + } +#endif + + while (es->s_last_orphan) { + struct inode *inode; + + inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); + if (IS_ERR(inode)) { + es->s_last_orphan = 0; + break; + } + + list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); + DQUOT_INIT(inode); + if (inode->i_nlink) { + printk(KERN_DEBUG + "%s: truncating inode %lu to %lld bytes\n", + __func__, inode->i_ino, inode->i_size); + jbd_debug(2, "truncating inode %lu to %lld bytes\n", + inode->i_ino, inode->i_size); + ext4_truncate(inode); + nr_truncates++; + } else { + printk(KERN_DEBUG + "%s: deleting unreferenced inode %lu\n", + __func__, inode->i_ino); + jbd_debug(2, "deleting unreferenced inode %lu\n", + inode->i_ino); + nr_orphans++; + } + iput(inode); /* The delete magic happens here! */ + } + +#define PLURAL(x) (x), ((x) == 1) ? "" : "s" + + if (nr_orphans) + printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n", + sb->s_id, PLURAL(nr_orphans)); + if (nr_truncates) + printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n", + sb->s_id, PLURAL(nr_truncates)); +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + for (i = 0; i < MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + vfs_quota_off(sb, i, 0); + } +#endif + sb->s_flags = s_flags; /* Restore MS_RDONLY status */ +} +/* + * Maximal extent format file size. + * Resulting logical blkno at s_maxbytes must fit in our on-disk + * extent format containers, within a sector_t, and within i_blocks + * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, + * so that won't be a limiting factor. + * + * Note, this does *not* consider any metadata overhead for vfs i_blocks. + */ +static loff_t ext4_max_size(int blkbits, int has_huge_files) +{ + loff_t res; + loff_t upper_limit = MAX_LFS_FILESIZE; + + /* small i_blocks in vfs inode? */ + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { + /* + * CONFIG_LSF is not enabled implies the inode + * i_block represent total blocks in 512 bytes + * 32 == size of vfs inode i_blocks * 8 + */ + upper_limit = (1LL << 32) - 1; + + /* total blocks in file system block size */ + upper_limit >>= (blkbits - 9); + upper_limit <<= blkbits; + } + + /* 32-bit extent-start container, ee_block */ + res = 1LL << 32; + res <<= blkbits; + res -= 1; + + /* Sanity check against vm- & vfs- imposed limits */ + if (res > upper_limit) + res = upper_limit; + + return res; +} + +/* + * Maximal bitmap file size. There is a direct, and {,double-,triple-}indirect + * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. + * We need to be 1 filesystem block less than the 2^48 sector limit. + */ +static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) +{ + loff_t res = EXT4_NDIR_BLOCKS; + int meta_blocks; + loff_t upper_limit; + /* This is calculated to be the largest file size for a + * dense, bitmapped file such that the total number of + * sectors in the file, including data and all indirect blocks, + * does not exceed 2^48 -1 + * __u32 i_blocks_lo and _u16 i_blocks_high representing the + * total number of 512 bytes blocks of the file + */ + + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { + /* + * !has_huge_files or CONFIG_LSF is not enabled + * implies the inode i_block represent total blocks in + * 512 bytes 32 == size of vfs inode i_blocks * 8 + */ + upper_limit = (1LL << 32) - 1; + + /* total blocks in file system block size */ + upper_limit >>= (bits - 9); + + } else { + /* + * We use 48 bit ext4_inode i_blocks + * With EXT4_HUGE_FILE_FL set the i_blocks + * represent total number of blocks in + * file system block size + */ + upper_limit = (1LL << 48) - 1; + + } + + /* indirect blocks */ + meta_blocks = 1; + /* double indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)); + /* tripple indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); + + upper_limit -= meta_blocks; + upper_limit <<= bits; + + res += 1LL << (bits-2); + res += 1LL << (2*(bits-2)); + res += 1LL << (3*(bits-2)); + res <<= bits; + if (res > upper_limit) + res = upper_limit; + + if (res > MAX_LFS_FILESIZE) + res = MAX_LFS_FILESIZE; + + return res; +} + +static ext4_fsblk_t descriptor_loc(struct super_block *sb, + ext4_fsblk_t logical_sb_block, int nr) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t bg, first_meta_bg; + int has_super = 0; + + first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || + nr < first_meta_bg) + return logical_sb_block + nr + 1; + bg = sbi->s_desc_per_block * nr; + if (ext4_bg_has_super(sb, bg)) + has_super = 1; + return (has_super + ext4_group_first_block_no(sb, bg)); +} + +/** + * ext4_get_stripe_size: Get the stripe size. + * @sbi: In memory super block info + * + * If we have specified it via mount option, then + * use the mount option value. If the value specified at mount time is + * greater than the blocks per group use the super block value. + * If the super block value is greater than blocks per group return 0. + * Allocator needs it be less than blocks per group. + * + */ +static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) +{ + unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); + unsigned long stripe_width = + le32_to_cpu(sbi->s_es->s_raid_stripe_width); + + if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) + return sbi->s_stripe; + + if (stripe_width <= sbi->s_blocks_per_group) + return stripe_width; + + if (stride <= sbi->s_blocks_per_group) + return stride; + + return 0; +} + +static int ext4_fill_super(struct super_block *sb, void *data, int silent) + __releases(kernel_lock) + __acquires(kernel_lock) + +{ + struct buffer_head *bh; + struct ext4_super_block *es = NULL; + struct ext4_sb_info *sbi; + ext4_fsblk_t block; + ext4_fsblk_t sb_block = get_sb_block(&data); + ext4_fsblk_t logical_sb_block; + unsigned long offset = 0; + unsigned int journal_inum = 0; + unsigned long journal_devnum = 0; + unsigned long def_mount_opts; + struct inode *root; + char *cp; + int ret = -EINVAL; + int blocksize; + unsigned int db_count; + unsigned int i; + int needs_recovery, has_huge_files; + __le32 features; + __u64 blocks_count; + int err; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + sb->s_fs_info = sbi; + sbi->s_mount_opt = 0; + sbi->s_resuid = EXT4_DEF_RESUID; + sbi->s_resgid = EXT4_DEF_RESGID; + sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; + sbi->s_sb_block = sb_block; + + unlock_kernel(); + + /* Cleanup superblock name */ + for (cp = sb->s_id; (cp = strchr(cp, '/'));) + *cp = '!'; + + blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); + if (!blocksize) { + printk(KERN_ERR "EXT4-fs: unable to set blocksize\n"); + goto out_fail; + } + + /* + * The ext4 superblock will not be buffer aligned for other than 1kB + * block sizes. We need to calculate the offset from buffer start. + */ + if (blocksize != EXT4_MIN_BLOCK_SIZE) { + logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; + offset = do_div(logical_sb_block, blocksize); + } else { + logical_sb_block = sb_block; + } + + if (!(bh = sb_bread(sb, logical_sb_block))) { + printk(KERN_ERR "EXT4-fs: unable to read superblock\n"); + goto out_fail; + } + /* + * Note: s_es must be initialized as soon as possible because + * some ext4 macro-instructions depend on its value + */ + es = (struct ext4_super_block *) (((char *)bh->b_data) + offset); + sbi->s_es = es; + sb->s_magic = le16_to_cpu(es->s_magic); + if (sb->s_magic != EXT4_SUPER_MAGIC) + goto cantfind_ext4; + + /* Set defaults before we parse the mount options */ + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + if (def_mount_opts & EXT4_DEFM_DEBUG) + set_opt(sbi->s_mount_opt, DEBUG); + if (def_mount_opts & EXT4_DEFM_BSDGROUPS) + set_opt(sbi->s_mount_opt, GRPID); + if (def_mount_opts & EXT4_DEFM_UID16) + set_opt(sbi->s_mount_opt, NO_UID32); +#ifdef CONFIG_EXT4_FS_XATTR + if (def_mount_opts & EXT4_DEFM_XATTR_USER) + set_opt(sbi->s_mount_opt, XATTR_USER); +#endif +#ifdef CONFIG_EXT4_FS_POSIX_ACL + if (def_mount_opts & EXT4_DEFM_ACL) + set_opt(sbi->s_mount_opt, POSIX_ACL); +#endif + if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) + sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; + else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) + sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA; + else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) + sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA; + + if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) + set_opt(sbi->s_mount_opt, ERRORS_PANIC); + else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) + set_opt(sbi->s_mount_opt, ERRORS_CONT); + else + set_opt(sbi->s_mount_opt, ERRORS_RO); + + sbi->s_resuid = le16_to_cpu(es->s_def_resuid); + sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + + set_opt(sbi->s_mount_opt, RESERVATION); + set_opt(sbi->s_mount_opt, BARRIER); + + /* + * turn on extents feature by default in ext4 filesystem + * only if feature flag already set by mkfs or tune2fs. + * Use -o noextents to turn it off + */ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) + set_opt(sbi->s_mount_opt, EXTENTS); + else + ext4_warning(sb, __func__, + "extents feature not enabled on this filesystem, " + "use tune2fs.\n"); + + /* + * enable delayed allocation by default + * Use -o nodelalloc to turn it off + */ + set_opt(sbi->s_mount_opt, DELALLOC); + + + if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum, + NULL, 0)) + goto failed_mount; + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && + (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || + EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) || + EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U))) + printk(KERN_WARNING + "EXT4-fs warning: feature flags set on rev 0 fs, " + "running e2fsck is recommended\n"); + + /* + * Check feature flags regardless of the revision level, since we + * previously didn't change the revision level when setting the flags, + * so there is a chance incompat flags are set on a rev 0 filesystem. + */ + features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); + if (features) { + printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of " + "unsupported optional features (%x).\n", + sb->s_id, le32_to_cpu(features)); + goto failed_mount; + } + features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); + if (!(sb->s_flags & MS_RDONLY) && features) { + printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of " + "unsupported optional features (%x).\n", + sb->s_id, le32_to_cpu(features)); + goto failed_mount; + } + has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_HUGE_FILE); + if (has_huge_files) { + /* + * Large file size enabled file system can only be + * mount if kernel is build with CONFIG_LSF + */ + if (sizeof(root->i_blocks) < sizeof(u64) && + !(sb->s_flags & MS_RDONLY)) { + printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge " + "files cannot be mounted read-write " + "without CONFIG_LSF.\n", sb->s_id); + goto failed_mount; + } + } + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + + if (blocksize < EXT4_MIN_BLOCK_SIZE || + blocksize > EXT4_MAX_BLOCK_SIZE) { + printk(KERN_ERR + "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n", + blocksize, sb->s_id); + goto failed_mount; + } + + if (sb->s_blocksize != blocksize) { + + /* Validate the filesystem blocksize */ + if (!sb_set_blocksize(sb, blocksize)) { + printk(KERN_ERR "EXT4-fs: bad block size %d.\n", + blocksize); + goto failed_mount; + } + + brelse(bh); + logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; + offset = do_div(logical_sb_block, blocksize); + bh = sb_bread(sb, logical_sb_block); + if (!bh) { + printk(KERN_ERR + "EXT4-fs: Can't read superblock on 2nd try.\n"); + goto failed_mount; + } + es = (struct ext4_super_block *)(((char *)bh->b_data) + offset); + sbi->s_es = es; + if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { + printk(KERN_ERR + "EXT4-fs: Magic mismatch, very weird !\n"); + goto failed_mount; + } + } + + sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, + has_huge_files); + sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); + + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { + sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; + sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; + } else { + sbi->s_inode_size = le16_to_cpu(es->s_inode_size); + sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || + (!is_power_of_2(sbi->s_inode_size)) || + (sbi->s_inode_size > blocksize)) { + printk(KERN_ERR + "EXT4-fs: unsupported inode size: %d\n", + sbi->s_inode_size); + goto failed_mount; + } + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) + sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2); + } + sbi->s_desc_size = le16_to_cpu(es->s_desc_size); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) { + if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || + sbi->s_desc_size > EXT4_MAX_DESC_SIZE || + !is_power_of_2(sbi->s_desc_size)) { + printk(KERN_ERR + "EXT4-fs: unsupported descriptor size %lu\n", + sbi->s_desc_size); + goto failed_mount; + } + } else + sbi->s_desc_size = EXT4_MIN_DESC_SIZE; + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); + if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) + goto cantfind_ext4; + sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); + if (sbi->s_inodes_per_block == 0) + goto cantfind_ext4; + sbi->s_itb_per_group = sbi->s_inodes_per_group / + sbi->s_inodes_per_block; + sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); + sbi->s_sbh = bh; + sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); + for (i = 0; i < 4; i++) + sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); + sbi->s_def_hash_version = es->s_def_hash_version; + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + sb->s_dirt = 1; + } + + if (sbi->s_blocks_per_group > blocksize * 8) { + printk(KERN_ERR + "EXT4-fs: #blocks per group too big: %lu\n", + sbi->s_blocks_per_group); + goto failed_mount; + } + if (sbi->s_inodes_per_group > blocksize * 8) { + printk(KERN_ERR + "EXT4-fs: #inodes per group too big: %lu\n", + sbi->s_inodes_per_group); + goto failed_mount; + } + + if (ext4_blocks_count(es) > + (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { + printk(KERN_ERR "EXT4-fs: filesystem on %s:" + " too large to mount safely\n", sb->s_id); + if (sizeof(sector_t) < 8) + printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not " + "enabled\n"); + goto failed_mount; + } + + if (EXT4_BLOCKS_PER_GROUP(sb) == 0) + goto cantfind_ext4; + + /* + * It makes no sense for the first data block to be beyond the end + * of the filesystem. + */ + if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { + printk(KERN_WARNING "EXT4-fs: bad geometry: first data" + "block %u is beyond end of filesystem (%llu)\n", + le32_to_cpu(es->s_first_data_block), + ext4_blocks_count(es)); + goto failed_mount; + } + blocks_count = (ext4_blocks_count(es) - + le32_to_cpu(es->s_first_data_block) + + EXT4_BLOCKS_PER_GROUP(sb) - 1); + do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); + if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { + printk(KERN_WARNING "EXT4-fs: groups count too large: %u " + "(block count %llu, first data block %u, " + "blocks per group %lu)\n", sbi->s_groups_count, + ext4_blocks_count(es), + le32_to_cpu(es->s_first_data_block), + EXT4_BLOCKS_PER_GROUP(sb)); + goto failed_mount; + } + sbi->s_groups_count = blocks_count; + db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / + EXT4_DESC_PER_BLOCK(sb); + sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), + GFP_KERNEL); + if (sbi->s_group_desc == NULL) { + printk(KERN_ERR "EXT4-fs: not enough memory\n"); + goto failed_mount; + } + +#ifdef CONFIG_PROC_FS + if (ext4_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); + + if (sbi->s_proc) + proc_create_data("inode_readahead_blks", 0644, sbi->s_proc, + &ext4_ui_proc_fops, + &sbi->s_inode_readahead_blks); +#endif + + bgl_lock_init(&sbi->s_blockgroup_lock); + + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logical_sb_block, i); + sbi->s_group_desc[i] = sb_bread(sb, block); + if (!sbi->s_group_desc[i]) { + printk(KERN_ERR "EXT4-fs: " + "can't read group descriptor %d\n", i); + db_count = i; + goto failed_mount2; + } + } + if (!ext4_check_descriptors(sb)) { + printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); + goto failed_mount2; + } + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + if (!ext4_fill_flex_info(sb)) { + printk(KERN_ERR + "EXT4-fs: unable to initialize " + "flex_bg meta info!\n"); + goto failed_mount2; + } + + sbi->s_gdb_count = db_count; + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + spin_lock_init(&sbi->s_next_gen_lock); + + err = percpu_counter_init(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + if (!err) { + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); + } + if (err) { + printk(KERN_ERR "EXT4-fs: insufficient memory\n"); + goto failed_mount3; + } + + sbi->s_stripe = ext4_get_stripe_size(sbi); + + /* + * set up enough so that it can read an inode + */ + sb->s_op = &ext4_sops; + sb->s_export_op = &ext4_export_ops; + sb->s_xattr = ext4_xattr_handlers; +#ifdef CONFIG_QUOTA + sb->s_qcop = &ext4_qctl_operations; + sb->dq_op = &ext4_quota_operations; +#endif + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + + sb->s_root = NULL; + + needs_recovery = (es->s_last_orphan != 0 || + EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_RECOVER)); + + /* + * The first inode we look at is the journal inode. Don't try + * root first: it may be modified in the journal! + */ + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { + if (ext4_load_journal(sb, es, journal_devnum)) + goto failed_mount3; + if (!(sb->s_flags & MS_RDONLY) && + EXT4_SB(sb)->s_journal->j_failed_commit) { + printk(KERN_CRIT "EXT4-fs error (device %s): " + "ext4_fill_super: Journal transaction " + "%u is corrupt\n", sb->s_id, + EXT4_SB(sb)->s_journal->j_failed_commit); + if (test_opt(sb, ERRORS_RO)) { + printk(KERN_CRIT + "Mounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + } + if (test_opt(sb, ERRORS_PANIC)) { + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + ext4_commit_super(sb, es, 1); + printk(KERN_CRIT + "EXT4-fs (device %s): mount failed\n", + sb->s_id); + goto failed_mount4; + } + } + } else if (journal_inum) { + if (ext4_create_journal(sb, es, journal_inum)) + goto failed_mount3; + } else { + if (!silent) + printk(KERN_ERR + "ext4: No journal on filesystem on %s\n", + sb->s_id); + goto failed_mount3; + } + + if (ext4_blocks_count(es) > 0xffffffffULL && + !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_64BIT)) { + printk(KERN_ERR "ext4: Failed to set 64-bit journal feature\n"); + goto failed_mount4; + } + + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + jbd2_journal_set_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } else if (test_opt(sb, JOURNAL_CHECKSUM)) { + jbd2_journal_set_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); + jbd2_journal_clear_features(sbi->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } else { + jbd2_journal_clear_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } + + /* We have now updated the journal if required, so we can + * validate the data journaling mode. */ + switch (test_opt(sb, DATA_FLAGS)) { + case 0: + /* No mode set, assume a default based on the journal + * capabilities: ORDERED_DATA if the journal can + * cope, else JOURNAL_DATA + */ + if (jbd2_journal_check_available_features + (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) + set_opt(sbi->s_mount_opt, ORDERED_DATA); + else + set_opt(sbi->s_mount_opt, JOURNAL_DATA); + break; + + case EXT4_MOUNT_ORDERED_DATA: + case EXT4_MOUNT_WRITEBACK_DATA: + if (!jbd2_journal_check_available_features + (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { + printk(KERN_ERR "EXT4-fs: Journal does not support " + "requested data journaling mode\n"); + goto failed_mount4; + } + default: + break; + } + + if (test_opt(sb, NOBH)) { + if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { + printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - " + "its supported only with writeback mode\n"); + clear_opt(sbi->s_mount_opt, NOBH); + } + } + /* + * The jbd2_journal_load will have done any necessary log recovery, + * so we can safely mount the rest of the filesystem now. + */ + + root = ext4_iget(sb, EXT4_ROOT_INO); + if (IS_ERR(root)) { + printk(KERN_ERR "EXT4-fs: get root inode failed\n"); + ret = PTR_ERR(root); + goto failed_mount4; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + iput(root); + printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n"); + goto failed_mount4; + } + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + printk(KERN_ERR "EXT4-fs: get root dentry failed\n"); + iput(root); + ret = -ENOMEM; + goto failed_mount4; + } + + ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY); + + /* determine the minimum size of new large inodes, if present */ + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) { + if (sbi->s_want_extra_isize < + le16_to_cpu(es->s_want_extra_isize)) + sbi->s_want_extra_isize = + le16_to_cpu(es->s_want_extra_isize); + if (sbi->s_want_extra_isize < + le16_to_cpu(es->s_min_extra_isize)) + sbi->s_want_extra_isize = + le16_to_cpu(es->s_min_extra_isize); + } + } + /* Check if enough inode space is available */ + if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > + sbi->s_inode_size) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + printk(KERN_INFO "EXT4-fs: required extra inode space not" + "available.\n"); + } + + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " + "requested data journaling mode\n"); + clear_opt(sbi->s_mount_opt, DELALLOC); + } else if (test_opt(sb, DELALLOC)) + printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); + + ext4_ext_init(sb); + err = ext4_mb_init(sb, needs_recovery); + if (err) { + printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n", + err); + goto failed_mount4; + } + + /* + * akpm: core read_super() calls in here with the superblock locked. + * That deadlocks, because orphan cleanup needs to lock the superblock + * in numerous places. Here we just pop the lock - it's relatively + * harmless, because we are now ready to accept write_super() requests, + * and aviro says that's the only reason for hanging onto the + * superblock lock. + */ + EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; + ext4_orphan_cleanup(sb, es); + EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; + if (needs_recovery) + printk(KERN_INFO "EXT4-fs: recovery complete.\n"); + ext4_mark_recovery_complete(sb, es); + printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n", + test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": + test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": + "writeback"); + + lock_kernel(); + return 0; + +cantfind_ext4: + if (!silent) + printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n", + sb->s_id); + goto failed_mount; + +failed_mount4: + jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; +failed_mount3: + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); +failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); +failed_mount: + if (sbi->s_proc) { + remove_proc_entry("inode_readahead_blks", sbi->s_proc); + remove_proc_entry(sb->s_id, ext4_proc_root); + } +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif + ext4_blkdev_remove(sbi); + brelse(bh); +out_fail: + sb->s_fs_info = NULL; + kfree(sbi); + lock_kernel(); + return ret; +} + +/* + * Setup any per-fs journal parameters now. We'll do this both on + * initial mount, once the journal has been initialised but before we've + * done any recovery; and again on any subsequent remount. + */ +static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sbi->s_commit_interval) + journal->j_commit_interval = sbi->s_commit_interval; + /* We could also set up an ext4-specific default for the commit + * interval here, but for now we'll just fall back to the jbd + * default. */ + + spin_lock(&journal->j_state_lock); + if (test_opt(sb, BARRIER)) + journal->j_flags |= JBD2_BARRIER; + else + journal->j_flags &= ~JBD2_BARRIER; + if (test_opt(sb, DATA_ERR_ABORT)) + journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; + else + journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; + spin_unlock(&journal->j_state_lock); +} + +static journal_t *ext4_get_journal(struct super_block *sb, + unsigned int journal_inum) +{ + struct inode *journal_inode; + journal_t *journal; + + /* First, test for the existence of a valid inode on disk. Bad + * things happen if we iget() an unused inode, as the subsequent + * iput() will try to delete it. */ + + journal_inode = ext4_iget(sb, journal_inum); + if (IS_ERR(journal_inode)) { + printk(KERN_ERR "EXT4-fs: no journal found.\n"); + return NULL; + } + if (!journal_inode->i_nlink) { + make_bad_inode(journal_inode); + iput(journal_inode); + printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n"); + return NULL; + } + + jbd_debug(2, "Journal inode found at %p: %lld bytes\n", + journal_inode, journal_inode->i_size); + if (!S_ISREG(journal_inode->i_mode)) { + printk(KERN_ERR "EXT4-fs: invalid journal inode.\n"); + iput(journal_inode); + return NULL; + } + + journal = jbd2_journal_init_inode(journal_inode); + if (!journal) { + printk(KERN_ERR "EXT4-fs: Could not load journal inode\n"); + iput(journal_inode); + return NULL; + } + journal->j_private = sb; + ext4_init_journal_params(sb, journal); + return journal; +} + +static journal_t *ext4_get_dev_journal(struct super_block *sb, + dev_t j_dev) +{ + struct buffer_head *bh; + journal_t *journal; + ext4_fsblk_t start; + ext4_fsblk_t len; + int hblock, blocksize; + ext4_fsblk_t sb_block; + unsigned long offset; + struct ext4_super_block *es; + struct block_device *bdev; + + bdev = ext4_blkdev_get(j_dev); + if (bdev == NULL) + return NULL; + + if (bd_claim(bdev, sb)) { + printk(KERN_ERR + "EXT4: failed to claim external journal device.\n"); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + return NULL; + } + + blocksize = sb->s_blocksize; + hblock = bdev_hardsect_size(bdev); + if (blocksize < hblock) { + printk(KERN_ERR + "EXT4-fs: blocksize too small for journal device.\n"); + goto out_bdev; + } + + sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; + offset = EXT4_MIN_BLOCK_SIZE % blocksize; + set_blocksize(bdev, blocksize); + if (!(bh = __bread(bdev, sb_block, blocksize))) { + printk(KERN_ERR "EXT4-fs: couldn't read superblock of " + "external journal\n"); + goto out_bdev; + } + + es = (struct ext4_super_block *) (((char *)bh->b_data) + offset); + if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || + !(le32_to_cpu(es->s_feature_incompat) & + EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { + printk(KERN_ERR "EXT4-fs: external journal has " + "bad superblock\n"); + brelse(bh); + goto out_bdev; + } + + if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { + printk(KERN_ERR "EXT4-fs: journal UUID does not match\n"); + brelse(bh); + goto out_bdev; + } + + len = ext4_blocks_count(es); + start = sb_block + 1; + brelse(bh); /* we're done with the superblock */ + + journal = jbd2_journal_init_dev(bdev, sb->s_bdev, + start, len, blocksize); + if (!journal) { + printk(KERN_ERR "EXT4-fs: failed to create device journal\n"); + goto out_bdev; + } + journal->j_private = sb; + ll_rw_block(READ, 1, &journal->j_sb_buffer); + wait_on_buffer(journal->j_sb_buffer); + if (!buffer_uptodate(journal->j_sb_buffer)) { + printk(KERN_ERR "EXT4-fs: I/O error on journal device\n"); + goto out_journal; + } + if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { + printk(KERN_ERR "EXT4-fs: External journal has more than one " + "user (unsupported) - %d\n", + be32_to_cpu(journal->j_superblock->s_nr_users)); + goto out_journal; + } + EXT4_SB(sb)->journal_bdev = bdev; + ext4_init_journal_params(sb, journal); + return journal; +out_journal: + jbd2_journal_destroy(journal); +out_bdev: + ext4_blkdev_put(bdev); + return NULL; +} + +static int ext4_load_journal(struct super_block *sb, + struct ext4_super_block *es, + unsigned long journal_devnum) +{ + journal_t *journal; + unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); + dev_t journal_dev; + int err = 0; + int really_read_only; + + if (journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + printk(KERN_INFO "EXT4-fs: external journal device major/minor " + "numbers have changed\n"); + journal_dev = new_decode_dev(journal_devnum); + } else + journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); + + really_read_only = bdev_read_only(sb->s_bdev); + + /* + * Are we loading a blank journal or performing recovery after a + * crash? For recovery, we need to check in advance whether we + * can get read-write access to the device. + */ + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { + if (sb->s_flags & MS_RDONLY) { + printk(KERN_INFO "EXT4-fs: INFO: recovery " + "required on readonly filesystem.\n"); + if (really_read_only) { + printk(KERN_ERR "EXT4-fs: write access " + "unavailable, cannot proceed.\n"); + return -EROFS; + } + printk(KERN_INFO "EXT4-fs: write access will " + "be enabled during recovery.\n"); + } + } + + if (journal_inum && journal_dev) { + printk(KERN_ERR "EXT4-fs: filesystem has both journal " + "and inode journals!\n"); + return -EINVAL; + } + + if (journal_inum) { + if (!(journal = ext4_get_journal(sb, journal_inum))) + return -EINVAL; + } else { + if (!(journal = ext4_get_dev_journal(sb, journal_dev))) + return -EINVAL; + } + + if (journal->j_flags & JBD2_BARRIER) + printk(KERN_INFO "EXT4-fs: barriers enabled\n"); + else + printk(KERN_INFO "EXT4-fs: barriers disabled\n"); + + if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { + err = jbd2_journal_update_format(journal); + if (err) { + printk(KERN_ERR "EXT4-fs: error updating journal.\n"); + jbd2_journal_destroy(journal); + return err; + } + } + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) + err = jbd2_journal_wipe(journal, !really_read_only); + if (!err) + err = jbd2_journal_load(journal); + + if (err) { + printk(KERN_ERR "EXT4-fs: error loading journal.\n"); + jbd2_journal_destroy(journal); + return err; + } + + EXT4_SB(sb)->s_journal = journal; + ext4_clear_journal_err(sb, es); + + if (journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + es->s_journal_dev = cpu_to_le32(journal_devnum); + sb->s_dirt = 1; + + /* Make sure we flush the recovery flag to disk. */ + ext4_commit_super(sb, es, 1); + } + + return 0; +} + +static int ext4_create_journal(struct super_block *sb, + struct ext4_super_block *es, + unsigned int journal_inum) +{ + journal_t *journal; + int err; + + if (sb->s_flags & MS_RDONLY) { + printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to " + "create journal.\n"); + return -EROFS; + } + + journal = ext4_get_journal(sb, journal_inum); + if (!journal) + return -EINVAL; + + printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n", + journal_inum); + + err = jbd2_journal_create(journal); + if (err) { + printk(KERN_ERR "EXT4-fs: error creating journal.\n"); + jbd2_journal_destroy(journal); + return -EIO; + } + + EXT4_SB(sb)->s_journal = journal; + + ext4_update_dynamic_rev(sb); + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL); + + es->s_journal_inum = cpu_to_le32(journal_inum); + sb->s_dirt = 1; + + /* Make sure we flush the recovery flag to disk. */ + ext4_commit_super(sb, es, 1); + + return 0; +} + +static void ext4_commit_super(struct super_block *sb, + struct ext4_super_block *es, int sync) +{ + struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; + + if (!sbh) + return; + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + printk(KERN_ERR "ext4: previous I/O error to " + "superblock detected for %s.\n", sb->s_id); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + es->s_wtime = cpu_to_le32(get_seconds()); + ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb)); + es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); + BUFFER_TRACE(sbh, "marking dirty"); + mark_buffer_dirty(sbh); + if (sync) { + sync_dirty_buffer(sbh); + if (buffer_write_io_error(sbh)) { + printk(KERN_ERR "ext4: I/O error while writing " + "superblock for %s.\n", sb->s_id); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + } +} + + +/* + * Have we just finished recovery? If so, and if we are mounting (or + * remounting) the filesystem readonly, then we will end up with a + * consistent fs on disk. Record that fact. + */ +static void ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es) +{ + journal_t *journal = EXT4_SB(sb)->s_journal; + + jbd2_journal_lock_updates(journal); + if (jbd2_journal_flush(journal) < 0) + goto out; + + lock_super(sb); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && + sb->s_flags & MS_RDONLY) { + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + sb->s_dirt = 0; + ext4_commit_super(sb, es, 1); + } + unlock_super(sb); + +out: + jbd2_journal_unlock_updates(journal); +} + +/* + * If we are mounting (or read-write remounting) a filesystem whose journal + * has recorded an error from a previous lifetime, move that error to the + * main filesystem now. + */ +static void ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es) +{ + journal_t *journal; + int j_errno; + const char *errstr; + + journal = EXT4_SB(sb)->s_journal; + + /* + * Now check for any error status which may have been recorded in the + * journal by a prior ext4_error() or ext4_abort() + */ + + j_errno = jbd2_journal_errno(journal); + if (j_errno) { + char nbuf[16]; + + errstr = ext4_decode_error(sb, j_errno, nbuf); + ext4_warning(sb, __func__, "Filesystem error recorded " + "from previous mount: %s", errstr); + ext4_warning(sb, __func__, "Marking fs in need of " + "filesystem check."); + + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + ext4_commit_super(sb, es, 1); + + jbd2_journal_clear_err(journal); + } +} + +/* + * Force the running and committing transactions to commit, + * and wait on the commit. + */ +int ext4_force_commit(struct super_block *sb) +{ + journal_t *journal; + int ret; + + if (sb->s_flags & MS_RDONLY) + return 0; + + journal = EXT4_SB(sb)->s_journal; + sb->s_dirt = 0; + ret = ext4_journal_force_commit(journal); + return ret; +} + +/* + * Ext4 always journals updates to the superblock itself, so we don't + * have to propagate any other updates to the superblock on disk at this + * point. (We can probably nuke this function altogether, and remove + * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...) + */ +static void ext4_write_super(struct super_block *sb) +{ + if (mutex_trylock(&sb->s_lock) != 0) + BUG(); + sb->s_dirt = 0; +} + +static int ext4_sync_fs(struct super_block *sb, int wait) +{ + tid_t target; + + trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); + sb->s_dirt = 0; + if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { + if (wait) + jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); + } + return 0; +} + +/* + * LVM calls this function before a (read-only) snapshot is created. This + * gives us a chance to flush the journal completely and mark the fs clean. + */ +static void ext4_write_super_lockfs(struct super_block *sb) +{ + sb->s_dirt = 0; + + if (!(sb->s_flags & MS_RDONLY)) { + journal_t *journal = EXT4_SB(sb)->s_journal; + + /* Now we set up the journal barrier. */ + jbd2_journal_lock_updates(journal); + + /* + * We don't want to clear needs_recovery flag when we failed + * to flush the journal. + */ + if (jbd2_journal_flush(journal) < 0) + return; + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + } +} + +/* + * Called by LVM after the snapshot is done. We need to reset the RECOVER + * flag here, even though the filesystem is not technically dirty yet. + */ +static void ext4_unlockfs(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) { + lock_super(sb); + /* Reser the needs_recovery flag before the fs is unlocked. */ + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + unlock_super(sb); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } +} + +static int ext4_remount(struct super_block *sb, int *flags, char *data) +{ + struct ext4_super_block *es; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t n_blocks_count = 0; + unsigned long old_sb_flags; + struct ext4_mount_options old_opts; + ext4_group_t g; + int err; +#ifdef CONFIG_QUOTA + int i; +#endif + + /* Store the original options */ + old_sb_flags = sb->s_flags; + old_opts.s_mount_opt = sbi->s_mount_opt; + old_opts.s_resuid = sbi->s_resuid; + old_opts.s_resgid = sbi->s_resgid; + old_opts.s_commit_interval = sbi->s_commit_interval; +#ifdef CONFIG_QUOTA + old_opts.s_jquota_fmt = sbi->s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) + old_opts.s_qf_names[i] = sbi->s_qf_names[i]; +#endif + + /* + * Allow the "check" option to be passed as a remount option. + */ + if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { + err = -EINVAL; + goto restore_opts; + } + + if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) + ext4_abort(sb, __func__, "Abort forced by user"); + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + + es = sbi->s_es; + + ext4_init_journal_params(sb, sbi->s_journal); + + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || + n_blocks_count > ext4_blocks_count(es)) { + if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) { + err = -EROFS; + goto restore_opts; + } + + if (*flags & MS_RDONLY) { + /* + * First of all, the unconditional stuff we have to do + * to disable replay of the journal when we next remount + */ + sb->s_flags |= MS_RDONLY; + + /* + * OK, test if we are remounting a valid rw partition + * readonly, and if so set the rdonly flag and then + * mark the partition as valid again. + */ + if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) && + (sbi->s_mount_state & EXT4_VALID_FS)) + es->s_state = cpu_to_le16(sbi->s_mount_state); + + /* + * We have to unlock super so that we can wait for + * transactions. + */ + unlock_super(sb); + ext4_mark_recovery_complete(sb, es); + lock_super(sb); + } else { + __le32 ret; + if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, + ~EXT4_FEATURE_RO_COMPAT_SUPP))) { + printk(KERN_WARNING "EXT4-fs: %s: couldn't " + "remount RDWR because of unsupported " + "optional features (%x).\n", + sb->s_id, le32_to_cpu(ret)); + err = -EROFS; + goto restore_opts; + } + + /* + * Make sure the group descriptor checksums + * are sane. If they aren't, refuse to + * remount r/w. + */ + for (g = 0; g < sbi->s_groups_count; g++) { + struct ext4_group_desc *gdp = + ext4_get_group_desc(sb, g, NULL); + + if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { + printk(KERN_ERR + "EXT4-fs: ext4_remount: " + "Checksum for group %lu failed (%u!=%u)\n", + g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), + le16_to_cpu(gdp->bg_checksum)); + err = -EINVAL; + goto restore_opts; + } + } + + /* + * If we have an unprocessed orphan list hanging + * around from a previously readonly bdev mount, + * require a full umount/remount for now. + */ + if (es->s_last_orphan) { + printk(KERN_WARNING "EXT4-fs: %s: couldn't " + "remount RDWR because of unprocessed " + "orphan inode list. Please " + "umount/remount instead.\n", + sb->s_id); + err = -EINVAL; + goto restore_opts; + } + + /* + * Mounting a RDONLY partition read-write, so reread + * and store the current valid flag. (It may have + * been changed by e2fsck since we originally mounted + * the partition.) + */ + ext4_clear_journal_err(sb, es); + sbi->s_mount_state = le16_to_cpu(es->s_state); + if ((err = ext4_group_extend(sb, es, n_blocks_count))) + goto restore_opts; + if (!ext4_setup_super(sb, es, 0)) + sb->s_flags &= ~MS_RDONLY; + } + } +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < MAXQUOTAS; i++) + if (old_opts.s_qf_names[i] && + old_opts.s_qf_names[i] != sbi->s_qf_names[i]) + kfree(old_opts.s_qf_names[i]); +#endif + return 0; +restore_opts: + sb->s_flags = old_sb_flags; + sbi->s_mount_opt = old_opts.s_mount_opt; + sbi->s_resuid = old_opts.s_resuid; + sbi->s_resgid = old_opts.s_resgid; + sbi->s_commit_interval = old_opts.s_commit_interval; +#ifdef CONFIG_QUOTA + sbi->s_jquota_fmt = old_opts.s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i] && + old_opts.s_qf_names[i] != sbi->s_qf_names[i]) + kfree(sbi->s_qf_names[i]); + sbi->s_qf_names[i] = old_opts.s_qf_names[i]; + } +#endif + return err; +} + +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + u64 fsid; + + if (test_opt(sb, MINIX_DF)) { + sbi->s_overhead_last = 0; + } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { + ext4_group_t ngroups = sbi->s_groups_count, i; + ext4_fsblk_t overhead = 0; + smp_rmb(); + + /* + * Compute the overhead (FS structures). This is constant + * for a given filesystem unless the number of block groups + * changes so we cache the previous value until it does. + */ + + /* + * All of the blocks before first_data_block are + * overhead + */ + overhead = le32_to_cpu(es->s_first_data_block); + + /* + * Add the overhead attributed to the superblock and + * block group descriptors. If the sparse superblocks + * feature is turned on, then not all groups have this. + */ + for (i = 0; i < ngroups; i++) { + overhead += ext4_bg_has_super(sb, i) + + ext4_bg_num_gdb(sb, i); + cond_resched(); + } + + /* + * Every block group has an inode bitmap, a block + * bitmap, and an inode table. + */ + overhead += ngroups * (2 + sbi->s_itb_per_group); + sbi->s_overhead_last = overhead; + smp_wmb(); + sbi->s_blocks_last = ext4_blocks_count(es); + } + + buf->f_type = EXT4_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; + buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - + percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); + ext4_free_blocks_count_set(es, buf->f_bfree); + buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); + if (buf->f_bfree < ext4_r_blocks_count(es)) + buf->f_bavail = 0; + buf->f_files = le32_to_cpu(es->s_inodes_count); + buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); + es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); + buf->f_namelen = EXT4_NAME_LEN; + fsid = le64_to_cpup((void *)es->s_uuid) ^ + le64_to_cpup((void *)es->s_uuid + sizeof(u64)); + buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + return 0; +} + +/* Helper function for writing quotas on sync - we need to start transaction before quota file + * is locked for write. Otherwise the are possible deadlocks: + * Process 1 Process 2 + * ext4_create() quota_sync() + * jbd2_journal_start() write_dquot() + * DQUOT_INIT() down(dqio_mutex) + * down(dqio_mutex) jbd2_journal_start() + * + */ + +#ifdef CONFIG_QUOTA + +static inline struct inode *dquot_to_inode(struct dquot *dquot) +{ + return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; +} + +static int ext4_dquot_initialize(struct inode *inode, int type) +{ + handle_t *handle; + int ret, err; + + /* We may create quota structure so we need to reserve enough blocks */ + handle = ext4_journal_start(inode, 2*EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_initialize(inode, type); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext4_dquot_drop(struct inode *inode) +{ + handle_t *handle; + int ret, err; + + /* We may delete quota structure so we need to reserve enough blocks */ + handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + /* + * We call dquot_drop() anyway to at least release references + * to quota structures so that umount does not hang. + */ + dquot_drop(inode); + return PTR_ERR(handle); + } + ret = dquot_drop(inode); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext4_write_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + struct inode *inode; + + inode = dquot_to_inode(dquot); + handle = ext4_journal_start(inode, + EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit(dquot); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext4_acquire_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext4_journal_start(dquot_to_inode(dquot), + EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_acquire(dquot); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext4_release_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext4_journal_start(dquot_to_inode(dquot), + EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) { + /* Release dquot anyway to avoid endless cycle in dqput() */ + dquot_release(dquot); + return PTR_ERR(handle); + } + ret = dquot_release(dquot); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext4_mark_dquot_dirty(struct dquot *dquot) +{ + /* Are we journaling quotas? */ + if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || + EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { + dquot_mark_dquot_dirty(dquot); + return ext4_write_dquot(dquot); + } else { + return dquot_mark_dquot_dirty(dquot); + } +} + +static int ext4_write_info(struct super_block *sb, int type) +{ + int ret, err; + handle_t *handle; + + /* Data block + inode block */ + handle = ext4_journal_start(sb->s_root->d_inode, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit_info(sb, type); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +/* + * Turn on quotas during mount time - we need to find + * the quota file and such... + */ +static int ext4_quota_on_mount(struct super_block *sb, int type) +{ + return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], + EXT4_SB(sb)->s_jquota_fmt, type); +} + +/* + * Standard function to be called on quota_on + */ +static int ext4_quota_on(struct super_block *sb, int type, int format_id, + char *name, int remount) +{ + int err; + struct path path; + + if (!test_opt(sb, QUOTA)) + return -EINVAL; + /* When remounting, no checks are needed and in fact, name is NULL */ + if (remount) + return vfs_quota_on(sb, type, format_id, name, remount); + + err = kern_path(name, LOOKUP_FOLLOW, &path); + if (err) + return err; + + /* Quotafile not on the same filesystem? */ + if (path.mnt->mnt_sb != sb) { + path_put(&path); + return -EXDEV; + } + /* Journaling quota? */ + if (EXT4_SB(sb)->s_qf_names[type]) { + /* Quotafile not in fs root? */ + if (path.dentry->d_parent != sb->s_root) + printk(KERN_WARNING + "EXT4-fs: Quota file not on filesystem root. " + "Journaled quota will not work.\n"); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (ext4_should_journal_data(path.dentry->d_inode)) { + /* + * We don't need to lock updates but journal_flush() could + * otherwise be livelocked... + */ + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (err) { + path_put(&path); + return err; + } + } + + err = vfs_quota_on_path(sb, type, format_id, &path); + path_put(&path); + return err; +} + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and noone else should touch the files) + * we don't have to be afraid of races */ +static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + struct buffer_head *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = sb->s_blocksize - offset < toread ? + sb->s_blocksize - offset : toread; + bh = ext4_bread(NULL, inode, blk, 0, &err); + if (err) + return err; + if (!bh) /* A hole? */ + memset(data, 0, tocopy); + else + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile (we know the transaction is already started and has + * enough credits) */ +static ssize_t ext4_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL; + size_t towrite = len; + struct buffer_head *bh; + handle_t *handle = journal_current_handle(); + + if (!handle) { + printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)" + " cancelled because transaction is not started.\n", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + while (towrite > 0) { + tocopy = sb->s_blocksize - offset < towrite ? + sb->s_blocksize - offset : towrite; + bh = ext4_bread(handle, inode, blk, 1, &err); + if (!bh) + goto out; + if (journal_quota) { + err = ext4_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); + goto out; + } + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, tocopy); + flush_dcache_page(bh->b_page); + unlock_buffer(bh); + if (journal_quota) + err = ext4_journal_dirty_metadata(handle, bh); + else { + /* Always do at least ordered writes for quotas */ + err = ext4_jbd2_file_inode(handle, inode); + mark_buffer_dirty(bh); + } + brelse(bh); + if (err) + goto out; + offset = 0; + towrite -= tocopy; + data += tocopy; + blk++; + } +out: + if (len == towrite) { + mutex_unlock(&inode->i_mutex); + return err; + } + if (inode->i_size < off+len-towrite) { + i_size_write(inode, off+len-towrite); + EXT4_I(inode)->i_disksize = inode->i_size; + } + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + ext4_mark_inode_dirty(handle, inode); + mutex_unlock(&inode->i_mutex); + return len - towrite; +} + +#endif + +static int ext4_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) +{ + return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); +} + +#ifdef CONFIG_PROC_FS +static int ext4_ui_proc_show(struct seq_file *m, void *v) +{ + unsigned int *p = m->private; + + seq_printf(m, "%u\n", *p); + return 0; +} + +static int ext4_ui_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ext4_ui_proc_show, PDE(inode)->data); +} + +static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf, + size_t cnt, loff_t *ppos) +{ + unsigned int *p = PDE(file->f_path.dentry->d_inode)->data; + char str[32]; + unsigned long value; + + if (cnt >= sizeof(str)) + return -EINVAL; + if (copy_from_user(str, buf, cnt)) + return -EFAULT; + value = simple_strtol(str, NULL, 0); + if (value < 0) + return -ERANGE; + *p = value; + return cnt; +} + +const struct file_operations ext4_ui_proc_fops = { + .owner = THIS_MODULE, + .open = ext4_ui_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = ext4_ui_proc_write, +}; +#endif + +static struct file_system_type ext4_fs_type = { + .owner = THIS_MODULE, + .name = "ext4", + .get_sb = ext4_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +#ifdef CONFIG_EXT4DEV_COMPAT +static int ext4dev_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) +{ + printk(KERN_WARNING "EXT4-fs: Update your userspace programs " + "to mount using ext4\n"); + printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility " + "will go away by 2.6.31\n"); + return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); +} + +static struct file_system_type ext4dev_fs_type = { + .owner = THIS_MODULE, + .name = "ext4dev", + .get_sb = ext4dev_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS("ext4dev"); +#endif + +static int __init init_ext4_fs(void) +{ + int err; + + ext4_proc_root = proc_mkdir("fs/ext4", NULL); + err = init_ext4_mballoc(); + if (err) + return err; + + err = init_ext4_xattr(); + if (err) + goto out2; + err = init_inodecache(); + if (err) + goto out1; + err = register_filesystem(&ext4_fs_type); + if (err) + goto out; +#ifdef CONFIG_EXT4DEV_COMPAT + err = register_filesystem(&ext4dev_fs_type); + if (err) { + unregister_filesystem(&ext4_fs_type); + goto out; + } +#endif + return 0; +out: + destroy_inodecache(); +out1: + exit_ext4_xattr(); +out2: + exit_ext4_mballoc(); + return err; +} + +static void __exit exit_ext4_fs(void) +{ + unregister_filesystem(&ext4_fs_type); +#ifdef CONFIG_EXT4DEV_COMPAT + unregister_filesystem(&ext4dev_fs_type); +#endif + destroy_inodecache(); + exit_ext4_xattr(); + exit_ext4_mballoc(); + remove_proc_entry("fs/ext4", NULL); +} + +MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); +MODULE_DESCRIPTION("Fourth Extended Filesystem with extents"); +MODULE_LICENSE("GPL"); +module_init(init_ext4_fs) +module_exit(exit_ext4_fs) diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c new file mode 100644 index 0000000..00740cb --- /dev/null +++ b/fs/ext4/symlink.c @@ -0,0 +1,54 @@ +/* + * linux/fs/ext4/symlink.c + * + * Only fast symlinks left here - the rest is done by generic code. AV, 1999 + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/symlink.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4 symlink handling code + */ + +#include <linux/fs.h> +#include <linux/jbd2.h> +#include <linux/namei.h> +#include "ext4.h" +#include "xattr.h" + +static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ext4_inode_info *ei = EXT4_I(dentry->d_inode); + nd_set_link(nd, (char *) ei->i_data); + return NULL; +} + +const struct inode_operations ext4_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, +#ifdef CONFIG_EXT4_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +#endif +}; + +const struct inode_operations ext4_fast_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ext4_follow_link, +#ifdef CONFIG_EXT4_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +#endif +}; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c new file mode 100644 index 0000000..80626d5 --- /dev/null +++ b/fs/ext4/xattr.c @@ -0,0 +1,1591 @@ +/* + * linux/fs/ext4/xattr.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> + * + * Fix by Harrison Xing <harrison@mountainviewdata.com>. + * Ext4 code with a lot of help from Eric Jarman <ejarman@acm.org>. + * Extended attributes for symlinks and special files added per + * suggestion of Luka Renko <luka.renko@hermes.si>. + * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>, + * Red Hat Inc. + * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz + * and Andreas Gruenbacher <agruen@suse.de>. + */ + +/* + * Extended attributes are stored directly in inodes (on file systems with + * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl + * field contains the block number if an inode uses an additional block. All + * attributes must fit in the inode and one additional block. Blocks that + * contain the identical set of attributes may be shared among several inodes. + * Identical blocks are detected by keeping a cache of blocks that have + * recently been accessed. + * + * The attributes in inodes and on blocks have a different header; the entries + * are stored in the same format: + * + * +------------------+ + * | header | + * | entry 1 | | + * | entry 2 | | growing downwards + * | entry 3 | v + * | four null bytes | + * | . . . | + * | value 1 | ^ + * | value 3 | | growing upwards + * | value 2 | | + * +------------------+ + * + * The header is followed by multiple entry descriptors. In disk blocks, the + * entry descriptors are kept sorted. In inodes, they are unsorted. The + * attribute values are aligned to the end of the block in no specific order. + * + * Locking strategy + * ---------------- + * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem. + * EA blocks are only changed if they are exclusive to an inode, so + * holding xattr_sem also means that nothing but the EA block's reference + * count can change. Multiple writers to the same block are synchronized + * by the buffer lock. + */ + +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/mbcache.h> +#include <linux/quotaops.h> +#include <linux/rwsem.h> +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" +#include "acl.h" + +#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) +#define BFIRST(bh) ENTRY(BHDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#ifdef EXT4_XATTR_DEBUG +# define ea_idebug(inode, f...) do { \ + printk(KERN_DEBUG "inode %s:%lu: ", \ + inode->i_sb->s_id, inode->i_ino); \ + printk(f); \ + printk("\n"); \ + } while (0) +# define ea_bdebug(bh, f...) do { \ + char b[BDEVNAME_SIZE]; \ + printk(KERN_DEBUG "block %s:%lu: ", \ + bdevname(bh->b_bdev, b), \ + (unsigned long) bh->b_blocknr); \ + printk(f); \ + printk("\n"); \ + } while (0) +#else +# define ea_idebug(f...) +# define ea_bdebug(f...) +#endif + +static void ext4_xattr_cache_insert(struct buffer_head *); +static struct buffer_head *ext4_xattr_cache_find(struct inode *, + struct ext4_xattr_header *, + struct mb_cache_entry **); +static void ext4_xattr_rehash(struct ext4_xattr_header *, + struct ext4_xattr_entry *); +static int ext4_xattr_list(struct inode *inode, char *buffer, + size_t buffer_size); + +static struct mb_cache *ext4_xattr_cache; + +static struct xattr_handler *ext4_xattr_handler_map[] = { + [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, +#ifdef CONFIG_EXT4_FS_POSIX_ACL + [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, + [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler, +#endif + [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, +#ifdef CONFIG_EXT4_FS_SECURITY + [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, +#endif +}; + +struct xattr_handler *ext4_xattr_handlers[] = { + &ext4_xattr_user_handler, + &ext4_xattr_trusted_handler, +#ifdef CONFIG_EXT4_FS_POSIX_ACL + &ext4_xattr_acl_access_handler, + &ext4_xattr_acl_default_handler, +#endif +#ifdef CONFIG_EXT4_FS_SECURITY + &ext4_xattr_security_handler, +#endif + NULL +}; + +static inline struct xattr_handler * +ext4_xattr_handler(int name_index) +{ + struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) + handler = ext4_xattr_handler_map[name_index]; + return handler; +} + +/* + * Inode operation listxattr() + * + * dentry->d_inode->i_mutex: don't care + */ +ssize_t +ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + return ext4_xattr_list(dentry->d_inode, buffer, size); +} + +static int +ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end) +{ + while (!IS_LAST_ENTRY(entry)) { + struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry); + if ((void *)next >= end) + return -EIO; + entry = next; + } + return 0; +} + +static inline int +ext4_xattr_check_block(struct buffer_head *bh) +{ + int error; + + if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) + return -EIO; + error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); + return error; +} + +static inline int +ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size) +{ + size_t value_size = le32_to_cpu(entry->e_value_size); + + if (entry->e_value_block != 0 || value_size > size || + le16_to_cpu(entry->e_value_offs) + value_size > size) + return -EIO; + return 0; +} + +static int +ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, + const char *name, size_t size, int sorted) +{ + struct ext4_xattr_entry *entry; + size_t name_len; + int cmp = 1; + + if (name == NULL) + return -EINVAL; + name_len = strlen(name); + entry = *pentry; + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + cmp = name_index - entry->e_name_index; + if (!cmp) + cmp = name_len - entry->e_name_len; + if (!cmp) + cmp = memcmp(name, entry->e_name, name_len); + if (cmp <= 0 && (sorted || cmp == 0)) + break; + } + *pentry = entry; + if (!cmp && ext4_xattr_check_entry(entry, size)) + return -EIO; + return cmp ? -ENODATA : 0; +} + +static int +ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct buffer_head *bh = NULL; + struct ext4_xattr_entry *entry; + size_t size; + int error; + + ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", + name_index, name, buffer, (long)buffer_size); + + error = -ENODATA; + if (!EXT4_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + if (ext4_xattr_check_block(bh)) { +bad_block: ext4_error(inode->i_sb, __func__, + "inode %lu: bad block %llu", inode->i_ino, + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + ext4_xattr_cache_insert(bh); + entry = BFIRST(bh); + error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); + if (error == -EIO) + goto bad_block; + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), + size); + } + error = size; + +cleanup: + brelse(bh); + return error; +} + +static int +ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry; + struct ext4_inode *raw_inode; + struct ext4_iloc iloc; + size_t size; + void *end; + int error; + + if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) + return -ENODATA; + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + error = ext4_xattr_check_names(entry, end); + if (error) + goto cleanup; + error = ext4_xattr_find_entry(&entry, name_index, name, + end - (void *)entry, 0); + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + memcpy(buffer, (void *)IFIRST(header) + + le16_to_cpu(entry->e_value_offs), size); + } + error = size; + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext4_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +int +ext4_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + int error; + + down_read(&EXT4_I(inode)->xattr_sem); + error = ext4_xattr_ibody_get(inode, name_index, name, buffer, + buffer_size); + if (error == -ENODATA) + error = ext4_xattr_block_get(inode, name_index, name, buffer, + buffer_size); + up_read(&EXT4_I(inode)->xattr_sem); + return error; +} + +static int +ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry, + char *buffer, size_t buffer_size) +{ + size_t rest = buffer_size; + + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + struct xattr_handler *handler = + ext4_xattr_handler(entry->e_name_index); + + if (handler) { + size_t size = handler->list(inode, buffer, rest, + entry->e_name, + entry->e_name_len); + if (buffer) { + if (size > rest) + return -ERANGE; + buffer += size; + } + rest -= size; + } + } + return buffer_size - rest; +} + +static int +ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) +{ + struct buffer_head *bh = NULL; + int error; + + ea_idebug(inode, "buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); + + error = 0; + if (!EXT4_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + if (ext4_xattr_check_block(bh)) { + ext4_error(inode->i_sb, __func__, + "inode %lu: bad block %llu", inode->i_ino, + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + ext4_xattr_cache_insert(bh); + error = ext4_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size); + +cleanup: + brelse(bh); + + return error; +} + +static int +ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + struct ext4_iloc iloc; + void *end; + int error; + + if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) + return 0; + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + error = ext4_xattr_check_names(IFIRST(header), end); + if (error) + goto cleanup; + error = ext4_xattr_list_entries(inode, IFIRST(header), + buffer, buffer_size); + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext4_xattr_list() + * + * Copy a list of attribute names into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +static int +ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) +{ + int i_error, b_error; + + down_read(&EXT4_I(inode)->xattr_sem); + i_error = ext4_xattr_ibody_list(inode, buffer, buffer_size); + if (i_error < 0) { + b_error = 0; + } else { + if (buffer) { + buffer += i_error; + buffer_size -= i_error; + } + b_error = ext4_xattr_block_list(inode, buffer, buffer_size); + if (b_error < 0) + i_error = 0; + } + up_read(&EXT4_I(inode)->xattr_sem); + return i_error + b_error; +} + +/* + * If the EXT4_FEATURE_COMPAT_EXT_ATTR feature of this file system is + * not set, set it. + */ +static void ext4_xattr_update_super_block(handle_t *handle, + struct super_block *sb) +{ + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR)) + return; + + if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { + EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); + sb->s_dirt = 1; + ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); + } +} + +/* + * Release the xattr block BH: If the reference count is > 1, decrement + * it; otherwise free the block. + */ +static void +ext4_xattr_release_block(handle_t *handle, struct inode *inode, + struct buffer_head *bh) +{ + struct mb_cache_entry *ce = NULL; + int error = 0; + + ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr); + error = ext4_journal_get_write_access(handle, bh); + if (error) + goto out; + + lock_buffer(bh); + if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); + ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1); + get_bh(bh); + ext4_forget(handle, 1, inode, bh, bh->b_blocknr); + } else { + le32_add_cpu(&BHDR(bh)->h_refcount, -1); + error = ext4_journal_dirty_metadata(handle, bh); + if (IS_SYNC(inode)) + handle->h_sync = 1; + DQUOT_FREE_BLOCK(inode, 1); + ea_bdebug(bh, "refcount now=%d; releasing", + le32_to_cpu(BHDR(bh)->h_refcount)); + if (ce) + mb_cache_entry_release(ce); + } + unlock_buffer(bh); +out: + ext4_std_error(inode->i_sb, error); + return; +} + +/* + * Find the available free space for EAs. This also returns the total number of + * bytes used by EA entries. + */ +static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, + size_t *min_offs, void *base, int *total) +{ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + *total += EXT4_XATTR_LEN(last->e_name_len); + if (!last->e_value_block && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < *min_offs) + *min_offs = offs; + } + } + return (*min_offs - ((void *)last - base) - sizeof(__u32)); +} + +struct ext4_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ext4_xattr_search { + struct ext4_xattr_entry *first; + void *base; + void *end; + struct ext4_xattr_entry *here; + int not_found; +}; + +static int +ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) +{ + struct ext4_xattr_entry *last; + size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); + + /* Compute min_offs and last. */ + last = s->first; + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + if (!last->e_value_block && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + } + free = min_offs - ((void *)last - s->base) - sizeof(__u32); + if (!s->not_found) { + if (!s->here->e_value_block && s->here->e_value_size) { + size_t size = le32_to_cpu(s->here->e_value_size); + free += EXT4_XATTR_SIZE(size); + } + free += EXT4_XATTR_LEN(name_len); + } + if (i->value) { + if (free < EXT4_XATTR_SIZE(i->value_len) || + free < EXT4_XATTR_LEN(name_len) + + EXT4_XATTR_SIZE(i->value_len)) + return -ENOSPC; + } + + if (i->value && s->not_found) { + /* Insert the new name. */ + size_t size = EXT4_XATTR_LEN(name_len); + size_t rest = (void *)last - (void *)s->here + sizeof(__u32); + memmove((void *)s->here + size, s->here, rest); + memset(s->here, 0, size); + s->here->e_name_index = i->name_index; + s->here->e_name_len = name_len; + memcpy(s->here->e_name, i->name, name_len); + } else { + if (!s->here->e_value_block && s->here->e_value_size) { + void *first_val = s->base + min_offs; + size_t offs = le16_to_cpu(s->here->e_value_offs); + void *val = s->base + offs; + size_t size = EXT4_XATTR_SIZE( + le32_to_cpu(s->here->e_value_size)); + + if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) { + /* The old and the new value have the same + size. Just replace. */ + s->here->e_value_size = + cpu_to_le32(i->value_len); + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); /* Clear pad bytes. */ + memcpy(val, i->value, i->value_len); + return 0; + } + + /* Remove the old value. */ + memmove(first_val + size, first_val, val - first_val); + memset(first_val, 0, size); + s->here->e_value_size = 0; + s->here->e_value_offs = 0; + min_offs += size; + + /* Adjust all value offsets. */ + last = s->first; + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); + if (!last->e_value_block && + last->e_value_size && o < offs) + last->e_value_offs = + cpu_to_le16(o + size); + last = EXT4_XATTR_NEXT(last); + } + } + if (!i->value) { + /* Remove the old name. */ + size_t size = EXT4_XATTR_LEN(name_len); + last = ENTRY((void *)last - size); + memmove(s->here, (void *)s->here + size, + (void *)last - (void *)s->here + sizeof(__u32)); + memset(last, 0, size); + } + } + + if (i->value) { + /* Insert the new value. */ + s->here->e_value_size = cpu_to_le32(i->value_len); + if (i->value_len) { + size_t size = EXT4_XATTR_SIZE(i->value_len); + void *val = s->base + min_offs - size; + s->here->e_value_offs = cpu_to_le16(min_offs - size); + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); /* Clear the pad bytes. */ + memcpy(val, i->value, i->value_len); + } + } + return 0; +} + +struct ext4_xattr_block_find { + struct ext4_xattr_search s; + struct buffer_head *bh; +}; + +static int +ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + int error; + + ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", + i->name_index, i->name, i->value, (long)i->value_len); + + if (EXT4_I(inode)->i_file_acl) { + /* The inode already has an extended attribute block. */ + bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl); + error = -EIO; + if (!bs->bh) + goto cleanup; + ea_bdebug(bs->bh, "b_count=%d, refcount=%d", + atomic_read(&(bs->bh->b_count)), + le32_to_cpu(BHDR(bs->bh)->h_refcount)); + if (ext4_xattr_check_block(bs->bh)) { + ext4_error(sb, __func__, + "inode %lu: bad block %llu", inode->i_ino, + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + /* Find the named attribute. */ + bs->s.base = BHDR(bs->bh); + bs->s.first = BFIRST(bs->bh); + bs->s.end = bs->bh->b_data + bs->bh->b_size; + bs->s.here = bs->s.first; + error = ext4_xattr_find_entry(&bs->s.here, i->name_index, + i->name, bs->bh->b_size, 1); + if (error && error != -ENODATA) + goto cleanup; + bs->s.not_found = error; + } + error = 0; + +cleanup: + return error; +} + +static int +ext4_xattr_block_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh = NULL; + struct ext4_xattr_search *s = &bs->s; + struct mb_cache_entry *ce = NULL; + int error = 0; + +#define header(x) ((struct ext4_xattr_header *)(x)) + + if (i->value && i->value_len > sb->s_blocksize) + return -ENOSPC; + if (s->base) { + ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, + bs->bh->b_blocknr); + error = ext4_journal_get_write_access(handle, bs->bh); + if (error) + goto cleanup; + lock_buffer(bs->bh); + + if (header(s->base)->h_refcount == cpu_to_le32(1)) { + if (ce) { + mb_cache_entry_free(ce); + ce = NULL; + } + ea_bdebug(bs->bh, "modifying in-place"); + error = ext4_xattr_set_entry(i, s); + if (!error) { + if (!IS_LAST_ENTRY(s->first)) + ext4_xattr_rehash(header(s->base), + s->here); + ext4_xattr_cache_insert(bs->bh); + } + unlock_buffer(bs->bh); + if (error == -EIO) + goto bad_block; + if (!error) + error = ext4_journal_dirty_metadata(handle, + bs->bh); + if (error) + goto cleanup; + goto inserted; + } else { + int offset = (char *)s->here - bs->bh->b_data; + + unlock_buffer(bs->bh); + jbd2_journal_release_buffer(handle, bs->bh); + if (ce) { + mb_cache_entry_release(ce); + ce = NULL; + } + ea_bdebug(bs->bh, "cloning"); + s->base = kmalloc(bs->bh->b_size, GFP_NOFS); + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + memcpy(s->base, BHDR(bs->bh), bs->bh->b_size); + s->first = ENTRY(header(s->base)+1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->here = ENTRY(s->base + offset); + s->end = s->base + bs->bh->b_size; + } + } else { + /* Allocate a buffer where we construct the new block. */ + s->base = kzalloc(sb->s_blocksize, GFP_NOFS); + /* assert(header == s->base) */ + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); + header(s->base)->h_blocks = cpu_to_le32(1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->first = ENTRY(header(s->base)+1); + s->here = ENTRY(header(s->base)+1); + s->end = s->base + sb->s_blocksize; + } + + error = ext4_xattr_set_entry(i, s); + if (error == -EIO) + goto bad_block; + if (error) + goto cleanup; + if (!IS_LAST_ENTRY(s->first)) + ext4_xattr_rehash(header(s->base), s->here); + +inserted: + if (!IS_LAST_ENTRY(s->first)) { + new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce); + if (new_bh) { + /* We found an identical block in the cache. */ + if (new_bh == bs->bh) + ea_bdebug(new_bh, "keeping"); + else { + /* The old block is released after updating + the inode. */ + error = -EDQUOT; + if (DQUOT_ALLOC_BLOCK(inode, 1)) + goto cleanup; + error = ext4_journal_get_write_access(handle, + new_bh); + if (error) + goto cleanup_dquot; + lock_buffer(new_bh); + le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); + ea_bdebug(new_bh, "reusing; refcount now=%d", + le32_to_cpu(BHDR(new_bh)->h_refcount)); + unlock_buffer(new_bh); + error = ext4_journal_dirty_metadata(handle, + new_bh); + if (error) + goto cleanup_dquot; + } + mb_cache_entry_release(ce); + ce = NULL; + } else if (bs->bh && s->base == bs->bh->b_data) { + /* We were modifying this block in-place. */ + ea_bdebug(bs->bh, "keeping this block"); + new_bh = bs->bh; + get_bh(new_bh); + } else { + /* We need to allocate a new block */ + ext4_fsblk_t goal = ext4_group_first_block_no(sb, + EXT4_I(inode)->i_block_group); + ext4_fsblk_t block = ext4_new_meta_block(handle, inode, + goal, &error); + if (error) + goto cleanup; + ea_idebug(inode, "creating block %d", block); + + new_bh = sb_getblk(sb, block); + if (!new_bh) { +getblk_failed: + ext4_free_blocks(handle, inode, block, 1, 1); + error = -EIO; + goto cleanup; + } + lock_buffer(new_bh); + error = ext4_journal_get_create_access(handle, new_bh); + if (error) { + unlock_buffer(new_bh); + goto getblk_failed; + } + memcpy(new_bh->b_data, s->base, new_bh->b_size); + set_buffer_uptodate(new_bh); + unlock_buffer(new_bh); + ext4_xattr_cache_insert(new_bh); + error = ext4_journal_dirty_metadata(handle, new_bh); + if (error) + goto cleanup; + } + } + + /* Update the inode. */ + EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; + + /* Drop the previous xattr block. */ + if (bs->bh && bs->bh != new_bh) + ext4_xattr_release_block(handle, inode, bs->bh); + error = 0; + +cleanup: + if (ce) + mb_cache_entry_release(ce); + brelse(new_bh); + if (!(bs->bh && s->base == bs->bh->b_data)) + kfree(s->base); + + return error; + +cleanup_dquot: + DQUOT_FREE_BLOCK(inode, 1); + goto cleanup; + +bad_block: + ext4_error(inode->i_sb, __func__, + "inode %lu: bad block %llu", inode->i_ino, + EXT4_I(inode)->i_file_acl); + goto cleanup; + +#undef header +} + +struct ext4_xattr_ibody_find { + struct ext4_xattr_search s; + struct ext4_iloc iloc; +}; + +static int +ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return 0; + raw_inode = ext4_raw_inode(&is->iloc); + header = IHDR(inode, raw_inode); + is->s.base = is->s.first = IFIRST(header); + is->s.here = is->s.first; + is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { + error = ext4_xattr_check_names(IFIRST(header), is->s.end); + if (error) + return error; + /* Find the named attribute. */ + error = ext4_xattr_find_entry(&is->s.here, i->name_index, + i->name, is->s.end - + (void *)is->s.base, 0); + if (error && error != -ENODATA) + return error; + is->s.not_found = error; + } + return 0; +} + +static int +ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_search *s = &is->s; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return -ENOSPC; + error = ext4_xattr_set_entry(i, s); + if (error) + return error; + header = IHDR(inode, ext4_raw_inode(&is->iloc)); + if (!IS_LAST_ENTRY(s->first)) { + header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); + EXT4_I(inode)->i_state |= EXT4_STATE_XATTR; + } else { + header->h_magic = cpu_to_le32(0); + EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR; + } + return 0; +} + +/* + * ext4_xattr_set_handle() + * + * Create, replace or remove an extended attribute for this inode. Buffer + * is NULL to remove an existing extended attribute, and non-NULL to + * either replace an existing extended attribute, or create a new extended + * attribute. The flags XATTR_REPLACE and XATTR_CREATE + * specify that an extended attribute must exist and must not exist + * previous to the call, respectively. + * + * Returns 0, or a negative error number on failure. + */ +int +ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t value_len, + int flags) +{ + struct ext4_xattr_info i = { + .name_index = name_index, + .name = name, + .value = value, + .value_len = value_len, + + }; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_block_find bs = { + .s = { .not_found = -ENODATA, }, + }; + unsigned long no_expand; + int error; + + if (!name) + return -EINVAL; + if (strlen(name) > 255) + return -ERANGE; + down_write(&EXT4_I(inode)->xattr_sem); + no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND; + EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + goto cleanup; + + if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { + struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); + memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); + EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW; + } + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto cleanup; + if (is.s.not_found) + error = ext4_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + if (is.s.not_found && bs.s.not_found) { + error = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + error = 0; + if (!value) + goto cleanup; + } else { + error = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + } + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto cleanup; + if (!value) { + if (!is.s.not_found) + error = ext4_xattr_ibody_set(handle, inode, &i, &is); + else if (!bs.s.not_found) + error = ext4_xattr_block_set(handle, inode, &i, &bs); + } else { + error = ext4_xattr_ibody_set(handle, inode, &i, &is); + if (!error && !bs.s.not_found) { + i.value = NULL; + error = ext4_xattr_block_set(handle, inode, &i, &bs); + } else if (error == -ENOSPC) { + if (EXT4_I(inode)->i_file_acl && !bs.s.base) { + error = ext4_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + } + error = ext4_xattr_block_set(handle, inode, &i, &bs); + if (error) + goto cleanup; + if (!is.s.not_found) { + i.value = NULL; + error = ext4_xattr_ibody_set(handle, inode, &i, + &is); + } + } + } + if (!error) { + ext4_xattr_update_super_block(handle, inode->i_sb); + inode->i_ctime = ext4_current_time(inode); + if (!value) + EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + /* + * The bh is consumed by ext4_mark_iloc_dirty, even with + * error != 0. + */ + is.iloc.bh = NULL; + if (IS_SYNC(inode)) + handle->h_sync = 1; + } + +cleanup: + brelse(is.iloc.bh); + brelse(bs.bh); + if (no_expand == 0) + EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; + up_write(&EXT4_I(inode)->xattr_sem); + return error; +} + +/* + * ext4_xattr_set() + * + * Like ext4_xattr_set_handle, but start from an inode. This extended + * attribute modification is a filesystem transaction by itself. + * + * Returns 0, or a negative error number on failure. + */ +int +ext4_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t value_len, int flags) +{ + handle_t *handle; + int error, retries = 0; + +retry: + handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + } else { + int error2; + + error = ext4_xattr_set_handle(handle, inode, name_index, name, + value, value_len, flags); + error2 = ext4_journal_stop(handle); + if (error == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + if (error == 0) + error = error2; + } + + return error; +} + +/* + * Shift the EA entries in the inode to create space for the increased + * i_extra_isize. + */ +static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, + int value_offs_shift, void *to, + void *from, size_t n, int blocksize) +{ + struct ext4_xattr_entry *last = entry; + int new_offs; + + /* Adjust the value offsets of the entries */ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + if (!last->e_value_block && last->e_value_size) { + new_offs = le16_to_cpu(last->e_value_offs) + + value_offs_shift; + BUG_ON(new_offs + le32_to_cpu(last->e_value_size) + > blocksize); + last->e_value_offs = cpu_to_le16(new_offs); + } + } + /* Shift the entries by n bytes */ + memmove(to, from, n); +} + +/* + * Expand an inode by new_extra_isize bytes when EAs are present. + * Returns 0 on success or negative error number on failure. + */ +int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry, *last, *first; + struct buffer_head *bh = NULL; + struct ext4_xattr_ibody_find *is = NULL; + struct ext4_xattr_block_find *bs = NULL; + char *buffer = NULL, *b_entry_name = NULL; + size_t min_offs, free; + int total_ino, total_blk; + void *base, *start, *end; + int extra_isize = 0, error = 0, tried_min_extra_isize = 0; + int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); + + down_write(&EXT4_I(inode)->xattr_sem); +retry: + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) { + up_write(&EXT4_I(inode)->xattr_sem); + return 0; + } + + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + + /* + * Check if enough free space is available in the inode to shift the + * entries ahead by new_extra_isize. + */ + + base = start = entry; + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + min_offs = end - base; + last = entry; + total_ino = sizeof(struct ext4_xattr_ibody_header); + + free = ext4_xattr_free_space(last, &min_offs, base, &total_ino); + if (free >= new_extra_isize) { + entry = IFIRST(header); + ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize + - new_extra_isize, (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, + (void *)header, total_ino, + inode->i_sb->s_blocksize); + EXT4_I(inode)->i_extra_isize = new_extra_isize; + error = 0; + goto cleanup; + } + + /* + * Enough free space isn't available in the inode, check if + * EA block can hold new_extra_isize bytes. + */ + if (EXT4_I(inode)->i_file_acl) { + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + if (ext4_xattr_check_block(bh)) { + ext4_error(inode->i_sb, __func__, + "inode %lu: bad block %llu", inode->i_ino, + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + base = BHDR(bh); + first = BFIRST(bh); + end = bh->b_data + bh->b_size; + min_offs = end - base; + free = ext4_xattr_free_space(first, &min_offs, base, + &total_blk); + if (free < new_extra_isize) { + if (!tried_min_extra_isize && s_min_extra_isize) { + tried_min_extra_isize++; + new_extra_isize = s_min_extra_isize; + brelse(bh); + goto retry; + } + error = -1; + goto cleanup; + } + } else { + free = inode->i_sb->s_blocksize; + } + + while (new_extra_isize > 0) { + size_t offs, size, entry_size; + struct ext4_xattr_entry *small_entry = NULL; + struct ext4_xattr_info i = { + .value = NULL, + .value_len = 0, + }; + unsigned int total_size; /* EA entry size + value size */ + unsigned int shift_bytes; /* No. of bytes to shift EAs by? */ + unsigned int min_total_size = ~0U; + + is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); + bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); + if (!is || !bs) { + error = -ENOMEM; + goto cleanup; + } + + is->s.not_found = -ENODATA; + bs->s.not_found = -ENODATA; + is->iloc.bh = NULL; + bs->bh = NULL; + + last = IFIRST(header); + /* Find the entry best suited to be pushed into EA block */ + entry = NULL; + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + total_size = + EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + + EXT4_XATTR_LEN(last->e_name_len); + if (total_size <= free && total_size < min_total_size) { + if (total_size < new_extra_isize) { + small_entry = last; + } else { + entry = last; + min_total_size = total_size; + } + } + } + + if (entry == NULL) { + if (small_entry) { + entry = small_entry; + } else { + if (!tried_min_extra_isize && + s_min_extra_isize) { + tried_min_extra_isize++; + new_extra_isize = s_min_extra_isize; + goto retry; + } + error = -1; + goto cleanup; + } + } + offs = le16_to_cpu(entry->e_value_offs); + size = le32_to_cpu(entry->e_value_size); + entry_size = EXT4_XATTR_LEN(entry->e_name_len); + i.name_index = entry->e_name_index, + buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS); + b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); + if (!buffer || !b_entry_name) { + error = -ENOMEM; + goto cleanup; + } + /* Save the entry name and the entry value */ + memcpy(buffer, (void *)IFIRST(header) + offs, + EXT4_XATTR_SIZE(size)); + memcpy(b_entry_name, entry->e_name, entry->e_name_len); + b_entry_name[entry->e_name_len] = '\0'; + i.name = b_entry_name; + + error = ext4_get_inode_loc(inode, &is->iloc); + if (error) + goto cleanup; + + error = ext4_xattr_ibody_find(inode, &i, is); + if (error) + goto cleanup; + + /* Remove the chosen entry from the inode */ + error = ext4_xattr_ibody_set(handle, inode, &i, is); + + entry = IFIRST(header); + if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) + shift_bytes = new_extra_isize; + else + shift_bytes = entry_size + size; + /* Adjust the offsets and shift the remaining entries ahead */ + ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - + shift_bytes, (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes, + (void *)header, total_ino - entry_size, + inode->i_sb->s_blocksize); + + extra_isize += shift_bytes; + new_extra_isize -= shift_bytes; + EXT4_I(inode)->i_extra_isize = extra_isize; + + i.name = b_entry_name; + i.value = buffer; + i.value_len = size; + error = ext4_xattr_block_find(inode, &i, bs); + if (error) + goto cleanup; + + /* Add entry which was removed from the inode into the block */ + error = ext4_xattr_block_set(handle, inode, &i, bs); + if (error) + goto cleanup; + kfree(b_entry_name); + kfree(buffer); + brelse(is->iloc.bh); + kfree(is); + kfree(bs); + } + brelse(bh); + up_write(&EXT4_I(inode)->xattr_sem); + return 0; + +cleanup: + kfree(b_entry_name); + kfree(buffer); + if (is) + brelse(is->iloc.bh); + kfree(is); + kfree(bs); + brelse(bh); + up_write(&EXT4_I(inode)->xattr_sem); + return error; +} + + + +/* + * ext4_xattr_delete_inode() + * + * Free extended attribute resources associated with this inode. This + * is called immediately before an inode is freed. We have exclusive + * access to the inode. + */ +void +ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ + struct buffer_head *bh = NULL; + + if (!EXT4_I(inode)->i_file_acl) + goto cleanup; + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) { + ext4_error(inode->i_sb, __func__, + "inode %lu: block %llu read error", inode->i_ino, + EXT4_I(inode)->i_file_acl); + goto cleanup; + } + if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) { + ext4_error(inode->i_sb, __func__, + "inode %lu: bad block %llu", inode->i_ino, + EXT4_I(inode)->i_file_acl); + goto cleanup; + } + ext4_xattr_release_block(handle, inode, bh); + EXT4_I(inode)->i_file_acl = 0; + +cleanup: + brelse(bh); +} + +/* + * ext4_xattr_put_super() + * + * This is called when a file system is unmounted. + */ +void +ext4_xattr_put_super(struct super_block *sb) +{ + mb_cache_shrink(sb->s_bdev); +} + +/* + * ext4_xattr_cache_insert() + * + * Create a new entry in the extended attribute cache, and insert + * it unless such an entry is already in the cache. + * + * Returns 0, or a negative error number on failure. + */ +static void +ext4_xattr_cache_insert(struct buffer_head *bh) +{ + __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); + struct mb_cache_entry *ce; + int error; + + ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS); + if (!ce) { + ea_bdebug(bh, "out of memory"); + return; + } + error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash); + if (error) { + mb_cache_entry_free(ce); + if (error == -EBUSY) { + ea_bdebug(bh, "already in cache"); + error = 0; + } + } else { + ea_bdebug(bh, "inserting [%x]", (int)hash); + mb_cache_entry_release(ce); + } +} + +/* + * ext4_xattr_cmp() + * + * Compare two extended attribute blocks for equality. + * + * Returns 0 if the blocks are equal, 1 if they differ, and + * a negative error number on errors. + */ +static int +ext4_xattr_cmp(struct ext4_xattr_header *header1, + struct ext4_xattr_header *header2) +{ + struct ext4_xattr_entry *entry1, *entry2; + + entry1 = ENTRY(header1+1); + entry2 = ENTRY(header2+1); + while (!IS_LAST_ENTRY(entry1)) { + if (IS_LAST_ENTRY(entry2)) + return 1; + if (entry1->e_hash != entry2->e_hash || + entry1->e_name_index != entry2->e_name_index || + entry1->e_name_len != entry2->e_name_len || + entry1->e_value_size != entry2->e_value_size || + memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) + return 1; + if (entry1->e_value_block != 0 || entry2->e_value_block != 0) + return -EIO; + if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + (char *)header2 + le16_to_cpu(entry2->e_value_offs), + le32_to_cpu(entry1->e_value_size))) + return 1; + + entry1 = EXT4_XATTR_NEXT(entry1); + entry2 = EXT4_XATTR_NEXT(entry2); + } + if (!IS_LAST_ENTRY(entry2)) + return 1; + return 0; +} + +/* + * ext4_xattr_cache_find() + * + * Find an identical extended attribute block. + * + * Returns a pointer to the block found, or NULL if such a block was + * not found or an error occurred. + */ +static struct buffer_head * +ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, + struct mb_cache_entry **pce) +{ + __u32 hash = le32_to_cpu(header->h_hash); + struct mb_cache_entry *ce; + + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); +again: + ce = mb_cache_entry_find_first(ext4_xattr_cache, 0, + inode->i_sb->s_bdev, hash); + while (ce) { + struct buffer_head *bh; + + if (IS_ERR(ce)) { + if (PTR_ERR(ce) == -EAGAIN) + goto again; + break; + } + bh = sb_bread(inode->i_sb, ce->e_block); + if (!bh) { + ext4_error(inode->i_sb, __func__, + "inode %lu: block %lu read error", + inode->i_ino, (unsigned long) ce->e_block); + } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= + EXT4_XATTR_REFCOUNT_MAX) { + ea_idebug(inode, "block %lu refcount %d>=%d", + (unsigned long) ce->e_block, + le32_to_cpu(BHDR(bh)->h_refcount), + EXT4_XATTR_REFCOUNT_MAX); + } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { + *pce = ce; + return bh; + } + brelse(bh); + ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash); + } + return NULL; +} + +#define NAME_HASH_SHIFT 5 +#define VALUE_HASH_SHIFT 16 + +/* + * ext4_xattr_hash_entry() + * + * Compute the hash of an extended attribute. + */ +static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, + struct ext4_xattr_entry *entry) +{ + __u32 hash = 0; + char *name = entry->e_name; + int n; + + for (n = 0; n < entry->e_name_len; n++) { + hash = (hash << NAME_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ + *name++; + } + + if (entry->e_value_block == 0 && entry->e_value_size != 0) { + __le32 *value = (__le32 *)((char *)header + + le16_to_cpu(entry->e_value_offs)); + for (n = (le32_to_cpu(entry->e_value_size) + + EXT4_XATTR_ROUND) >> EXT4_XATTR_PAD_BITS; n; n--) { + hash = (hash << VALUE_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ + le32_to_cpu(*value++); + } + } + entry->e_hash = cpu_to_le32(hash); +} + +#undef NAME_HASH_SHIFT +#undef VALUE_HASH_SHIFT + +#define BLOCK_HASH_SHIFT 16 + +/* + * ext4_xattr_rehash() + * + * Re-compute the extended attribute hash value after an entry has changed. + */ +static void ext4_xattr_rehash(struct ext4_xattr_header *header, + struct ext4_xattr_entry *entry) +{ + struct ext4_xattr_entry *here; + __u32 hash = 0; + + ext4_xattr_hash_entry(header, entry); + here = ENTRY(header+1); + while (!IS_LAST_ENTRY(here)) { + if (!here->e_hash) { + /* Block is not shared if an entry's hash value == 0 */ + hash = 0; + break; + } + hash = (hash << BLOCK_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ + le32_to_cpu(here->e_hash); + here = EXT4_XATTR_NEXT(here); + } + header->h_hash = cpu_to_le32(hash); +} + +#undef BLOCK_HASH_SHIFT + +int __init +init_ext4_xattr(void) +{ + ext4_xattr_cache = mb_cache_create("ext4_xattr", NULL, + sizeof(struct mb_cache_entry) + + sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6); + if (!ext4_xattr_cache) + return -ENOMEM; + return 0; +} + +void +exit_ext4_xattr(void) +{ + if (ext4_xattr_cache) + mb_cache_destroy(ext4_xattr_cache); + ext4_xattr_cache = NULL; +} diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h new file mode 100644 index 0000000..8ede88b --- /dev/null +++ b/fs/ext4/xattr.h @@ -0,0 +1,155 @@ +/* + File: fs/ext4/xattr.h + + On-disk format of extended attributes for the ext4 filesystem. + + (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> +*/ + +#include <linux/xattr.h> + +/* Magic value in attribute blocks */ +#define EXT4_XATTR_MAGIC 0xEA020000 + +/* Maximum number of references to one attribute block */ +#define EXT4_XATTR_REFCOUNT_MAX 1024 + +/* Name indexes */ +#define EXT4_XATTR_INDEX_USER 1 +#define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define EXT4_XATTR_INDEX_TRUSTED 4 +#define EXT4_XATTR_INDEX_LUSTRE 5 +#define EXT4_XATTR_INDEX_SECURITY 6 + +struct ext4_xattr_header { + __le32 h_magic; /* magic number for identification */ + __le32 h_refcount; /* reference count */ + __le32 h_blocks; /* number of disk blocks used */ + __le32 h_hash; /* hash value of all attributes */ + __u32 h_reserved[4]; /* zero right now */ +}; + +struct ext4_xattr_ibody_header { + __le32 h_magic; /* magic number for identification */ +}; + +struct ext4_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_offs; /* offset in disk block of value */ + __le32 e_value_block; /* disk block attribute is stored on (n/i) */ + __le32 e_value_size; /* size of attribute value */ + __le32 e_hash; /* hash value of name and value */ + char e_name[0]; /* attribute name */ +}; + +#define EXT4_XATTR_PAD_BITS 2 +#define EXT4_XATTR_PAD (1<<EXT4_XATTR_PAD_BITS) +#define EXT4_XATTR_ROUND (EXT4_XATTR_PAD-1) +#define EXT4_XATTR_LEN(name_len) \ + (((name_len) + EXT4_XATTR_ROUND + \ + sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND) +#define EXT4_XATTR_NEXT(entry) \ + ((struct ext4_xattr_entry *)( \ + (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len))) +#define EXT4_XATTR_SIZE(size) \ + (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND) + +#define IHDR(inode, raw_inode) \ + ((struct ext4_xattr_ibody_header *) \ + ((void *)raw_inode + \ + EXT4_GOOD_OLD_INODE_SIZE + \ + EXT4_I(inode)->i_extra_isize)) +#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) + +# ifdef CONFIG_EXT4_FS_XATTR + +extern struct xattr_handler ext4_xattr_user_handler; +extern struct xattr_handler ext4_xattr_trusted_handler; +extern struct xattr_handler ext4_xattr_acl_access_handler; +extern struct xattr_handler ext4_xattr_acl_default_handler; +extern struct xattr_handler ext4_xattr_security_handler; + +extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); + +extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); +extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); +extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); + +extern void ext4_xattr_delete_inode(handle_t *, struct inode *); +extern void ext4_xattr_put_super(struct super_block *); + +extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle); + +extern int init_ext4_xattr(void); +extern void exit_ext4_xattr(void); + +extern struct xattr_handler *ext4_xattr_handlers[]; + +# else /* CONFIG_EXT4_FS_XATTR */ + +static inline int +ext4_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext4_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline void +ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ +} + +static inline void +ext4_xattr_put_super(struct super_block *sb) +{ +} + +static inline int +init_ext4_xattr(void) +{ + return 0; +} + +static inline void +exit_ext4_xattr(void) +{ +} + +static inline int +ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle) +{ + return -EOPNOTSUPP; +} + +#define ext4_xattr_handlers NULL + +# endif /* CONFIG_EXT4_FS_XATTR */ + +#ifdef CONFIG_EXT4_FS_SECURITY +extern int ext4_init_security(handle_t *handle, struct inode *inode, + struct inode *dir); +#else +static inline int ext4_init_security(handle_t *handle, struct inode *inode, + struct inode *dir) +{ + return 0; +} +#endif diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c new file mode 100644 index 0000000..ca5f89f --- /dev/null +++ b/fs/ext4/xattr_security.c @@ -0,0 +1,76 @@ +/* + * linux/fs/ext4/xattr_security.c + * Handler for storing security labels as extended attributes. + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/fs.h> +#include <linux/security.h> +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" + +static size_t +ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size, + const char *name, size_t name_len) +{ + const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; + const size_t total_len = prefix_len + name_len + 1; + + + if (list && total_len <= list_size) { + memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext4_xattr_security_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, name, + buffer, size); +} + +static int +ext4_xattr_security_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name, + value, size, flags); +} + +int +ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir) +{ + int err; + size_t len; + void *value; + char *name; + + err = security_inode_init_security(inode, dir, &name, &value, &len); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY, + name, value, len, 0); + kfree(name); + kfree(value); + return err; +} + +struct xattr_handler ext4_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = ext4_xattr_security_list, + .get = ext4_xattr_security_get, + .set = ext4_xattr_security_set, +}; diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c new file mode 100644 index 0000000..ac1a52c --- /dev/null +++ b/fs/ext4/xattr_trusted.c @@ -0,0 +1,59 @@ +/* + * linux/fs/ext4/xattr_trusted.c + * Handler for trusted extended attributes. + * + * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/capability.h> +#include <linux/fs.h> +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" + +static size_t +ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, + const char *name, size_t name_len) +{ + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!capable(CAP_SYS_ADMIN)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext4_xattr_trusted_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED, name, + buffer, size); +} + +static int +ext4_xattr_trusted_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name, + value, size, flags); +} + +struct xattr_handler ext4_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ext4_xattr_trusted_list, + .get = ext4_xattr_trusted_get, + .set = ext4_xattr_trusted_set, +}; diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c new file mode 100644 index 0000000..d91aa61 --- /dev/null +++ b/fs/ext4/xattr_user.c @@ -0,0 +1,61 @@ +/* + * linux/fs/ext4/xattr_user.c + * Handler for extended user attributes. + * + * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/fs.h> +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" + +static size_t +ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, + const char *name, size_t name_len) +{ + const size_t prefix_len = XATTR_USER_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!test_opt(inode->i_sb, XATTR_USER)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_USER_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext4_xattr_user_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(inode->i_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, name, buffer, size); +} + +static int +ext4_xattr_user_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(inode->i_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name, + value, size, flags); +} + +struct xattr_handler ext4_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ext4_xattr_user_list, + .get = ext4_xattr_user_get, + .set = ext4_xattr_user_set, +}; |