diff options
Diffstat (limited to 'fs')
111 files changed, 1746 insertions, 1053 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 3de3b4a8..03c9e32 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -288,7 +288,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) fl->fl_end = OFFSET_MAX; else fl->fl_end = glock.start + glock.length - 1; - fl->fl_pid = glock.proc_id; + fl->fl_pid = -glock.proc_id; } kfree(glock.client_id); return res; @@ -445,7 +445,7 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, struct p9_wstat wstat; int retval; - retval = filemap_write_and_wait_range(inode->i_mapping, start, end); + retval = file_write_and_wait_range(filp, start, end); if (retval) return retval; @@ -468,7 +468,7 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, struct inode *inode = filp->f_mapping->host; int retval; - retval = filemap_write_and_wait_range(inode->i_mapping, start, end); + retval = file_write_and_wait_range(filp, start, end); if (retval) return retval; diff --git a/fs/affs/file.c b/fs/affs/file.c index 196ee7f..0033181 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -954,7 +954,7 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) struct inode *inode = filp->f_mapping->host; int ret, err; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(filp, start, end); if (err) return err; diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 100b207..c05f1f1 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -12,7 +12,6 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/errno.h> -#include <rxrpc/packet.h> #include "internal.h" #include "afs_fs.h" diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 02781e7..0bf191f 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -14,7 +14,6 @@ #include <net/sock.h> #include <net/af_rxrpc.h> -#include <rxrpc/packet.h> #include "internal.h" #include "afs_cm.h" @@ -293,6 +292,19 @@ static void afs_load_bvec(struct afs_call *call, struct msghdr *msg, } /* + * Advance the AFS call state when the RxRPC call ends the transmit phase. + */ +static void afs_notify_end_request_tx(struct sock *sock, + struct rxrpc_call *rxcall, + unsigned long call_user_ID) +{ + struct afs_call *call = (struct afs_call *)call_user_ID; + + if (call->state == AFS_CALL_REQUESTING) + call->state = AFS_CALL_AWAIT_REPLY; +} + +/* * attach the data from a bunch of pages on an inode to a call */ static int afs_send_pages(struct afs_call *call, struct msghdr *msg) @@ -311,14 +323,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg) bytes = msg->msg_iter.count; nr = msg->msg_iter.nr_segs; - /* Have to change the state *before* sending the last - * packet as RxRPC might give us the reply before it - * returns from sending the request. - */ - if (first + nr - 1 >= last) - call->state = AFS_CALL_AWAIT_REPLY; - ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, - msg, bytes); + ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, msg, + bytes, afs_notify_end_request_tx); for (loop = 0; loop < nr; loop++) put_page(bv[loop].bv_page); if (ret < 0) @@ -410,7 +416,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, if (!call->send_pages) call->state = AFS_CALL_AWAIT_REPLY; ret = rxrpc_kernel_send_data(afs_socket, rxcall, - &msg, call->request_size); + &msg, call->request_size, + afs_notify_end_request_tx); if (ret < 0) goto error_do_abort; @@ -742,6 +749,20 @@ static int afs_deliver_cm_op_id(struct afs_call *call) } /* + * Advance the AFS call state when an RxRPC service call ends the transmit + * phase. + */ +static void afs_notify_end_reply_tx(struct sock *sock, + struct rxrpc_call *rxcall, + unsigned long call_user_ID) +{ + struct afs_call *call = (struct afs_call *)call_user_ID; + + if (call->state == AFS_CALL_REPLYING) + call->state = AFS_CALL_AWAIT_ACK; +} + +/* * send an empty reply */ void afs_send_empty_reply(struct afs_call *call) @@ -760,7 +781,8 @@ void afs_send_empty_reply(struct afs_call *call) msg.msg_flags = 0; call->state = AFS_CALL_AWAIT_ACK; - switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0)) { + switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0, + afs_notify_end_reply_tx)) { case 0: _leave(" [replied]"); return; @@ -798,7 +820,8 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) msg.msg_flags = 0; call->state = AFS_CALL_AWAIT_ACK; - n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len); + n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len, + afs_notify_end_reply_tx); if (n >= 0) { /* Success */ _leave(" [replied]"); diff --git a/fs/afs/write.c b/fs/afs/write.c index 2d2fccd..106e43d 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -714,7 +714,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) vnode->fid.vid, vnode->fid.vnode, file, datasync); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); if (ret) return ret; inode_lock(inode); diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 64ae7447..8cd63e8 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -79,7 +79,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, err = ceph_mdsc_do_request(mdsc, inode, req); if (operation == CEPH_MDS_OP_GETFILELOCK) { - fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); + fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid); if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) fl->fl_type = F_RDLCK; else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type) diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 72a53bd..118a63e 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2522,7 +2522,7 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon, pLockData->fl_start = le64_to_cpu(parm_data->start); pLockData->fl_end = pLockData->fl_start + le64_to_cpu(parm_data->length) - 1; - pLockData->fl_pid = le32_to_cpu(parm_data->pid); + pLockData->fl_pid = -le32_to_cpu(parm_data->pid); } } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index bc09df6..0786f19 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2329,7 +2329,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, struct inode *inode = file_inode(file); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + rc = file_write_and_wait_range(file, start, end); if (rc) return rc; inode_lock(inode); @@ -2371,7 +2371,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); struct inode *inode = file->f_mapping->host; - rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + rc = file_write_and_wait_range(file, start, end); if (rc) return rc; inode_lock(inode); diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index ca7089a..fa08448 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -68,7 +68,7 @@ static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb, if (lkb->lkb_wait_type) seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); - seq_puts(s, "\n"); + seq_putc(s, '\n'); } static void print_format1(struct dlm_rsb *res, struct seq_file *s) @@ -111,7 +111,7 @@ static void print_format1(struct dlm_rsb *res, struct seq_file *s) } if (rsb_flag(res, RSB_VALNOTVALID)) seq_puts(s, " (INVALID)"); - seq_puts(s, "\n"); + seq_putc(s, '\n'); if (seq_has_overflowed(s)) goto out; } @@ -156,7 +156,7 @@ static void print_format1(struct dlm_rsb *res, struct seq_file *s) lkb->lkb_id, print_lockmode(lkb->lkb_rqmode)); if (lkb->lkb_wait_type) seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); - seq_puts(s, "\n"); + seq_putc(s, '\n'); if (seq_has_overflowed(s)) goto out; } @@ -287,7 +287,7 @@ static void print_format3(struct dlm_rsb *r, struct seq_file *s) else seq_printf(s, " %02x", (unsigned char)r->res_name[i]); } - seq_puts(s, "\n"); + seq_putc(s, '\n'); if (seq_has_overflowed(s)) goto out; @@ -298,7 +298,7 @@ static void print_format3(struct dlm_rsb *r, struct seq_file *s) for (i = 0; i < lvblen; i++) seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]); - seq_puts(s, "\n"); + seq_putc(s, '\n'); if (seq_has_overflowed(s)) goto out; @@ -361,8 +361,7 @@ static void print_format4(struct dlm_rsb *r, struct seq_file *s) else seq_printf(s, " %02x", (unsigned char)r->res_name[i]); } - seq_puts(s, "\n"); - + seq_putc(s, '\n'); unlock_rsb(r); } @@ -436,7 +435,7 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos) if (bucket >= ls->ls_rsbtbl_size) return NULL; - ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_NOFS); + ri = kzalloc(sizeof(*ri), GFP_NOFS); if (!ri) return NULL; if (n == 0) @@ -742,7 +741,7 @@ void dlm_delete_debug_file(struct dlm_ls *ls) int dlm_create_debug_file(struct dlm_ls *ls) { - char name[DLM_LOCKSPACE_LEN+8]; + char name[DLM_LOCKSPACE_LEN + 8]; /* format 1 */ @@ -757,7 +756,7 @@ int dlm_create_debug_file(struct dlm_ls *ls) /* format 2 */ memset(name, 0, sizeof(name)); - snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name); + snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_locks", ls->ls_name); ls->ls_debug_locks_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, @@ -770,7 +769,7 @@ int dlm_create_debug_file(struct dlm_ls *ls) /* format 3 */ memset(name, 0, sizeof(name)); - snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_all", ls->ls_name); + snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_all", ls->ls_name); ls->ls_debug_all_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, @@ -783,7 +782,7 @@ int dlm_create_debug_file(struct dlm_ls *ls) /* format 4 */ memset(name, 0, sizeof(name)); - snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_toss", ls->ls_name); + snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_toss", ls->ls_name); ls->ls_debug_toss_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, @@ -794,7 +793,7 @@ int dlm_create_debug_file(struct dlm_ls *ls) goto fail; memset(name, 0, sizeof(name)); - snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name); + snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_waiters", ls->ls_name); ls->ls_debug_waiters_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 6df3322..d4aadde 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1426,7 +1426,7 @@ void dlm_scan_waiters(struct dlm_ls *ls) if (!num_nodes) { num_nodes = ls->ls_num_nodes; - warned = kzalloc(num_nodes * sizeof(int), GFP_KERNEL); + warned = kcalloc(num_nodes, sizeof(int), GFP_KERNEL); } if (!warned) continue; @@ -5119,11 +5119,9 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) int wait_type, stub_unlock_result, stub_cancel_result; int dir_nodeid; - ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL); - if (!ms_stub) { - log_error(ls, "dlm_recover_waiters_pre no mem"); + ms_stub = kmalloc(sizeof(*ms_stub), GFP_KERNEL); + if (!ms_stub) return; - } mutex_lock(&ls->ls_waiters_mutex); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 91592b7..78a7c85 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -235,7 +235,7 @@ static int dlm_uevent(struct kset *kset, struct kobject *kobj, return 0; } -static struct kset_uevent_ops dlm_uevent_ops = { +static const struct kset_uevent_ops dlm_uevent_ops = { .uevent = dlm_uevent, }; @@ -453,9 +453,14 @@ static int new_lockspace(const char *name, const char *cluster, *ops_result = 0; } + if (!cluster) + log_print("dlm cluster name '%s' is being used without an application provided cluster name", + dlm_config.ci_cluster_name); + if (dlm_config.ci_recover_callbacks && cluster && strncmp(cluster, dlm_config.ci_cluster_name, DLM_LOCKSPACE_LEN)) { - log_print("dlm cluster name %s mismatch %s", + log_print("dlm cluster name '%s' does not match " + "the application cluster name '%s'", dlm_config.ci_cluster_name, cluster); error = -EBADR; goto out; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 9382db9..4813d0e 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -729,7 +729,7 @@ static int tcp_accept_from_sock(struct connection *con) mutex_unlock(&connections_lock); memset(&peeraddr, 0, sizeof(peeraddr)); - result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + result = sock_create_lite(dlm_local_addr[0]->ss_family, SOCK_STREAM, IPPROTO_TCP, &newsock); if (result < 0) return -ENOMEM; diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 9c47f1c..3fda383 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -217,8 +217,7 @@ int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, } array_size = max + need; - - array = kzalloc(array_size * sizeof(struct dlm_slot), GFP_NOFS); + array = kcalloc(array_size, sizeof(*array), GFP_NOFS); if (!array) return -ENOMEM; @@ -319,7 +318,7 @@ static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node) struct dlm_member *memb; int error; - memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); + memb = kzalloc(sizeof(*memb), GFP_NOFS); if (!memb) return -ENOMEM; @@ -405,8 +404,7 @@ static void make_member_array(struct dlm_ls *ls) } ls->ls_total_weight = total; - - array = kmalloc(sizeof(int) * total, GFP_NOFS); + array = kmalloc_array(total, sizeof(*array), GFP_NOFS); if (!array) return; @@ -492,8 +490,7 @@ void dlm_lsop_recover_done(struct dlm_ls *ls) return; num = ls->ls_num_nodes; - - slots = kzalloc(num * sizeof(struct dlm_slot), GFP_KERNEL); + slots = kcalloc(num, sizeof(*slots), GFP_KERNEL); if (!slots) return; @@ -673,11 +670,11 @@ int dlm_ls_stop(struct dlm_ls *ls) int dlm_ls_start(struct dlm_ls *ls) { - struct dlm_recover *rv = NULL, *rv_old; + struct dlm_recover *rv, *rv_old; struct dlm_config_node *nodes; int error, count; - rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS); + rv = kzalloc(sizeof(*rv), GFP_NOFS); if (!rv) return -ENOMEM; diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index d401425..e631b16 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -367,7 +367,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, locks_init_lock(fl); fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK; fl->fl_flags = FL_POSIX; - fl->fl_pid = op->info.pid; + fl->fl_pid = -op->info.pid; fl->fl_start = op->info.start; fl->fl_end = op->info.end; rv = 0; diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 23488f5..d18e7a5 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -123,6 +123,8 @@ static void compat_input(struct dlm_write_request *kb, static void compat_output(struct dlm_lock_result *res, struct dlm_lock_result32 *res32) { + memset(res32, 0, sizeof(*res32)); + res32->version[0] = res->version[0]; res32->version[1] = res->version[1]; res32->version[2] = res->version[2]; @@ -355,6 +357,10 @@ static int dlm_device_register(struct dlm_ls *ls, char *name) error = misc_register(&ls->ls_device); if (error) { kfree(ls->ls_device.name); + /* this has to be set to NULL + * to avoid a double-free in dlm_device_deregister + */ + ls->ls_device.name = NULL; } fail: return error; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index ca4e837..c74ed3c 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -328,7 +328,7 @@ ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { int rc; - rc = filemap_write_and_wait(file->f_mapping); + rc = file_write_and_wait(file); if (rc) return rc; diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 28645f0..a94594e 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -48,7 +48,7 @@ static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end, struct inode *inode = filp->f_mapping->host; int ret; - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(filp, start, end); if (ret) return ret; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index e8b3650..b04e882 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -411,7 +411,7 @@ static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, { struct dir_private_info *p; - p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); + p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) return NULL; p->curr_hash = pos2maj_hash(filp, pos); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a2bb7d2..84b9da1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -838,13 +838,11 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) { if (unlikely(sizeof(time->tv_sec) > 4 && (extra & cpu_to_le32(EXT4_EPOCH_MASK)))) { -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,20,0) + +#if 1 /* Handle legacy encoding of pre-1970 dates with epoch - * bits 1,1. We assume that by kernel version 4.20, - * everyone will have run fsck over the affected - * filesystems to correct the problem. (This - * backwards compatibility may be removed before this - * time, at the discretion of the ext4 developers.) + * bits 1,1. (This backwards compatibility may be removed + * at the discretion of the ext4 developers.) */ u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK; if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0) @@ -1567,6 +1565,7 @@ enum { nolocking */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ + EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 484c132..57dcaea 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -279,7 +279,20 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, handle_t *handle = NULL; struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; - bool write = vmf->flags & FAULT_FLAG_WRITE; + + /* + * We have to distinguish real writes from writes which will result in a + * COW page; COW writes should *not* poke the journal (the file will not + * be changed). Doing so would cause unintended failures when mounted + * read-only. + * + * We check for VM_SHARED rather than vmf->cow_page since the latter is + * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for + * other sizes, dax_iomap_fault will handle splitting / fallback so that + * we eventually come back with a COW page. + */ + bool write = (vmf->flags & FAULT_FLAG_WRITE) && + (vmf->vma->vm_flags & VM_SHARED); if (write) { sb_start_pagefault(sb); @@ -557,7 +570,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) inode_lock(inode); isize = i_size_read(inode); - if (offset >= isize) { + if (offset < 0 || offset >= isize) { inode_unlock(inode); return -ENXIO; } @@ -620,7 +633,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) inode_lock(inode); isize = i_size_read(inode); - if (offset >= isize) { + if (offset < 0 || offset >= isize) { inode_unlock(inode); return -ENXIO; } diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 38b8a96..00c6dd2 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -148,8 +148,6 @@ static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) if (len > num*4) len = num * 4; for (i = 0; i < len; i++) { - if ((i % 4) == 0) - val = pad; val = ((int) scp[i]) + (val << 8); if ((i % 4) == 3) { *buf++ = val; @@ -176,8 +174,6 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) if (len > num*4) len = num * 4; for (i = 0; i < len; i++) { - if ((i % 4) == 0) - val = pad; val = ((int) ucp[i]) + (val << 8); if ((i % 4) == 3) { *buf++ = val; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 507bfb3..71e93a2 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -692,24 +692,25 @@ static int find_group_other(struct super_block *sb, struct inode *parent, * somewhat arbitrary...) */ #define RECENTCY_MIN 5 -#define RECENTCY_DIRTY 30 +#define RECENTCY_DIRTY 300 static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) { struct ext4_group_desc *gdp; struct ext4_inode *raw_inode; struct buffer_head *bh; - unsigned long dtime, now; - int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; - int offset, ret = 0, recentcy = RECENTCY_MIN; + int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int offset, ret = 0; + int recentcy = RECENTCY_MIN; + u32 dtime, now; gdp = ext4_get_group_desc(sb, group, NULL); if (unlikely(!gdp)) return 0; - bh = sb_getblk(sb, ext4_inode_table(sb, gdp) + + bh = sb_find_get_block(sb, ext4_inode_table(sb, gdp) + (ino / inodes_per_block)); - if (unlikely(!bh) || !buffer_uptodate(bh)) + if (!bh || !buffer_uptodate(bh)) /* * If the block is not in the buffer cache, then it * must have been written out. @@ -718,18 +719,45 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb); raw_inode = (struct ext4_inode *) (bh->b_data + offset); + + /* i_dtime is only 32 bits on disk, but we only care about relative + * times in the range of a few minutes (i.e. long enough to sync a + * recently-deleted inode to disk), so using the low 32 bits of the + * clock (a 68 year range) is enough, see time_before32() */ dtime = le32_to_cpu(raw_inode->i_dtime); - now = get_seconds(); + now = ktime_get_real_seconds(); if (buffer_dirty(bh)) recentcy += RECENTCY_DIRTY; - if (dtime && (dtime < now) && (now < dtime + recentcy)) + if (dtime && time_before32(dtime, now) && + time_before32(now, dtime + recentcy)) ret = 1; out: brelse(bh); return ret; } +static int find_inode_bit(struct super_block *sb, ext4_group_t group, + struct buffer_head *bitmap, unsigned long *ino) +{ +next: + *ino = ext4_find_next_zero_bit((unsigned long *) + bitmap->b_data, + EXT4_INODES_PER_GROUP(sb), *ino); + if (*ino >= EXT4_INODES_PER_GROUP(sb)) + return 0; + + if ((EXT4_SB(sb)->s_journal == NULL) && + recently_deleted(sb, group, *ino)) { + *ino = *ino + 1; + if (*ino < EXT4_INODES_PER_GROUP(sb)) + goto next; + return 0; + } + + return 1; +} + /* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both @@ -892,19 +920,13 @@ got_group: /* * Check free inodes count before loading bitmap. */ - if (ext4_free_inodes_count(sb, gdp) == 0) { - if (++group == ngroups) - group = 0; - continue; - } + if (ext4_free_inodes_count(sb, gdp) == 0) + goto next_group; grp = ext4_get_group_info(sb, group); /* Skip groups with already-known suspicious inode tables */ - if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - if (++group == ngroups) - group = 0; - continue; - } + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + goto next_group; brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); @@ -912,27 +934,20 @@ got_group: if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || IS_ERR(inode_bitmap_bh)) { inode_bitmap_bh = NULL; - if (++group == ngroups) - group = 0; - continue; + goto next_group; } repeat_in_this_group: - ino = ext4_find_next_zero_bit((unsigned long *) - inode_bitmap_bh->b_data, - EXT4_INODES_PER_GROUP(sb), ino); - if (ino >= EXT4_INODES_PER_GROUP(sb)) + ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); + if (!ret2) goto next_group; - if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { + + if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) { ext4_error(sb, "reserved inode found cleared - " "inode=%lu", ino + 1); - continue; - } - if ((EXT4_SB(sb)->s_journal == NULL) && - recently_deleted(sb, group, ino)) { - ino++; - goto next_inode; + goto next_group; } + if (!handle) { BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(dir->i_sb, line_no, @@ -952,11 +967,23 @@ repeat_in_this_group: } ext4_lock_group(sb, group); ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); + if (ret2) { + /* Someone already took the bit. Repeat the search + * with lock held. + */ + ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); + if (ret2) { + ext4_set_bit(ino, inode_bitmap_bh->b_data); + ret2 = 0; + } else { + ret2 = 1; /* we didn't grab the inode */ + } + } ext4_unlock_group(sb, group); ino++; /* the inode bitmap is zero-based */ if (!ret2) goto got; /* we grabbed the inode! */ -next_inode: + if (ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; next_group: diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ce68a34..e963508 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4890,14 +4890,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) brelse(iloc.bh); ext4_set_inode_flags(inode); - if (ei->i_flags & EXT4_EA_INODE_FL) { - ext4_xattr_inode_set_class(inode); - - inode_lock(inode); - inode->i_flags |= S_NOQUOTA; - inode_unlock(inode); - } - unlock_new_inode(inode); return inode; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index eb98356..77cdce1 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -367,7 +367,7 @@ skip: goto failed; } - mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); + mmpd_data = kmalloc(sizeof(*mmpd_data), GFP_KERNEL); if (!mmpd_data) { ext4_warning(sb, "not enough memory for mmpd_data"); goto failed; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d61a70e2..c9e7be5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2404,6 +2404,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, unsigned int s_flags = sb->s_flags; int ret, nr_orphans = 0, nr_truncates = 0; #ifdef CONFIG_QUOTA + int quota_update = 0; int i; #endif if (!es->s_last_orphan) { @@ -2442,14 +2443,32 @@ static void ext4_orphan_cleanup(struct super_block *sb, #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ sb->s_flags |= MS_ACTIVE; - /* Turn on quotas so that they are updated correctly */ + + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. + */ + if (ext4_has_feature_quota(sb) && (s_flags & MS_RDONLY)) { + int ret = ext4_enable_quotas(sb); + + if (!ret) + quota_update = 1; + else + ext4_msg(sb, KERN_ERR, + "Cannot turn on quotas: error %d", ret); + } + + /* Turn on journaled quotas used for old sytle */ for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); - if (ret < 0) + + if (!ret) + quota_update = 1; + else ext4_msg(sb, KERN_ERR, "Cannot turn on journaled " - "quota: error %d", ret); + "quota: type %d: error %d", i, ret); } } #endif @@ -2510,10 +2529,12 @@ static void ext4_orphan_cleanup(struct super_block *sb, ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA - /* Turn quotas off */ - for (i = 0; i < EXT4_MAXQUOTAS; i++) { - if (sb_dqopt(sb)->files[i]) - dquot_quota_off(sb, i); + /* Turn off quotas if they were enabled for orphan cleanup */ + if (quota_update) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + dquot_quota_off(sb, i); + } } #endif sb->s_flags = s_flags; /* Restore MS_RDONLY status */ @@ -5512,6 +5533,9 @@ static int ext4_enable_quotas(struct super_block *sb) DQUOT_USAGE_ENABLED | (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); if (err) { + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + ext4_warning(sb, "Failed to enable quota tracking " "(type=%d, err=%d). Please run " diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 3dd9701..3b69330 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -354,8 +354,10 @@ free_bhs: return ret; } +#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec) + static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, - struct inode **ea_inode) + u32 ea_inode_hash, struct inode **ea_inode) { struct inode *inode; int err; @@ -385,6 +387,24 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, goto error; } + ext4_xattr_inode_set_class(inode); + + /* + * Check whether this is an old Lustre-style xattr inode. Lustre + * implementation does not have hash validation, rather it has a + * backpointer from ea_inode to the parent inode. + */ + if (ea_inode_hash != ext4_xattr_inode_get_hash(inode) && + EXT4_XATTR_INODE_GET_PARENT(inode) == parent->i_ino && + inode->i_generation == parent->i_generation) { + ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE); + ext4_xattr_inode_set_ref(inode, 1); + } else { + inode_lock(inode); + inode->i_flags |= S_NOQUOTA; + inode_unlock(inode); + } + *ea_inode = inode; return 0; error: @@ -417,8 +437,6 @@ ext4_xattr_inode_verify_hashes(struct inode *ea_inode, return 0; } -#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec) - /* * Read xattr value from the EA inode. */ @@ -431,7 +449,7 @@ ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry, int err; err = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum), - &ea_inode); + le32_to_cpu(entry->e_hash), &ea_inode); if (err) { ea_inode = NULL; goto out; @@ -449,29 +467,20 @@ ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry, if (err) goto out; - err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, size); - /* - * Compatibility check for old Lustre ea_inode implementation. Old - * version does not have hash validation, but it has a backpointer - * from ea_inode to the parent inode. - */ - if (err == -EFSCORRUPTED) { - if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != inode->i_ino || - ea_inode->i_generation != inode->i_generation) { + if (!ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) { + err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, + size); + if (err) { ext4_warning_inode(ea_inode, "EA inode hash validation failed"); goto out; } - /* Do not add ea_inode to the cache. */ - ea_inode_cache = NULL; - err = 0; - } else if (err) - goto out; - if (ea_inode_cache) - mb_cache_entry_create(ea_inode_cache, GFP_NOFS, - ext4_xattr_inode_get_hash(ea_inode), - ea_inode->i_ino, true /* reusable */); + if (ea_inode_cache) + mb_cache_entry_create(ea_inode_cache, GFP_NOFS, + ext4_xattr_inode_get_hash(ea_inode), + ea_inode->i_ino, true /* reusable */); + } out: iput(ea_inode); return err; @@ -838,10 +847,15 @@ static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len) return err; } -static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len) +static void ext4_xattr_inode_free_quota(struct inode *parent, + struct inode *ea_inode, + size_t len) { - dquot_free_space_nodirty(inode, round_up_cluster(inode, len)); - dquot_free_inode(inode); + if (ea_inode && + ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) + return; + dquot_free_space_nodirty(parent, round_up_cluster(parent, len)); + dquot_free_inode(parent); } int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, @@ -1071,7 +1085,9 @@ static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent, if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); - err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode); + err = ext4_xattr_inode_iget(parent, ea_ino, + le32_to_cpu(entry->e_hash), + &ea_inode); if (err) goto cleanup; err = ext4_xattr_inode_inc_ref(handle, ea_inode); @@ -1093,7 +1109,9 @@ cleanup: if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); - err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode); + err = ext4_xattr_inode_iget(parent, ea_ino, + le32_to_cpu(entry->e_hash), + &ea_inode); if (err) { ext4_warning(parent->i_sb, "cleanup ea_ino %u iget error %d", ea_ino, @@ -1131,7 +1149,9 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); - err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode); + err = ext4_xattr_inode_iget(parent, ea_ino, + le32_to_cpu(entry->e_hash), + &ea_inode); if (err) continue; @@ -1159,7 +1179,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, } if (!skip_quota) - ext4_xattr_inode_free_quota(parent, + ext4_xattr_inode_free_quota(parent, ea_inode, le32_to_cpu(entry->e_value_size)); /* @@ -1591,6 +1611,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, if (!s->not_found && here->e_value_inum) { ret = ext4_xattr_inode_iget(inode, le32_to_cpu(here->e_value_inum), + le32_to_cpu(here->e_hash), &old_ea_inode); if (ret) { old_ea_inode = NULL; @@ -1609,7 +1630,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, &new_ea_inode); if (ret) { new_ea_inode = NULL; - ext4_xattr_inode_free_quota(inode, i->value_len); + ext4_xattr_inode_free_quota(inode, NULL, i->value_len); goto out; } } @@ -1628,13 +1649,13 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, ext4_warning_inode(new_ea_inode, "dec ref new_ea_inode err=%d", err); - ext4_xattr_inode_free_quota(inode, + ext4_xattr_inode_free_quota(inode, new_ea_inode, i->value_len); } goto out; } - ext4_xattr_inode_free_quota(inode, + ext4_xattr_inode_free_quota(inode, old_ea_inode, le32_to_cpu(here->e_value_size)); } @@ -1805,8 +1826,10 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct mb_cache_entry *ce = NULL; int error = 0; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); - struct inode *ea_inode = NULL; - size_t old_ea_inode_size = 0; + struct inode *ea_inode = NULL, *tmp_inode; + size_t old_ea_inode_quota = 0; + unsigned int ea_ino; + #define header(x) ((struct ext4_xattr_header *)(x)) @@ -1865,12 +1888,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, * like it has an empty value. */ if (!s->not_found && s->here->e_value_inum) { - /* - * Defer quota free call for previous inode - * until success is guaranteed. - */ - old_ea_inode_size = le32_to_cpu( + ea_ino = le32_to_cpu(s->here->e_value_inum); + error = ext4_xattr_inode_iget(inode, ea_ino, + le32_to_cpu(s->here->e_hash), + &tmp_inode); + if (error) + goto cleanup; + + if (!ext4_test_inode_state(tmp_inode, + EXT4_STATE_LUSTRE_EA_INODE)) { + /* + * Defer quota free call for previous + * inode until success is guaranteed. + */ + old_ea_inode_quota = le32_to_cpu( s->here->e_value_size); + } + iput(tmp_inode); + s->here->e_value_inum = 0; s->here->e_value_size = 0; } @@ -1897,8 +1932,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, goto cleanup; if (i->value && s->here->e_value_inum) { - unsigned int ea_ino; - /* * A ref count on ea_inode has been taken as part of the call to * ext4_xattr_set_entry() above. We would like to drop this @@ -1906,7 +1939,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, * initialized and has its own ref count on the ea_inode. */ ea_ino = le32_to_cpu(s->here->e_value_inum); - error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode); + error = ext4_xattr_inode_iget(inode, ea_ino, + le32_to_cpu(s->here->e_hash), + &ea_inode); if (error) { ea_inode = NULL; goto cleanup; @@ -2056,8 +2091,8 @@ getblk_failed: } } - if (old_ea_inode_size) - ext4_xattr_inode_free_quota(inode, old_ea_inode_size); + if (old_ea_inode_quota) + ext4_xattr_inode_free_quota(inode, NULL, old_ea_inode_quota); /* Update the inode. */ EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; @@ -2084,7 +2119,7 @@ cleanup: /* If there was an error, revert the quota charge. */ if (error) - ext4_xattr_inode_free_quota(inode, + ext4_xattr_inode_free_quota(inode, ea_inode, i_size_read(ea_inode)); iput(ea_inode); } @@ -2800,6 +2835,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct ext4_xattr_ibody_header *header; struct ext4_iloc iloc = { .bh = NULL }; struct ext4_xattr_entry *entry; + struct inode *ea_inode; int error; error = ext4_xattr_ensure_credits(handle, inode, extra_credits, @@ -2854,10 +2890,19 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, if (ext4_has_feature_ea_inode(inode->i_sb)) { for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); - entry = EXT4_XATTR_NEXT(entry)) - if (entry->e_value_inum) - ext4_xattr_inode_free_quota(inode, + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + error = ext4_xattr_inode_iget(inode, + le32_to_cpu(entry->e_value_inum), + le32_to_cpu(entry->e_hash), + &ea_inode); + if (error) + continue; + ext4_xattr_inode_free_quota(inode, ea_inode, le32_to_cpu(entry->e_value_size)); + iput(ea_inode); + } } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2706130..843a0d9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -206,7 +206,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, /* if fdatasync is triggered, let's do in-place-update */ if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) set_inode_flag(inode, FI_NEED_IPU); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); clear_inode_flag(inode, FI_NEED_IPU); if (ret) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ab60051..d667898 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -457,7 +457,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, * wait for all outstanding writes, before sending the FSYNC * request. */ - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(file, start, end); if (err) goto out; @@ -465,10 +465,10 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, /* * Due to implementation of fuse writeback - * filemap_write_and_wait_range() does not catch errors. + * file_write_and_wait_range() does not catch errors. * We have to do this directly after fuse_sync_writes() */ - err = filemap_check_errors(file->f_mapping); + err = file_check_and_advance_wb_err(file); if (err) goto out; @@ -2102,11 +2102,11 @@ static int convert_fuse_file_lock(struct fuse_conn *fc, fl->fl_end = ffl->end; /* - * Convert pid into the caller's pid namespace. If the pid - * does not map into the namespace fl_pid will get set to 0. + * Convert pid into init's pid namespace. The locks API will + * translate it into the caller's pid namespace. */ rcu_read_lock(); - fl->fl_pid = pid_vnr(find_pid_ns(ffl->pid, fc->pid_ns)); + fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns); rcu_read_unlock(); break; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 2524807..9d5eecb 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -86,19 +86,6 @@ int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) char *data; const char *name = gfs2_acl_name(type); - if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) - return -E2BIG; - - if (type == ACL_TYPE_ACCESS) { - umode_t mode = inode->i_mode; - - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) - return error; - if (mode != inode->i_mode) - mark_inode_dirty(inode); - } - if (acl) { len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); if (len == 0) @@ -129,6 +116,10 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) struct gfs2_holder gh; bool need_unlock = false; int ret; + umode_t mode; + + if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) + return -E2BIG; ret = gfs2_rsqa_alloc(ip); if (ret) @@ -140,7 +131,20 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) return ret; need_unlock = true; } + + mode = inode->i_mode; + if (type == ACL_TYPE_ACCESS && acl) { + ret = posix_acl_update_mode(inode, &mode, &acl); + if (ret) + goto unlock; + } + ret = __gfs2_set_acl(inode, acl, type); + if (!ret && mode != inode->i_mode) { + inode->i_mode = mode; + mark_inode_dirty(inode); + } +unlock: if (need_unlock) gfs2_glock_dq_uninit(&gh); return ret; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index ed7a2e2..68ed069 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -234,7 +234,19 @@ out: static int gfs2_writepages(struct address_space *mapping, struct writeback_control *wbc) { - return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); + struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); + int ret = mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); + + /* + * Even if we didn't write any pages here, we might still be holding + * dirty pages in the ail. We forcibly flush the ail because we don't + * want balance_dirty_pages() to loop indefinitely trying to write out + * pages held in the ail that it can't find. + */ + if (ret == 0) + set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags); + + return ret; } /** diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 9fa3aef..3dd0cce 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -291,8 +291,9 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, if (trylock_buffer(rabh)) { if (!buffer_uptodate(rabh)) { rabh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META, - rabh); + submit_bh(REQ_OP_READ, + REQ_RAHEAD | REQ_META | REQ_PRIO, + rabh); continue; } unlock_buffer(rabh); @@ -1103,8 +1104,15 @@ static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp, while (true) { ptr = metapointer(h, mp); - if (*ptr) /* if we have a non-null pointer */ + if (*ptr) { /* if we have a non-null pointer */ + /* Now zero the metapath after the current height. */ + h++; + if (h < GFS2_MAX_META_HEIGHT) + memset(&mp->mp_list[h], 0, + (GFS2_MAX_META_HEIGHT - h) * + sizeof(mp->mp_list[0])); return true; + } if (mp->mp_list[h] < ptrs) mp->mp_list[h]++; @@ -1120,6 +1128,13 @@ enum dealloc_states { DEALLOC_DONE = 3, /* process complete */ }; +static bool mp_eq_to_hgt(struct metapath *mp, __u16 *nbof, unsigned int h) +{ + if (memcmp(mp->mp_list, nbof, h * sizeof(mp->mp_list[0]))) + return false; + return true; +} + /** * trunc_dealloc - truncate a file down to a desired size * @ip: inode to truncate @@ -1197,8 +1212,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize) /* If we're truncating to a non-zero size and the mp is at the beginning of file for the strip height, we need to preserve the first metadata pointer. */ - preserve1 = (newsize && - (mp.mp_list[mp_h] == nbof[mp_h])); + preserve1 = (newsize && mp_eq_to_hgt(&mp, nbof, mp_h)); bh = mp.mp_bh[mp_h]; gfs2_assert_withdraw(sdp, bh); if (gfs2_assert_withdraw(sdp, diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 5ee2e2f..06a0d19 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1513,7 +1513,9 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, continue; } bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META, bh); + submit_bh(REQ_OP_READ, + REQ_RAHEAD | REQ_META | REQ_PRIO, + bh); continue; } brelse(bh); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c2062a1..33a0cb5 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -668,12 +668,14 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, if (ret) return ret; if (gfs2_is_jdata(ip)) - filemap_write_and_wait(mapping); + ret = file_write_and_wait(file); + if (ret) + return ret; gfs2_ail_flush(ip->i_gl, 1); } if (mapping->nrpages) - ret = filemap_fdatawait_range(mapping, start, end); + ret = file_fdatawait_range(file, start, end); return ret ? ret : ret1; } @@ -1030,8 +1032,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) mutex_lock(&fp->f_fl_mutex); - gl = fl_gh->gh_gl; - if (gl) { + if (gfs2_holder_initialized(fl_gh)) { if (fl_gh->gh_state == state) goto out; locks_lock_file_wait(file, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c38ab6c..98e845b 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -15,6 +15,7 @@ #include <linux/buffer_head.h> #include <linux/delay.h> #include <linux/sort.h> +#include <linux/hash.h> #include <linux/jhash.h> #include <linux/kallsyms.h> #include <linux/gfs2_ondisk.h> @@ -71,7 +72,7 @@ static DEFINE_SPINLOCK(lru_lock); #define GFS2_GL_HASH_SHIFT 15 #define GFS2_GL_HASH_SIZE BIT(GFS2_GL_HASH_SHIFT) -static struct rhashtable_params ht_parms = { +static const struct rhashtable_params ht_parms = { .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4, .key_len = offsetofend(struct lm_lockname, ln_type), .key_offset = offsetof(struct gfs2_glock, gl_name), @@ -80,6 +81,49 @@ static struct rhashtable_params ht_parms = { static struct rhashtable gl_hash_table; +#define GLOCK_WAIT_TABLE_BITS 12 +#define GLOCK_WAIT_TABLE_SIZE (1 << GLOCK_WAIT_TABLE_BITS) +static wait_queue_head_t glock_wait_table[GLOCK_WAIT_TABLE_SIZE] __cacheline_aligned; + +struct wait_glock_queue { + struct lm_lockname *name; + wait_queue_entry_t wait; +}; + +static int glock_wake_function(wait_queue_entry_t *wait, unsigned int mode, + int sync, void *key) +{ + struct wait_glock_queue *wait_glock = + container_of(wait, struct wait_glock_queue, wait); + struct lm_lockname *wait_name = wait_glock->name; + struct lm_lockname *wake_name = key; + + if (wake_name->ln_sbd != wait_name->ln_sbd || + wake_name->ln_number != wait_name->ln_number || + wake_name->ln_type != wait_name->ln_type) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + +static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name) +{ + u32 hash = jhash2((u32 *)name, sizeof(*name) / 4, 0); + + return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS); +} + +/** + * wake_up_glock - Wake up waiters on a glock + * @gl: the glock + */ +static void wake_up_glock(struct gfs2_glock *gl) +{ + wait_queue_head_t *wq = glock_waitqueue(&gl->gl_name); + + if (waitqueue_active(wq)) + __wake_up(wq, TASK_NORMAL, 1, &gl->gl_name); +} + static void gfs2_glock_dealloc(struct rcu_head *rcu) { struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); @@ -96,6 +140,9 @@ void gfs2_glock_free(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); + smp_mb(); + wake_up_glock(gl); call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) wake_up(&sdp->sd_glock_wait); @@ -107,7 +154,7 @@ void gfs2_glock_free(struct gfs2_glock *gl) * */ -static void gfs2_glock_hold(struct gfs2_glock *gl) +void gfs2_glock_hold(struct gfs2_glock *gl) { GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); lockref_get(&gl->gl_lockref); @@ -150,6 +197,9 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl) static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) { + if (!(gl->gl_ops->go_flags & GLOF_LRU)) + return; + spin_lock(&lru_lock); if (!list_empty(&gl->gl_lru)) { list_del_init(&gl->gl_lru); @@ -191,13 +241,20 @@ static void __gfs2_glock_put(struct gfs2_glock *gl) gfs2_glock_remove_from_lru(gl); spin_unlock(&gl->gl_lockref.lock); - rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); GLOCK_BUG_ON(gl, mapping && mapping->nrpages); trace_gfs2_glock_put(gl); sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); } +/* + * Cause the glock to be put in work queue context. + */ +void gfs2_glock_queue_put(struct gfs2_glock *gl) +{ + gfs2_glock_queue_work(gl, 0); +} + /** * gfs2_glock_put() - Decrement reference count on glock * @gl: The glock to put @@ -676,6 +733,40 @@ static void glock_work_func(struct work_struct *work) spin_unlock(&gl->gl_lockref.lock); } +static struct gfs2_glock *find_insert_glock(struct lm_lockname *name, + struct gfs2_glock *new) +{ + struct wait_glock_queue wait; + wait_queue_head_t *wq = glock_waitqueue(name); + struct gfs2_glock *gl; + + wait.name = name; + init_wait(&wait.wait); + wait.wait.func = glock_wake_function; + +again: + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + rcu_read_lock(); + if (new) { + gl = rhashtable_lookup_get_insert_fast(&gl_hash_table, + &new->gl_node, ht_parms); + if (IS_ERR(gl)) + goto out; + } else { + gl = rhashtable_lookup_fast(&gl_hash_table, + name, ht_parms); + } + if (gl && !lockref_get_not_dead(&gl->gl_lockref)) { + rcu_read_unlock(); + schedule(); + goto again; + } +out: + rcu_read_unlock(); + finish_wait(wq, &wait.wait); + return gl; +} + /** * gfs2_glock_get() - Get a glock, or create one if one doesn't exist * @sdp: The GFS2 superblock @@ -702,15 +793,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, struct kmem_cache *cachep; int ret = 0; - rcu_read_lock(); - gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); - if (gl && !lockref_get_not_dead(&gl->gl_lockref)) - gl = NULL; - rcu_read_unlock(); - - *glp = gl; - if (gl) + gl = find_insert_glock(&name, NULL); + if (gl) { + *glp = gl; return 0; + } if (!create) return -ENOENT; @@ -764,10 +851,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping->writeback_index = 0; } -again: - rcu_read_lock(); - tmp = rhashtable_lookup_get_insert_fast(&gl_hash_table, &gl->gl_node, - ht_parms); + tmp = find_insert_glock(&name, gl); if (!tmp) { *glp = gl; goto out; @@ -776,13 +860,7 @@ again: ret = PTR_ERR(tmp); goto out_free; } - if (lockref_get_not_dead(&tmp->gl_lockref)) { - *glp = tmp; - goto out_free; - } - rcu_read_unlock(); - cond_resched(); - goto again; + *glp = tmp; out_free: kfree(gl->gl_lksb.sb_lvbptr); @@ -790,7 +868,6 @@ out_free: atomic_dec(&sdp->sd_glock_disposal); out: - rcu_read_unlock(); return ret; } @@ -1473,14 +1550,15 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) do { gl = ERR_PTR(rhashtable_walk_start(&iter)); - if (gl) - continue; + if (IS_ERR(gl)) + goto walk_stop; while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) - if ((gl->gl_name.ln_sbd == sdp) && + if (gl->gl_name.ln_sbd == sdp && lockref_get_not_dead(&gl->gl_lockref)) examiner(gl); +walk_stop: rhashtable_walk_stop(&iter); } while (cond_resched(), gl == ERR_PTR(-EAGAIN)); @@ -1803,7 +1881,7 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr) int __init gfs2_glock_init(void) { - int ret; + int i, ret; ret = rhashtable_init(&gl_hash_table, &ht_parms); if (ret < 0) @@ -1832,6 +1910,9 @@ int __init gfs2_glock_init(void) return ret; } + for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++) + init_waitqueue_head(glock_wait_table + i); + return 0; } @@ -1860,6 +1941,7 @@ static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi) } static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) { struct gfs2_glock_iter *gi = seq->private; loff_t n = *pos; @@ -1892,6 +1974,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, } static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) + __releases(RCU) { struct gfs2_glock_iter *gi = seq->private; diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 9ad4a6a..5e12220 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -13,6 +13,7 @@ #include <linux/sched.h> #include <linux/parser.h> #include "incore.h" +#include "util.h" /* Options for hostdata parser */ @@ -181,7 +182,9 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp); +extern void gfs2_glock_hold(struct gfs2_glock *gl); extern void gfs2_glock_put(struct gfs2_glock *gl); +extern void gfs2_glock_queue_put(struct gfs2_glock *gl); extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags, struct gfs2_holder *gh); extern void gfs2_holder_reinit(unsigned int state, u16 flags, @@ -257,11 +260,44 @@ static inline bool gfs2_holder_initialized(struct gfs2_holder *gh) return gh->gh_gl; } +/** + * glock_set_object - set the gl_object field of a glock + * @gl: the glock + * @object: the object + */ static inline void glock_set_object(struct gfs2_glock *gl, void *object) { spin_lock(&gl->gl_lockref.lock); + if (gfs2_assert_warn(gl->gl_name.ln_sbd, gl->gl_object == NULL)) + gfs2_dump_glock(NULL, gl); gl->gl_object = object; spin_unlock(&gl->gl_lockref.lock); } +/** + * glock_clear_object - clear the gl_object field of a glock + * @gl: the glock + * @object: the object + * + * I'd love to similarly add this: + * else if (gfs2_assert_warn(gl->gl_sbd, gl->gl_object == object)) + * gfs2_dump_glock(NULL, gl); + * Unfortunately, that's not possible because as soon as gfs2_delete_inode + * frees the block in the rgrp, another process can reassign it for an I_NEW + * inode in gfs2_create_inode because that calls new_inode, not gfs2_iget. + * That means gfs2_delete_inode may subsequently try to call this function + * for a glock that's already pointing to a brand new inode. If we clear the + * new inode's gl_object, we'll introduce metadata corruption. Function + * gfs2_delete_inode calls clear_inode which calls gfs2_clear_inode which also + * tries to clear gl_object, so it's more than just gfs2_delete_inode. + * + */ +static inline void glock_clear_object(struct gfs2_glock *gl, void *object) +{ + spin_lock(&gl->gl_lockref.lock); + if (gl->gl_object == object) + gl->gl_object = NULL; + spin_unlock(&gl->gl_lockref.lock); +} + #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 5e69636..dac6559 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -329,32 +329,6 @@ static int inode_go_demote_ok(const struct gfs2_glock *gl) return 1; } -/** - * gfs2_set_nlink - Set the inode's link count based on on-disk info - * @inode: The inode in question - * @nlink: The link count - * - * If the link count has hit zero, it must never be raised, whatever the - * on-disk inode might say. When new struct inodes are created the link - * count is set to 1, so that we can safely use this test even when reading - * in on disk information for the first time. - */ - -static void gfs2_set_nlink(struct inode *inode, u32 nlink) -{ - /* - * We will need to review setting the nlink count here in the - * light of the forthcoming ro bind mount work. This is a reminder - * to do that. - */ - if ((inode->i_nlink != nlink) && (inode->i_nlink != 0)) { - if (nlink == 0) - clear_nlink(inode); - else - set_nlink(inode, nlink); - } -} - static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) { const struct gfs2_dinode *str = buf; @@ -376,7 +350,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid)); i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid)); - gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink)); + set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink)); i_size_write(&ip->i_inode, be64_to_cpu(str->di_size)); gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); atime.tv_sec = be64_to_cpu(str->di_atime); @@ -470,7 +444,7 @@ static int inode_go_lock(struct gfs2_holder *gh) (gh->gh_state == LM_ST_EXCLUSIVE)) { spin_lock(&sdp->sd_trunc_lock); if (list_empty(&ip->i_trunc_list)) - list_add(&sdp->sd_trunc_list, &ip->i_trunc_list); + list_add(&ip->i_trunc_list, &sdp->sd_trunc_list); spin_unlock(&sdp->sd_trunc_lock); wake_up(&sdp->sd_quota_wait); return 1; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 73fce76..6e18e97 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -606,6 +606,7 @@ enum { SDF_NOJOURNALID = 6, SDF_RORECOVERY = 7, /* read only recovery */ SDF_SKIP_DLM_UNLOCK = 8, + SDF_FORCE_AIL_FLUSH = 9, }; enum gfs2_freeze_state { @@ -816,6 +817,7 @@ struct gfs2_sbd { atomic_t sd_log_in_flight; struct bio *sd_log_bio; wait_queue_head_t sd_log_flush_wait; + int sd_log_error; atomic_t sd_reserving_log; wait_queue_head_t sd_reserving_log_wait; @@ -831,7 +833,7 @@ struct gfs2_sbd { atomic_t sd_freeze_state; struct mutex sd_freeze_mutex; - char sd_fsname[GFS2_FSNAME_LEN]; + char sd_fsname[GFS2_FSNAME_LEN + 3 * sizeof(int) + 2]; char sd_table_name[GFS2_FSNAME_LEN]; char sd_proto_name[GFS2_FSNAME_LEN]; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index acca501..863749e 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -109,7 +109,7 @@ static void gfs2_set_iop(struct inode *inode) * @no_addr: The inode number * @no_formal_ino: The inode generation number * @blktype: Requested block type (GFS2_BLKST_DINODE or GFS2_BLKST_UNLINKED; - * GFS2_BLKST_FREE do indicate not to verify) + * GFS2_BLKST_FREE to indicate not to verify) * * If @type is DT_UNKNOWN, the inode type is fetched from disk. * @@ -145,7 +145,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (unlikely(error)) goto fail; flush_delayed_work(&ip->i_gl->gl_work); - glock_set_object(ip->i_gl, ip); error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (unlikely(error)) @@ -170,11 +169,11 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, } } + glock_set_object(ip->i_gl, ip); set_bit(GIF_INVALID, &ip->i_flags); error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) goto fail_put; - flush_delayed_work(&ip->i_iopen_gh.gh_gl->gl_work); glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_put(io_gl); io_gl = NULL; @@ -202,14 +201,14 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, fail_refresh: ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - glock_set_object(ip->i_iopen_gh.gh_gl, NULL); + glock_clear_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_put: if (io_gl) gfs2_glock_put(io_gl); + glock_clear_object(ip->i_gl, ip); if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); - glock_set_object(ip->i_gl, NULL); fail: iget_failed(inode); return ERR_PTR(error); @@ -706,8 +705,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); if (error) goto fail_free_inode; - + flush_delayed_work(&ip->i_gl->gl_work); glock_set_object(ip->i_gl, ip); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); if (error) goto fail_free_inode; @@ -775,14 +775,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, return error; fail_gunlock3: + glock_clear_object(io_gl, ip); gfs2_glock_dq_uninit(&ip->i_iopen_gh); gfs2_glock_put(io_gl); fail_gunlock2: if (io_gl) clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); fail_free_inode: - if (ip->i_gl) + if (ip->i_gl) { + glock_clear_object(ip->i_gl, ip); gfs2_glock_put(ip->i_gl); + } gfs2_rsqa_delete(ip, NULL); fail_free_acls: if (default_acl) diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 0515f0a..65f33a0 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -23,8 +23,6 @@ #include "sys.h" #include "trace_gfs2.h" -extern struct workqueue_struct *gfs2_control_wq; - /** * gfs2_update_stats - Update time based stats * @mv: Pointer to mean/variance structure to update @@ -1059,6 +1057,7 @@ static void free_recover_size(struct lm_lockstruct *ls) ls->ls_recover_submit = NULL; ls->ls_recover_result = NULL; ls->ls_recover_size = 0; + ls->ls_lvb_bits = NULL; } /* dlm calls before it does lock recovery */ @@ -1175,7 +1174,7 @@ static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid, spin_unlock(&ls->ls_recover_spin); } -const struct dlm_lockspace_ops gdlm_lockspace_ops = { +static const struct dlm_lockspace_ops gdlm_lockspace_ops = { .recover_prep = gdlm_recover_prep, .recover_slot = gdlm_recover_slot, .recover_done = gdlm_recover_done, diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 9a624f6..f72c442 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -898,6 +898,10 @@ static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) { unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free); + + if (test_and_clear_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags)) + return 1; + return used_blocks + atomic_read(&sdp->sd_log_blks_needed) >= atomic_read(&sdp->sd_log_thresh2); } @@ -919,6 +923,15 @@ int gfs2_logd(void *data) while (!kthread_should_stop()) { + /* Check for errors writing to the journal */ + if (sdp->sd_log_error) { + gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: error %d: " + "withdrawing the file system to " + "prevent further damage.\n", + sdp->sd_fsname, sdp->sd_log_error); + } + did_flush = false; if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { gfs2_ail1_empty(sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 3010f9e..7dabbe7 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -207,8 +207,11 @@ static void gfs2_end_log_write(struct bio *bio) struct page *page; int i; - if (bio->bi_status) - fs_err(sdp, "Error %d writing to log\n", bio->bi_status); + if (bio->bi_status) { + fs_err(sdp, "Error %d writing to journal, jid=%u\n", + bio->bi_status, sdp->sd_jdesc->jd_jid); + wake_up(&sdp->sd_logd_waitq); + } bio_for_each_segment_all(bvec, bio, i) { page = bvec->bv_page; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index fabe1614f..61ef6c9 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -419,8 +419,9 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { brelse(bh); ret = -EIO; + } else { + *bhp = bh; } - *bhp = bh; return ret; } @@ -452,7 +453,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(REQ_OP_READ, REQ_META, 1, &first_bh); + ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &first_bh); dblock++; extlen--; @@ -461,7 +462,9 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) bh = gfs2_getbuf(gl, dblock, CREATE); if (!buffer_uptodate(bh) && !buffer_locked(bh)) - ll_rw_block(REQ_OP_READ, REQ_RAHEAD | REQ_META, 1, &bh); + ll_rw_block(REQ_OP_READ, + REQ_RAHEAD | REQ_META | REQ_PRIO, + 1, &bh); brelse(bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e76058d..c0a4b37 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1113,7 +1113,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent return error; } - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name); + snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s", sdp->sd_table_name); error = gfs2_sys_fs_add(sdp); /* @@ -1159,10 +1159,10 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent } if (sdp->sd_args.ar_spectator) - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", + snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s.s", sdp->sd_table_name); else - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", + snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s.%u", sdp->sd_table_name, sdp->sd_lockstruct.ls_jid); error = init_inodes(sdp, DO); @@ -1388,7 +1388,6 @@ static void gfs2_kill_sb(struct super_block *sb) sdp->sd_root_dir = NULL; sdp->sd_master_dir = NULL; shrink_dcache_sb(sb); - gfs2_delete_debugfs_file(sdp); free_percpu(sdp->sd_lkstats); kill_block_super(sb); } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index c2ca956..e647938 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -730,7 +730,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, if (PageUptodate(page)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { - ll_rw_block(REQ_OP_READ, REQ_META, 1, &bh); + ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) goto unlock_out; @@ -1474,8 +1474,11 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) { if (error == 0 || error == -EROFS) return; - if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) { fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); + sdp->sd_log_error = error; + wake_up(&sdp->sd_logd_waitq); + } } static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg, diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 836e38b..95b2a57 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -705,8 +705,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) rb_erase(n, &sdp->sd_rindex_tree); if (gl) { - glock_set_object(gl, NULL); - gfs2_glock_add_to_lru(gl); + glock_clear_object(gl, rgd); gfs2_glock_put(gl); } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index fdedec3..7698411 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -924,6 +924,7 @@ restart: gfs2_jindex_free(sdp); /* Take apart glock structures and buffer lists */ gfs2_gl_hash_clear(sdp); + gfs2_delete_debugfs_file(sdp); /* Unmount the locking protocol */ gfs2_lm_unmount(sdp); @@ -943,9 +944,9 @@ static int gfs2_sync_fs(struct super_block *sb, int wait) struct gfs2_sbd *sdp = sb->s_fs_info; gfs2_quota_sync(sb, -1); - if (wait && sdp) + if (wait) gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); - return 0; + return sdp->sd_log_error; } void gfs2_freeze_func(struct work_struct *work) @@ -1295,7 +1296,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) * gfs2_drop_inode - Drop an inode (test for remote unlink) * @inode: The inode to drop * - * If we've received a callback on an iopen lock then its because a + * If we've received a callback on an iopen lock then it's because a * remote node tried to deallocate the inode but failed due to this node * still having the inode open. Here we mark the link count zero * since we know that it must have reached zero if the GLF_DEMOTE flag @@ -1317,6 +1318,23 @@ static int gfs2_drop_inode(struct inode *inode) if (test_bit(GLF_DEMOTE, &gl->gl_flags)) clear_nlink(inode); } + + /* + * When under memory pressure when an inode's link count has dropped to + * zero, defer deleting the inode to the delete workqueue. This avoids + * calling into DLM under memory pressure, which can deadlock. + */ + if (!inode->i_nlink && + unlikely(current->flags & PF_MEMALLOC) && + gfs2_holder_initialized(&ip->i_iopen_gh)) { + struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; + + gfs2_glock_hold(gl); + if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + gfs2_glock_queue_put(gl); + return false; + } + return generic_drop_inode(inode); } @@ -1501,6 +1519,22 @@ out_qs: } /** + * gfs2_glock_put_eventually + * @gl: The glock to put + * + * When under memory pressure, trigger a deferred glock put to make sure we + * won't call into DLM and deadlock. Otherwise, put the glock directly. + */ + +static void gfs2_glock_put_eventually(struct gfs2_glock *gl) +{ + if (current->flags & PF_MEMALLOC) + gfs2_glock_queue_put(gl); + else + gfs2_glock_put(gl); +} + +/** * gfs2_evict_inode - Remove an inode from cache * @inode: The inode to evict * @@ -1544,9 +1578,14 @@ static void gfs2_evict_inode(struct inode *inode) goto alloc_failed; } + /* Deletes should never happen under memory pressure anymore. */ + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) + goto out; + /* Must not read inode block until block type has been verified */ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (unlikely(error)) { + glock_clear_object(ip->i_iopen_gh.gh_gl, ip); ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); goto out; @@ -1562,6 +1601,12 @@ static void gfs2_evict_inode(struct inode *inode) goto out_truncate; } + /* + * The inode may have been recreated in the meantime. + */ + if (inode->i_nlink) + goto out_truncate; + alloc_failed: if (gfs2_holder_initialized(&ip->i_iopen_gh) && test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { @@ -1595,6 +1640,11 @@ alloc_failed: goto out_unlock; } + /* We're about to clear the bitmap for the dinode, but as soon as we + do, gfs2_create_inode can create another inode at the same block + location and try to set gl_object again. We clear gl_object here so + that subsequent inode creates don't see an old gl_object. */ + glock_clear_object(ip->i_gl, ip); error = gfs2_dinode_dealloc(ip); goto out_unlock; @@ -1623,14 +1673,17 @@ out_unlock: gfs2_rs_deltree(&ip->i_res); if (gfs2_holder_initialized(&ip->i_iopen_gh)) { + glock_clear_object(ip->i_iopen_gh.gh_gl, ip); if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq(&ip->i_iopen_gh); } gfs2_holder_uninit(&ip->i_iopen_gh); } - if (gfs2_holder_initialized(&gh)) + if (gfs2_holder_initialized(&gh)) { + glock_clear_object(ip->i_gl, ip); gfs2_glock_dq_uninit(&gh); + } if (error && error != GLR_TRYFAILED && error != -EROFS) fs_warn(sdp, "gfs2_evict_inode: %d\n", error); out: @@ -1640,15 +1693,19 @@ out: gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); - glock_set_object(ip->i_gl, NULL); + glock_clear_object(ip->i_gl, ip); wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE); gfs2_glock_add_to_lru(ip->i_gl); - gfs2_glock_put(ip->i_gl); + gfs2_glock_put_eventually(ip->i_gl); ip->i_gl = NULL; if (gfs2_holder_initialized(&ip->i_iopen_gh)) { - glock_set_object(ip->i_iopen_gh.gh_gl, NULL); + struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; + + glock_clear_object(gl, ip); ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_hold(gl); gfs2_glock_dq_uninit(&ip->i_iopen_gh); + gfs2_glock_put_eventually(gl); } } diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index c81295f..3926f95 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -151,6 +151,7 @@ extern struct kmem_cache *gfs2_rgrpd_cachep; extern struct kmem_cache *gfs2_quotad_cachep; extern struct kmem_cache *gfs2_qadata_cachep; extern mempool_t *gfs2_page_pool; +extern struct workqueue_struct *gfs2_control_wq; static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, unsigned int *p) diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 5417955..ea09e41 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -25,6 +25,7 @@ #include "meta_io.h" #include "quota.h" #include "rgrp.h" +#include "super.h" #include "trans.h" #include "util.h" @@ -1209,8 +1210,12 @@ int __gfs2_xattr_set(struct inode *inode, const char *name, if (namel > GFS2_EA_MAX_NAME_LEN) return -ERANGE; - if (value == NULL) - return gfs2_xattr_remove(ip, type, name); + if (value == NULL) { + error = gfs2_xattr_remove(ip, type, name); + if (error == -ENODATA && !(flags & XATTR_REPLACE)) + error = 0; + return error; + } if (ea_check_size(sdp, namel, size)) return -ERANGE; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index bfbba79..2538b49 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -656,7 +656,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, struct super_block * sb; int ret, err; - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(filp, start, end); if (ret) return ret; inode_lock(inode); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index e8638d5..4f26b68 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -283,7 +283,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); int error = 0, error2; - error = filemap_write_and_wait_range(inode->i_mapping, start, end); + error = file_write_and_wait_range(file, start, end); if (error) return error; inode_lock(inode); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index e61261a..c148e7f 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -374,7 +374,7 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end, struct inode *inode = file->f_mapping->host; int ret; - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); if (ret) return ret; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index b3be1b5..f261384 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -24,7 +24,7 @@ int hpfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) struct inode *inode = file->f_mapping->host; int ret; - ret = filemap_write_and_wait_range(file->f_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); if (ret) return ret; return sync_blockdev(inode->i_sb->s_bdev); @@ -637,6 +637,7 @@ again: dispose_list(&dispose); } +EXPORT_SYMBOL_GPL(evict_inodes); /** * invalidate_inodes - attempt to free all inodes on a superblock diff --git a/fs/internal.h b/fs/internal.h index 9676fe1..fedfe94 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -132,7 +132,6 @@ static inline bool atime_needs_update_rcu(const struct path *path, extern void inode_io_list_del(struct inode *inode); extern long get_nr_dirty_inodes(void); -extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *, bool); /* @@ -477,10 +477,10 @@ int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) set_page_dirty(page); wait_for_stable_page(page); - return 0; + return VM_FAULT_LOCKED; out_unlock: unlock_page(page); - return ret; + return block_page_mkwrite_return(ret); } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index c12476e..bd0428b 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -35,7 +35,7 @@ int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync) struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); int ret; - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(filp, start, end); if (ret) return ret; diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 739492c..36665fd 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -34,7 +34,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) struct inode *inode = file->f_mapping->host; int rc = 0; - rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + rc = file_write_and_wait_range(file, start, end); if (rc) return rc; @@ -137,6 +137,7 @@ #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) #define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) #define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) +#define IS_REMOTELCK(fl) (fl->fl_pid <= 0) static inline bool is_remote_lock(struct file *filp) { @@ -270,6 +271,22 @@ locks_check_ctx_lists(struct inode *inode) } } +static void +locks_check_ctx_file_list(struct file *filp, struct list_head *list, + char *list_type) +{ + struct file_lock *fl; + struct inode *inode = locks_inode(filp); + + list_for_each_entry(fl, list, fl_list) + if (fl->fl_file == filp) + pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%lx " + " fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", + list_type, MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), inode->i_ino, + fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid); +} + void locks_free_lock_context(struct inode *inode) { @@ -733,7 +750,6 @@ static void locks_wake_up_blocks(struct file_lock *blocker) static void locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before) { - fl->fl_nspid = get_pid(task_tgid(current)); list_add_tail(&fl->fl_list, before); locks_insert_global_locks(fl); } @@ -743,10 +759,6 @@ locks_unlink_lock_ctx(struct file_lock *fl) { locks_delete_global_locks(fl); list_del_init(&fl->fl_list); - if (fl->fl_nspid) { - put_pid(fl->fl_nspid); - fl->fl_nspid = NULL; - } locks_wake_up_blocks(fl); } @@ -823,8 +835,6 @@ posix_test_lock(struct file *filp, struct file_lock *fl) list_for_each_entry(cfl, &ctx->flc_posix, fl_list) { if (posix_locks_conflict(fl, cfl)) { locks_copy_conflock(fl, cfl); - if (cfl->fl_nspid) - fl->fl_pid = pid_vnr(cfl->fl_nspid); goto out; } } @@ -2048,9 +2058,33 @@ int vfs_test_lock(struct file *filp, struct file_lock *fl) } EXPORT_SYMBOL_GPL(vfs_test_lock); +/** + * locks_translate_pid - translate a file_lock's fl_pid number into a namespace + * @fl: The file_lock who's fl_pid should be translated + * @ns: The namespace into which the pid should be translated + * + * Used to tranlate a fl_pid into a namespace virtual pid number + */ +static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns) +{ + pid_t vnr; + struct pid *pid; + + if (IS_OFDLCK(fl)) + return -1; + if (IS_REMOTELCK(fl)) + return fl->fl_pid; + + rcu_read_lock(); + pid = find_pid_ns(fl->fl_pid, &init_pid_ns); + vnr = pid_nr_ns(pid, ns); + rcu_read_unlock(); + return vnr; +} + static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) { - flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; + flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current)); #if BITS_PER_LONG == 32 /* * Make sure we can represent the posix lock via @@ -2072,7 +2106,7 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) #if BITS_PER_LONG == 32 static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) { - flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; + flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current)); flock->l_start = fl->fl_start; flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1; @@ -2086,14 +2120,17 @@ static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) */ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) { - struct file_lock file_lock; + struct file_lock *fl; int error; + fl = locks_alloc_lock(); + if (fl == NULL) + return -ENOMEM; error = -EINVAL; if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) goto out; - error = flock_to_posix_lock(filp, &file_lock, flock); + error = flock_to_posix_lock(filp, fl, flock); if (error) goto out; @@ -2103,23 +2140,22 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) goto out; cmd = F_GETLK; - file_lock.fl_flags |= FL_OFDLCK; - file_lock.fl_owner = filp; + fl->fl_flags |= FL_OFDLCK; + fl->fl_owner = filp; } - error = vfs_test_lock(filp, &file_lock); + error = vfs_test_lock(filp, fl); if (error) goto out; - flock->l_type = file_lock.fl_type; - if (file_lock.fl_type != F_UNLCK) { - error = posix_lock_to_flock(flock, &file_lock); + flock->l_type = fl->fl_type; + if (fl->fl_type != F_UNLCK) { + error = posix_lock_to_flock(flock, fl); if (error) - goto rel_priv; + goto out; } -rel_priv: - locks_release_private(&file_lock); out: + locks_free_lock(fl); return error; } @@ -2298,14 +2334,18 @@ out: */ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock) { - struct file_lock file_lock; + struct file_lock *fl; int error; + fl = locks_alloc_lock(); + if (fl == NULL) + return -ENOMEM; + error = -EINVAL; if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) goto out; - error = flock64_to_posix_lock(filp, &file_lock, flock); + error = flock64_to_posix_lock(filp, fl, flock); if (error) goto out; @@ -2315,20 +2355,20 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock) goto out; cmd = F_GETLK64; - file_lock.fl_flags |= FL_OFDLCK; - file_lock.fl_owner = filp; + fl->fl_flags |= FL_OFDLCK; + fl->fl_owner = filp; } - error = vfs_test_lock(filp, &file_lock); + error = vfs_test_lock(filp, fl); if (error) goto out; - flock->l_type = file_lock.fl_type; - if (file_lock.fl_type != F_UNLCK) - posix_lock_to_flock64(flock, &file_lock); + flock->l_type = fl->fl_type; + if (fl->fl_type != F_UNLCK) + posix_lock_to_flock64(flock, fl); - locks_release_private(&file_lock); out: + locks_free_lock(fl); return error; } @@ -2525,6 +2565,12 @@ void locks_remove_file(struct file *filp) /* remove any leases */ locks_remove_lease(filp, ctx); + + spin_lock(&ctx->flc_lock); + locks_check_ctx_file_list(filp, &ctx->flc_posix, "POSIX"); + locks_check_ctx_file_list(filp, &ctx->flc_flock, "FLOCK"); + locks_check_ctx_file_list(filp, &ctx->flc_lease, "LEASE"); + spin_unlock(&ctx->flc_lock); } /** @@ -2578,22 +2624,16 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, { struct inode *inode = NULL; unsigned int fl_pid; + struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info; - if (fl->fl_nspid) { - struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info; - - /* Don't let fl_pid change based on who is reading the file */ - fl_pid = pid_nr_ns(fl->fl_nspid, proc_pidns); - - /* - * If there isn't a fl_pid don't display who is waiting on - * the lock if we are called from locks_show, or if we are - * called from __show_fd_info - skip lock entirely - */ - if (fl_pid == 0) - return; - } else - fl_pid = fl->fl_pid; + fl_pid = locks_translate_pid(fl, proc_pidns); + /* + * If there isn't a fl_pid don't display who is waiting on + * the lock if we are called from locks_show, or if we are + * called from __show_fd_info - skip lock entirely + */ + if (fl_pid == 0) + return; if (fl->fl_file != NULL) inode = locks_inode(fl->fl_file); @@ -2668,7 +2708,7 @@ static int locks_show(struct seq_file *f, void *v) fl = hlist_entry(v, struct file_lock, fl_link); - if (fl->fl_nspid && !pid_nr_ns(fl->fl_nspid, proc_pidns)) + if (locks_translate_pid(fl, proc_pidns) == 0) return 0; lock_get_status(f, fl, iter->li_pos, ""); diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 76965e7..a06c076 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -23,7 +23,7 @@ static int ncp_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - return filemap_write_and_wait_range(file->f_mapping, start, end); + return file_write_and_wait_range(file, start, end); } /* diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 0ee19ec..1a24be9 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1506,7 +1506,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); - err = filemap_write_and_wait_range(vi->i_mapping, start, end); + err = file_write_and_wait_range(filp, start, end); if (err) return err; inode_lock(vi); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index c4f68c3..331910f 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1989,7 +1989,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); - err = filemap_write_and_wait_range(vi->i_mapping, start, end); + err = file_write_and_wait_range(filp, start, end); if (err) return err; inode_lock(vi); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index b4512fa..6e41fc8 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -196,7 +196,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(file, start, end); if (err) return err; diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 45aa05e..5b50689 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -34,7 +34,7 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, struct inode *inode = filp->f_mapping->host; int err; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(filp, start, end); if (err) return err; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index b396eb0..843aadc 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -154,7 +154,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, int err; int barrier_done; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(filp, start, end); if (err) return err; @@ -337,7 +337,7 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, mapping = f.file->f_mapping; ret = 0; if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { - ret = filemap_fdatawait_range(mapping, offset, endbyte); + ret = file_fdatawait_range(f.file, offset, endbyte); if (ret < 0) goto out_put; } @@ -350,7 +350,7 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, } if (flags & SYNC_FILE_RANGE_WAIT_AFTER) - ret = filemap_fdatawait_range(mapping, offset, endbyte); + ret = file_fdatawait_range(f.file, offset, endbyte); out_put: fdput(f); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 8cad0b1..f90a466 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1337,7 +1337,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ return 0; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(file, start, end); if (err) return err; inode_lock(inode); diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index de7b9bd..6249c92 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -328,20 +328,19 @@ xfs_attr_set( */ xfs_defer_init(args.dfops, args.firstblock); error = xfs_attr_shortform_to_leaf(&args); - if (!error) - error = xfs_defer_finish(&args.trans, args.dfops, dp); - if (error) { - args.trans = NULL; - xfs_defer_cancel(&dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args.dfops, dp); + error = xfs_defer_finish(&args.trans, args.dfops); + if (error) + goto out_defer_cancel; /* * Commit the leaf transformation. We'll need another (linked) * transaction to add the new attribute to the leaf. */ - error = xfs_trans_roll(&args.trans, dp); + error = xfs_trans_roll_inode(&args.trans, dp); if (error) goto out; @@ -373,6 +372,9 @@ xfs_attr_set( return error; +out_defer_cancel: + xfs_defer_cancel(&dfops); + args.trans = NULL; out: if (args.trans) xfs_trans_cancel(args.trans); @@ -593,19 +595,18 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_node(args); - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - return error; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; /* * Commit the current trans (including the inode) and start * a new one. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) return error; @@ -620,7 +621,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Commit the transaction that added the attr name so that * later routines can manage their own transactions. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) return error; @@ -684,20 +685,18 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) - error = xfs_defer_finish(&args->trans, - args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - return error; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; } /* * Commit the remove and start the next trans in series. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); } else if (args->rmtblkno > 0) { /* @@ -706,6 +705,10 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) error = xfs_attr3_leaf_clearflag(args); } return error; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + return error; } /* @@ -747,15 +750,18 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - return error; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; } return 0; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + return error; } /* @@ -872,20 +878,18 @@ restart: state = NULL; xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_node(args); - if (!error) - error = xfs_defer_finish(&args->trans, - args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; /* * Commit the node conversion and start the next * trans in the chain. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) goto out; @@ -900,13 +904,12 @@ restart: */ xfs_defer_init(args->dfops, args->firstblock); error = xfs_da3_split(state); - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; } else { /* * Addition succeeded, update Btree hashvals. @@ -925,7 +928,7 @@ restart: * Commit the leaf addition or btree split and start the next * trans in the chain. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) goto out; @@ -999,20 +1002,18 @@ restart: if (retval && (state->path.active > 1)) { xfs_defer_init(args->dfops, args->firstblock); error = xfs_da3_join(state); - if (!error) - error = xfs_defer_finish(&args->trans, - args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; } /* * Commit and start the next trans in the chain. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) goto out; @@ -1032,6 +1033,10 @@ out: if (error) return error; return retval; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + goto out; } /* @@ -1122,17 +1127,16 @@ xfs_attr_node_removename(xfs_da_args_t *args) if (retval && (state->path.active > 1)) { xfs_defer_init(args->dfops, args->firstblock); error = xfs_da3_join(state); - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; /* * Commit the Btree join operation and start a new trans. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) goto out; } @@ -1156,14 +1160,12 @@ xfs_attr_node_removename(xfs_da_args_t *args) xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) - error = xfs_defer_finish(&args->trans, - args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; } else xfs_trans_brelse(args->trans, bp); } @@ -1172,6 +1174,10 @@ xfs_attr_node_removename(xfs_da_args_t *args) out: xfs_da_state_free(state); return error; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + goto out; } /* diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index c6c15e5..5c16db8 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -2608,7 +2608,7 @@ xfs_attr3_leaf_clearflag( /* * Commit the flag value change and start the next trans in series. */ - return xfs_trans_roll(&args->trans, args->dp); + return xfs_trans_roll_inode(&args->trans, args->dp); } /* @@ -2659,7 +2659,7 @@ xfs_attr3_leaf_setflag( /* * Commit the flag value change and start the next trans in series. */ - return xfs_trans_roll(&args->trans, args->dp); + return xfs_trans_roll_inode(&args->trans, args->dp); } /* @@ -2777,7 +2777,7 @@ xfs_attr3_leaf_flipflags( /* * Commit the flag value change and start the next trans in series. */ - error = xfs_trans_roll(&args->trans, args->dp); + error = xfs_trans_roll_inode(&args->trans, args->dp); return error; } diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 5236d8e..d56caf0 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -467,13 +467,12 @@ xfs_attr_rmtval_set( error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock, args->total, &map, &nmap, args->dfops); - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - return error; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && @@ -484,7 +483,7 @@ xfs_attr_rmtval_set( /* * Start the next trans in the chain. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) return error; } @@ -539,6 +538,10 @@ xfs_attr_rmtval_set( } ASSERT(valuelen == 0); return 0; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + return error; } /* @@ -609,21 +612,23 @@ xfs_attr_rmtval_remove( error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK, 1, args->firstblock, args->dfops, &done); - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, - args->dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - return error; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, args->dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; /* * Close out trans and start the next one in the chain. */ - error = xfs_trans_roll(&args->trans, args->dp); + error = xfs_trans_roll_inode(&args->trans, args->dp); if (error) return error; } return 0; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c09c16b..459f4b4f0 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -579,7 +579,7 @@ xfs_bmap_validate_ret( #else #define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) -#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) +#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do { } while (0) #endif /* DEBUG */ /* @@ -880,7 +880,7 @@ xfs_bmap_local_to_extents( xfs_ifork_t *ifp; /* inode fork pointer */ xfs_alloc_arg_t args; /* allocation arguments */ xfs_buf_t *bp; /* buffer for extent block */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + struct xfs_bmbt_irec rec; /* * We don't want to deal with the case of keeping inode data inline yet. @@ -943,9 +943,12 @@ xfs_bmap_local_to_extents( xfs_bmap_local_to_extents_empty(ip, whichfork); flags |= XFS_ILOG_CORE; - xfs_iext_add(ifp, 0, 1); - ep = xfs_iext_get_ext(ifp, 0); - xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); + rec.br_startoff = 0; + rec.br_startblock = args.fsbno; + rec.br_blockcount = 1; + rec.br_state = XFS_EXT_NORM; + xfs_iext_insert(ip, 0, 1, &rec, 0); + trace_xfs_bmap_post_update(ip, 0, whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0, _THIS_IP_); @@ -1196,7 +1199,7 @@ xfs_bmap_add_attrfork( xfs_log_sb(tp); } - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto bmap_cancel; error = xfs_trans_commit(tp); @@ -1356,7 +1359,6 @@ xfs_bmap_first_unused( xfs_fileoff_t lastaddr; /* last block number seen */ xfs_fileoff_t lowest; /* lowest useful block */ xfs_fileoff_t max; /* starting useful block */ - xfs_fileoff_t off; /* offset for this block */ xfs_extnum_t nextents; /* number of extent entries */ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || @@ -1373,16 +1375,19 @@ xfs_bmap_first_unused( lowest = *first_unused; nextents = xfs_iext_count(ifp); for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); - off = xfs_bmbt_get_startoff(ep); + struct xfs_bmbt_irec got; + + xfs_iext_get_extent(ifp, idx, &got); + /* * See if the hole before this extent will work. */ - if (off >= lowest + len && off - max >= len) { + if (got.br_startoff >= lowest + len && + got.br_startoff - max >= len) { *first_unused = max; return 0; } - lastaddr = off + xfs_bmbt_get_blockcount(ep); + lastaddr = got.br_startoff + got.br_blockcount; max = XFS_FILEOFF_MAX(lastaddr, lowest); } *first_unused = max; @@ -4918,7 +4923,7 @@ xfs_bmap_del_extent_delay( da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, got->br_blockcount), da_old); got->br_startblock = nullstartblock((int)da_new); - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); break; case BMAP_RIGHT_CONTIG: @@ -4930,7 +4935,7 @@ xfs_bmap_del_extent_delay( da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, got->br_blockcount), da_old); got->br_startblock = nullstartblock((int)da_new); - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); break; case 0: @@ -4956,7 +4961,7 @@ xfs_bmap_del_extent_delay( del->br_blockcount); got->br_startblock = nullstartblock((int)got_indlen); - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, 0, _THIS_IP_); new.br_startoff = del_endoff; @@ -5026,7 +5031,7 @@ xfs_bmap_del_extent_cow( got->br_startoff = del_endoff; got->br_blockcount -= del->br_blockcount; got->br_startblock = del->br_startblock + del->br_blockcount; - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); break; case BMAP_RIGHT_CONTIG: @@ -5035,7 +5040,7 @@ xfs_bmap_del_extent_cow( */ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); got->br_blockcount -= del->br_blockcount; - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); break; case 0: @@ -5044,7 +5049,7 @@ xfs_bmap_del_extent_cow( */ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); got->br_blockcount = del->br_startoff - got->br_startoff; - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); new.br_startoff = del_endoff; @@ -5876,32 +5881,26 @@ xfs_bmse_merge( int whichfork, xfs_fileoff_t shift, /* shift fsb */ int current_ext, /* idx of gotp */ - struct xfs_bmbt_rec_host *gotp, /* extent to shift */ - struct xfs_bmbt_rec_host *leftp, /* preceding extent */ + struct xfs_bmbt_irec *got, /* extent to shift */ + struct xfs_bmbt_irec *left, /* preceding extent */ struct xfs_btree_cur *cur, - int *logflags) /* output */ + int *logflags, /* output */ + struct xfs_defer_ops *dfops) { - struct xfs_bmbt_irec got; - struct xfs_bmbt_irec left; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec new; xfs_filblks_t blockcount; int error, i; struct xfs_mount *mp = ip->i_mount; - xfs_bmbt_get_all(gotp, &got); - xfs_bmbt_get_all(leftp, &left); - blockcount = left.br_blockcount + got.br_blockcount; + blockcount = left->br_blockcount + got->br_blockcount; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(xfs_bmse_can_merge(&left, &got, shift)); + ASSERT(xfs_bmse_can_merge(left, got, shift)); - /* - * Merge the in-core extents. Note that the host record pointers and - * current_ext index are invalid once the extent has been removed via - * xfs_iext_remove(). - */ - xfs_bmbt_set_blockcount(leftp, blockcount); - xfs_iext_remove(ip, current_ext, 1, 0); + new = *left; + new.br_blockcount = blockcount; /* * Update the on-disk extent count, the btree if necessary and log the @@ -5912,12 +5911,12 @@ xfs_bmse_merge( *logflags |= XFS_ILOG_CORE; if (!cur) { *logflags |= XFS_ILOG_DEXT; - return 0; + goto done; } /* lookup and remove the extent to merge */ - error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, - got.br_blockcount, &i); + error = xfs_bmbt_lookup_eq(cur, got->br_startoff, got->br_startblock, + got->br_blockcount, &i); if (error) return error; XFS_WANT_CORRUPTED_RETURN(mp, i == 1); @@ -5928,16 +5927,28 @@ xfs_bmse_merge( XFS_WANT_CORRUPTED_RETURN(mp, i == 1); /* lookup and update size of the previous extent */ - error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, - left.br_blockcount, &i); + error = xfs_bmbt_lookup_eq(cur, left->br_startoff, left->br_startblock, + left->br_blockcount, &i); if (error) return error; XFS_WANT_CORRUPTED_RETURN(mp, i == 1); - left.br_blockcount = blockcount; + error = xfs_bmbt_update(cur, new.br_startoff, new.br_startblock, + new.br_blockcount, new.br_state); + if (error) + return error; + +done: + xfs_iext_update_extent(ifp, current_ext - 1, &new); + xfs_iext_remove(ip, current_ext, 1, 0); - return xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, - left.br_blockcount, left.br_state); + /* update reverse mapping. rmap functions merge the rmaps for us */ + error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, got); + if (error) + return error; + memcpy(&new, got, sizeof(new)); + new.br_startoff = left->br_startoff + left->br_blockcount; + return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &new); } /* @@ -5949,7 +5960,7 @@ xfs_bmse_shift_one( int whichfork, xfs_fileoff_t offset_shift_fsb, int *current_ext, - struct xfs_bmbt_rec_host *gotp, + struct xfs_bmbt_irec *got, struct xfs_btree_cur *cur, int *logflags, enum shift_direction direction, @@ -5958,9 +5969,7 @@ xfs_bmse_shift_one( struct xfs_ifork *ifp; struct xfs_mount *mp; xfs_fileoff_t startoff; - struct xfs_bmbt_rec_host *adj_irecp; - struct xfs_bmbt_irec got; - struct xfs_bmbt_irec adj_irec; + struct xfs_bmbt_irec adj_irec, new; int error; int i; int total_extents; @@ -5969,13 +5978,11 @@ xfs_bmse_shift_one( ifp = XFS_IFORK_PTR(ip, whichfork); total_extents = xfs_iext_count(ifp); - xfs_bmbt_get_all(gotp, &got); - /* delalloc extents should be prevented by caller */ - XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock)); + XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got->br_startblock)); if (direction == SHIFT_LEFT) { - startoff = got.br_startoff - offset_shift_fsb; + startoff = got->br_startoff - offset_shift_fsb; /* * Check for merge if we've got an extent to the left, @@ -5983,46 +5990,39 @@ xfs_bmse_shift_one( * of the file for the shift. */ if (!*current_ext) { - if (got.br_startoff < offset_shift_fsb) + if (got->br_startoff < offset_shift_fsb) return -EINVAL; goto update_current_ext; } + /* - * grab the left extent and check for a large - * enough hole. + * grab the left extent and check for a large enough hole. */ - adj_irecp = xfs_iext_get_ext(ifp, *current_ext - 1); - xfs_bmbt_get_all(adj_irecp, &adj_irec); - - if (startoff < - adj_irec.br_startoff + adj_irec.br_blockcount) + xfs_iext_get_extent(ifp, *current_ext - 1, &adj_irec); + if (startoff < adj_irec.br_startoff + adj_irec.br_blockcount) return -EINVAL; /* check whether to merge the extent or shift it down */ - if (xfs_bmse_can_merge(&adj_irec, &got, - offset_shift_fsb)) { - error = xfs_bmse_merge(ip, whichfork, offset_shift_fsb, - *current_ext, gotp, adj_irecp, - cur, logflags); - if (error) - return error; - adj_irec = got; - goto update_rmap; + if (xfs_bmse_can_merge(&adj_irec, got, offset_shift_fsb)) { + return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, + *current_ext, got, &adj_irec, + cur, logflags, dfops); } } else { - startoff = got.br_startoff + offset_shift_fsb; + startoff = got->br_startoff + offset_shift_fsb; /* nothing to move if this is the last extent */ if (*current_ext >= (total_extents - 1)) goto update_current_ext; + /* * If this is not the last extent in the file, make sure there * is enough room between current extent and next extent for * accommodating the shift. */ - adj_irecp = xfs_iext_get_ext(ifp, *current_ext + 1); - xfs_bmbt_get_all(adj_irecp, &adj_irec); - if (startoff + got.br_blockcount > adj_irec.br_startoff) + xfs_iext_get_extent(ifp, *current_ext + 1, &adj_irec); + if (startoff + got->br_blockcount > adj_irec.br_startoff) return -EINVAL; + /* * Unlike a left shift (which involves a hole punch), * a right shift does not modify extent neighbors @@ -6030,45 +6030,48 @@ xfs_bmse_shift_one( * in this scenario. Check anyways and warn if we * encounter two extents that could be one. */ - if (xfs_bmse_can_merge(&got, &adj_irec, offset_shift_fsb)) + if (xfs_bmse_can_merge(got, &adj_irec, offset_shift_fsb)) WARN_ON_ONCE(1); } + /* * Increment the extent index for the next iteration, update the start * offset of the in-core extent and update the btree if applicable. */ update_current_ext: - if (direction == SHIFT_LEFT) - (*current_ext)++; - else - (*current_ext)--; - xfs_bmbt_set_startoff(gotp, startoff); *logflags |= XFS_ILOG_CORE; - adj_irec = got; - if (!cur) { + + new = *got; + new.br_startoff = startoff; + + if (cur) { + error = xfs_bmbt_lookup_eq(cur, got->br_startoff, + got->br_startblock, got->br_blockcount, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + + error = xfs_bmbt_update(cur, new.br_startoff, + new.br_startblock, new.br_blockcount, + new.br_state); + if (error) + return error; + } else { *logflags |= XFS_ILOG_DEXT; - goto update_rmap; } - error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, - got.br_blockcount, &i); - if (error) - return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + xfs_iext_update_extent(ifp, *current_ext, &new); - got.br_startoff = startoff; - error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, - got.br_blockcount, got.br_state); - if (error) - return error; + if (direction == SHIFT_LEFT) + (*current_ext)++; + else + (*current_ext)--; -update_rmap: /* update reverse mapping */ - error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, &adj_irec); + error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, got); if (error) return error; - adj_irec.br_startoff = startoff; - return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &adj_irec); + return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &new); } /* @@ -6095,7 +6098,6 @@ xfs_bmap_shift_extents( int num_exts) { struct xfs_btree_cur *cur = NULL; - struct xfs_bmbt_rec_host *gotp; struct xfs_bmbt_irec got; struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; @@ -6122,7 +6124,6 @@ xfs_bmap_shift_extents( ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); - ASSERT(*next_fsb != NULLFSBLOCK || direction == SHIFT_RIGHT); ifp = XFS_IFORK_PTR(ip, whichfork); if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -6154,10 +6155,26 @@ xfs_bmap_shift_extents( * In case of first right shift, we need to initialize next_fsb */ if (*next_fsb == NULLFSBLOCK) { - gotp = xfs_iext_get_ext(ifp, total_extents - 1); - xfs_bmbt_get_all(gotp, &got); + ASSERT(direction == SHIFT_RIGHT); + + current_ext = total_extents - 1; + xfs_iext_get_extent(ifp, current_ext, &got); + if (stop_fsb > got.br_startoff) { + *done = 1; + goto del_cursor; + } *next_fsb = got.br_startoff; - if (stop_fsb > *next_fsb) { + } else { + /* + * Look up the extent index for the fsb where we start shifting. We can + * henceforth iterate with current_ext as extent list changes are locked + * out via ilock. + * + * If next_fsb lies in a hole beyond which there are no extents we are + * done. + */ + if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, ¤t_ext, + &got)) { *done = 1; goto del_cursor; } @@ -6165,37 +6182,26 @@ xfs_bmap_shift_extents( /* Lookup the extent index at which we have to stop */ if (direction == SHIFT_RIGHT) { - gotp = xfs_iext_bno_to_ext(ifp, stop_fsb, &stop_extent); + struct xfs_bmbt_irec s; + + xfs_iext_lookup_extent(ip, ifp, stop_fsb, &stop_extent, &s); /* Make stop_extent exclusive of shift range */ stop_extent--; - } else + if (current_ext <= stop_extent) { + error = -EIO; + goto del_cursor; + } + } else { stop_extent = total_extents; - - /* - * Look up the extent index for the fsb where we start shifting. We can - * henceforth iterate with current_ext as extent list changes are locked - * out via ilock. - * - * gotp can be null in 2 cases: 1) if there are no extents or 2) - * *next_fsb lies in a hole beyond which there are no extents. Either - * way, we are done. - */ - gotp = xfs_iext_bno_to_ext(ifp, *next_fsb, ¤t_ext); - if (!gotp) { - *done = 1; - goto del_cursor; - } - - /* some sanity checking before we finally start shifting extents */ - if ((direction == SHIFT_LEFT && current_ext >= stop_extent) || - (direction == SHIFT_RIGHT && current_ext <= stop_extent)) { - error = -EIO; - goto del_cursor; + if (current_ext >= stop_extent) { + error = -EIO; + goto del_cursor; + } } while (nexts++ < num_exts) { error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb, - ¤t_ext, gotp, cur, &logflags, + ¤t_ext, &got, cur, &logflags, direction, dfops); if (error) goto del_cursor; @@ -6213,13 +6219,11 @@ xfs_bmap_shift_extents( *next_fsb = NULLFSBLOCK; break; } - gotp = xfs_iext_get_ext(ifp, current_ext); + xfs_iext_get_extent(ifp, current_ext, &got); } - if (!*done) { - xfs_bmbt_get_all(gotp, &got); + if (!*done) *next_fsb = got.br_startoff; - } del_cursor: if (cur) @@ -6248,7 +6252,6 @@ xfs_bmap_split_extent_at( { int whichfork = XFS_DATA_FORK; struct xfs_btree_cur *cur = NULL; - struct xfs_bmbt_rec_host *gotp; struct xfs_bmbt_irec got; struct xfs_bmbt_irec new; /* split extent */ struct xfs_mount *mp = ip->i_mount; @@ -6280,21 +6283,10 @@ xfs_bmap_split_extent_at( } /* - * gotp can be null in 2 cases: 1) if there are no extents - * or 2) split_fsb lies in a hole beyond which there are - * no extents. Either way, we are done. - */ - gotp = xfs_iext_bno_to_ext(ifp, split_fsb, ¤t_ext); - if (!gotp) - return 0; - - xfs_bmbt_get_all(gotp, &got); - - /* - * Check split_fsb lies in a hole or the start boundary offset - * of the extent. + * If there are not extents, or split_fsb lies in a hole we are done. */ - if (got.br_startoff >= split_fsb) + if (!xfs_iext_lookup_extent(ip, ifp, split_fsb, ¤t_ext, &got) || + got.br_startoff >= split_fsb) return 0; gotblkcnt = split_fsb - got.br_startoff; @@ -6317,8 +6309,8 @@ xfs_bmap_split_extent_at( XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); } - xfs_bmbt_set_blockcount(gotp, gotblkcnt); got.br_blockcount = gotblkcnt; + xfs_iext_update_extent(ifp, current_ext, &got); logflags = XFS_ILOG_CORE; if (cur) { @@ -6402,7 +6394,7 @@ xfs_bmap_split_extent( if (error) goto out; - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out; @@ -6452,7 +6444,7 @@ __xfs_bmap_add( bi->bi_whichfork = whichfork; bi->bi_bmap = *bmap; - error = xfs_defer_join(dfops, bi->bi_owner); + error = xfs_defer_ijoin(dfops, bi->bi_owner); if (error) { kmem_free(bi); return error; diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 85de225..a6331ff 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -858,6 +858,7 @@ xfs_bmbt_change_owner( cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); if (!cur) return -ENOMEM; + cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER; error = xfs_btree_change_owner(cur, new_owner, buffer_list); xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index e0bcc4a..5bfb882 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1791,6 +1791,7 @@ xfs_btree_lookup_get_block( /* Check the inode owner since the verifiers don't. */ if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && + !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) && (cur->bc_flags & XFS_BTREE_LONG_PTRS) && be64_to_cpu((*blkp)->bb_u.l.bb_owner) != cur->bc_private.b.ip->i_ino) @@ -4451,10 +4452,15 @@ xfs_btree_block_change_owner( /* modify the owner */ block = xfs_btree_get_block(cur, level, &bp); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner)) + return 0; block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); - else + } else { + if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner)) + return 0; block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner); + } /* * If the block is a root block hosted in an inode, we might not have a @@ -4463,16 +4469,19 @@ xfs_btree_block_change_owner( * block is formatted into the on-disk inode fork. We still change it, * though, so everything is consistent in memory. */ - if (bp) { - if (cur->bc_tp) { - xfs_trans_ordered_buf(cur->bc_tp, bp); + if (!bp) { + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(level == cur->bc_nlevels - 1); + return 0; + } + + if (cur->bc_tp) { + if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) { xfs_btree_log_block(cur, bp, XFS_BB_OWNER); - } else { - xfs_buf_delwri_queue(bp, bbcoi->buffer_list); + return -EAGAIN; } } else { - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); - ASSERT(level == cur->bc_nlevels - 1); + xfs_buf_delwri_queue(bp, bbcoi->buffer_list); } return 0; diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 9c95e96..f2a88c3 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -233,7 +233,8 @@ typedef struct xfs_btree_cur short forksize; /* fork's inode space */ char whichfork; /* data or attr fork */ char flags; /* flags */ -#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ +#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */ +#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */ } b; } bc_private; /* per-btree type data */ } xfs_btree_cur_t; diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 5c2929f..072ebfe 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -240,23 +240,19 @@ xfs_defer_trans_abort( STATIC int xfs_defer_trans_roll( struct xfs_trans **tp, - struct xfs_defer_ops *dop, - struct xfs_inode *ip) + struct xfs_defer_ops *dop) { int i; int error; - /* Log all the joined inodes except the one we passed in. */ - for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) { - if (dop->dop_inodes[i] == ip) - continue; + /* Log all the joined inodes. */ + for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE); - } trace_xfs_defer_trans_roll((*tp)->t_mountp, dop); /* Roll the transaction. */ - error = xfs_trans_roll(tp, ip); + error = xfs_trans_roll(tp); if (error) { trace_xfs_defer_trans_roll_error((*tp)->t_mountp, dop, error); xfs_defer_trans_abort(*tp, dop, error); @@ -264,12 +260,9 @@ xfs_defer_trans_roll( } dop->dop_committed = true; - /* Rejoin the joined inodes except the one we passed in. */ - for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) { - if (dop->dop_inodes[i] == ip) - continue; + /* Rejoin the joined inodes. */ + for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0); - } return error; } @@ -284,11 +277,10 @@ xfs_defer_has_unfinished_work( /* * Add this inode to the deferred op. Each joined inode is relogged - * each time we roll the transaction, in addition to any inode passed - * to xfs_defer_finish(). + * each time we roll the transaction. */ int -xfs_defer_join( +xfs_defer_ijoin( struct xfs_defer_ops *dop, struct xfs_inode *ip) { @@ -317,8 +309,7 @@ xfs_defer_join( int xfs_defer_finish( struct xfs_trans **tp, - struct xfs_defer_ops *dop, - struct xfs_inode *ip) + struct xfs_defer_ops *dop) { struct xfs_defer_pending *dfp; struct list_head *li; @@ -337,7 +328,7 @@ xfs_defer_finish( xfs_defer_intake_work(*tp, dop); /* Roll the transaction. */ - error = xfs_defer_trans_roll(tp, dop, ip); + error = xfs_defer_trans_roll(tp, dop); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index f6e93ef..d4f046d 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -72,12 +72,11 @@ struct xfs_defer_ops { void xfs_defer_add(struct xfs_defer_ops *dop, enum xfs_defer_ops_type type, struct list_head *h); -int xfs_defer_finish(struct xfs_trans **tp, struct xfs_defer_ops *dop, - struct xfs_inode *ip); +int xfs_defer_finish(struct xfs_trans **tp, struct xfs_defer_ops *dop); void xfs_defer_cancel(struct xfs_defer_ops *dop); void xfs_defer_init(struct xfs_defer_ops *dop, xfs_fsblock_t *fbp); bool xfs_defer_has_unfinished_work(struct xfs_defer_ops *dop); -int xfs_defer_join(struct xfs_defer_ops *dop, struct xfs_inode *ip); +int xfs_defer_ijoin(struct xfs_defer_ops *dop, struct xfs_inode *ip); /* Description of a deferred type. */ struct xfs_defer_op_type { diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index abf5bea..988bb3f 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -378,8 +378,6 @@ xfs_ialloc_inode_init( * transaction and pin the log appropriately. */ xfs_trans_ordered_buf(tp, fbuf); - xfs_trans_log_buf(tp, fbuf, 0, - BBTOB(fbuf->b_length) - 1); } } else { fbuf->b_flags |= XBF_DONE; @@ -1133,6 +1131,7 @@ xfs_dialloc_ag_inobt( int error; int offset; int i, j; + int searchdistance = 10; pag = xfs_perag_get(mp, agno); @@ -1159,7 +1158,6 @@ xfs_dialloc_ag_inobt( if (pagno == agno) { int doneleft; /* done, to the left */ int doneright; /* done, to the right */ - int searchdistance = 10; error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); if (error) @@ -1220,21 +1218,9 @@ xfs_dialloc_ag_inobt( /* * Loop until we find an inode chunk with a free inode. */ - while (!doneleft || !doneright) { + while (--searchdistance > 0 && (!doneleft || !doneright)) { int useleft; /* using left inode chunk this time */ - if (!--searchdistance) { - /* - * Not in range - save last search - * location and allocate a new inode - */ - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - pag->pagl_leftrec = trec.ir_startino; - pag->pagl_rightrec = rec.ir_startino; - pag->pagl_pagino = pagino; - goto newino; - } - /* figure out the closer block if both are valid. */ if (!doneleft && !doneright) { useleft = pagino - @@ -1278,26 +1264,37 @@ xfs_dialloc_ag_inobt( goto error1; } - /* - * We've reached the end of the btree. because - * we are only searching a small chunk of the - * btree each search, there is obviously free - * inodes closer to the parent inode than we - * are now. restart the search again. - */ - pag->pagl_pagino = NULLAGINO; - pag->pagl_leftrec = NULLAGINO; - pag->pagl_rightrec = NULLAGINO; - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); - goto restart_pagno; + if (searchdistance <= 0) { + /* + * Not in range - save last search + * location and allocate a new inode + */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + + } else { + /* + * We've reached the end of the btree. because + * we are only searching a small chunk of the + * btree each search, there is obviously free + * inodes closer to the parent inode than we + * are now. restart the search again. + */ + pag->pagl_pagino = NULLAGINO; + pag->pagl_leftrec = NULLAGINO; + pag->pagl_rightrec = NULLAGINO; + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + goto restart_pagno; + } } /* * In a different AG from the parent. * See if the most recently allocated block has any free. */ -newino: if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), XFS_LOOKUP_EQ, &i); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 0e80f34..31840ca 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -1499,14 +1499,11 @@ xfs_iext_realloc_indirect( xfs_ifork_t *ifp, /* inode fork pointer */ int new_size) /* new indirection array size */ { - int nlists; /* number of irec's (ex lists) */ - int size; /* current indirection array size */ - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - size = nlists * sizeof(xfs_ext_irec_t); ASSERT(ifp->if_real_bytes); - ASSERT((new_size >= 0) && (new_size != size)); + ASSERT((new_size >= 0) && + (new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) * + sizeof(xfs_ext_irec_t)))); if (new_size == 0) { xfs_iext_destroy(ifp); } else { @@ -2023,3 +2020,15 @@ xfs_iext_get_extent( xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), gotp); return true; } + +void +xfs_iext_update_extent( + struct xfs_ifork *ifp, + xfs_extnum_t idx, + struct xfs_bmbt_irec *gotp) +{ + ASSERT(idx >= 0); + ASSERT(idx < xfs_iext_count(ifp)); + + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, idx), gotp); +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 7fb8365..11af705 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -187,6 +187,8 @@ bool xfs_iext_lookup_extent(struct xfs_inode *ip, xfs_extnum_t *idxp, struct xfs_bmbt_irec *gotp); bool xfs_iext_get_extent(struct xfs_ifork *ifp, xfs_extnum_t idx, struct xfs_bmbt_irec *gotp); +void xfs_iext_update_extent(struct xfs_ifork *ifp, xfs_extnum_t idx, + struct xfs_bmbt_irec *gotp); extern struct kmem_zone *xfs_ifork_zone; diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 45b1c3b..9d5406b 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1679,7 +1679,7 @@ xfs_refcount_recover_cow_leftovers( xfs_bmap_add_free(mp, &dfops, fsb, rr->rr_rrec.rc_blockcount, NULL); - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_defer; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6bf120b..f9efd67 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -85,11 +85,11 @@ xfs_find_bdev_for_inode( * associated buffer_heads, paying attention to the start and end offsets that * we need to process on the page. * - * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last - * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or - * the page at all, as we may be racing with memory reclaim and it can free both - * the bufferhead chain and the page as it will see the page as clean and - * unused. + * Note that we open code the action in end_buffer_async_write here so that we + * only have to iterate over the buffers attached to the page once. This is not + * only more efficient, but also ensures that we only calls end_page_writeback + * at the end of the iteration, and thus avoids the pitfall of having the page + * and buffers potentially freed after every call to end_buffer_async_write. */ static void xfs_finish_page_writeback( @@ -97,29 +97,44 @@ xfs_finish_page_writeback( struct bio_vec *bvec, int error) { - unsigned int end = bvec->bv_offset + bvec->bv_len - 1; - struct buffer_head *head, *bh, *next; + struct buffer_head *head = page_buffers(bvec->bv_page), *bh = head; + bool busy = false; unsigned int off = 0; - unsigned int bsize; + unsigned long flags; ASSERT(bvec->bv_offset < PAGE_SIZE); ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0); - ASSERT(end < PAGE_SIZE); + ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE); ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0); - bh = head = page_buffers(bvec->bv_page); - - bsize = bh->b_size; + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &head->b_state); do { - if (off > end) - break; - next = bh->b_this_page; - if (off < bvec->bv_offset) - goto next_bh; - bh->b_end_io(bh, !error); -next_bh: - off += bsize; - } while ((bh = next) != head); + if (off >= bvec->bv_offset && + off < bvec->bv_offset + bvec->bv_len) { + ASSERT(buffer_async_write(bh)); + ASSERT(bh->b_end_io == NULL); + + if (error) { + mark_buffer_write_io_error(bh); + clear_buffer_uptodate(bh); + SetPageError(bvec->bv_page); + } else { + set_buffer_uptodate(bh); + } + clear_buffer_async_write(bh); + unlock_buffer(bh); + } else if (buffer_async_write(bh)) { + ASSERT(buffer_locked(bh)); + busy = true; + } + off += bh->b_size; + } while ((bh = bh->b_this_page) != head); + bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); + local_irq_restore(flags); + + if (!busy) + end_page_writeback(bvec->bv_page); } /* @@ -133,8 +148,10 @@ xfs_destroy_ioend( int error) { struct inode *inode = ioend->io_inode; - struct bio *last = ioend->io_bio; - struct bio *bio, *next; + struct bio *bio = &ioend->io_inline_bio; + struct bio *last = ioend->io_bio, *next; + u64 start = bio->bi_iter.bi_sector; + bool quiet = bio_flagged(bio, BIO_QUIET); for (bio = &ioend->io_inline_bio; bio; bio = next) { struct bio_vec *bvec; @@ -155,6 +172,11 @@ xfs_destroy_ioend( bio_put(bio); } + + if (unlikely(error && !quiet)) { + xfs_err_ratelimited(XFS_I(inode)->i_mount, + "writeback error on sector %llu", start); + } } /* @@ -423,7 +445,8 @@ xfs_start_buffer_writeback( ASSERT(!buffer_delay(bh)); ASSERT(!buffer_unwritten(bh)); - mark_buffer_async_write(bh); + bh->b_end_io = NULL; + set_buffer_async_write(bh); set_buffer_uptodate(bh); clear_buffer_dirty(bh); } diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index be0b79d..ebd66b1 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -97,7 +97,7 @@ xfs_attr3_leaf_freextent( /* * Roll to next transaction. */ - error = xfs_trans_roll(trans, dp); + error = xfs_trans_roll_inode(trans, dp); if (error) return error; } @@ -308,7 +308,7 @@ xfs_attr3_node_inactive( /* * Atomically commit the whole invalidate stuff. */ - error = xfs_trans_roll(trans, dp); + error = xfs_trans_roll_inode(trans, dp); if (error) return error; } @@ -375,7 +375,7 @@ xfs_attr3_root_inactive( /* * Commit the invalidate and start the next transaction. */ - error = xfs_trans_roll(trans, dp); + error = xfs_trans_roll_inode(trans, dp); return error; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 8807391..dd136f7 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -502,7 +502,7 @@ xfs_bui_recover( } /* Finish transaction, free inodes. */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto err_dfops; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 93e9552..cd9a540 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -222,22 +222,21 @@ xfs_bmap_eof( * Count leaf blocks given a range of extent records. Delayed allocation * extents are not counted towards the totals. */ -STATIC void +xfs_extnum_t xfs_bmap_count_leaves( struct xfs_ifork *ifp, - xfs_extnum_t *numrecs, xfs_filblks_t *count) { - xfs_extnum_t i; - xfs_extnum_t nr_exts = xfs_iext_count(ifp); - - for (i = 0; i < nr_exts; i++) { - xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, i); - if (!isnullstartblock(xfs_bmbt_get_startblock(frp))) { - (*numrecs)++; - *count += xfs_bmbt_get_blockcount(frp); + struct xfs_bmbt_irec got; + xfs_extnum_t numrecs = 0, i = 0; + + while (xfs_iext_get_extent(ifp, i++, &got)) { + if (!isnullstartblock(got.br_startblock)) { + *count += got.br_blockcount; + numrecs++; } } + return numrecs; } /* @@ -370,7 +369,7 @@ xfs_bmap_count_blocks( switch (XFS_IFORK_FORMAT(ip, whichfork)) { case XFS_DINODE_FMT_EXTENTS: - xfs_bmap_count_leaves(ifp, nextents, count); + *nextents = xfs_bmap_count_leaves(ifp, count); return 0; case XFS_DINODE_FMT_BTREE: if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -1136,7 +1135,7 @@ xfs_alloc_file_space( /* * Complete the transaction */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto error0; @@ -1202,7 +1201,8 @@ xfs_unmap_extent( if (error) goto out_bmap_cancel; - error = xfs_defer_finish(&tp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -1496,7 +1496,7 @@ xfs_shift_file_space( if (error) goto out_bmap_cancel; - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -1777,7 +1777,8 @@ xfs_swap_extent_rmap( if (error) goto out_defer; - error = xfs_defer_finish(tpp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(tpp, &dfops); if (error) goto out_defer; @@ -1840,29 +1841,18 @@ xfs_swap_extent_forks( } /* - * Before we've swapped the forks, lets set the owners of the forks - * appropriately. We have to do this as we are demand paging the btree - * buffers, and so the validation done on read will expect the owner - * field to be correctly set. Once we change the owners, we can swap the - * inode forks. + * Btree format (v3) inodes have the inode number stamped in the bmbt + * block headers. We can't start changing the bmbt blocks until the + * inode owner change is logged so recovery does the right thing in the + * event of a crash. Set the owner change log flags now and leave the + * bmbt scan as the last step. */ if (ip->i_d.di_version == 3 && - ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + ip->i_d.di_format == XFS_DINODE_FMT_BTREE) (*target_log_flags) |= XFS_ILOG_DOWNER; - error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, - tip->i_ino, NULL); - if (error) - return error; - } - if (tip->i_d.di_version == 3 && - tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) (*src_log_flags) |= XFS_ILOG_DOWNER; - error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, - ip->i_ino, NULL); - if (error) - return error; - } /* * Swap the data forks of the inodes @@ -1940,6 +1930,48 @@ xfs_swap_extent_forks( return 0; } +/* + * Fix up the owners of the bmbt blocks to refer to the current inode. The + * change owner scan attempts to order all modified buffers in the current + * transaction. In the event of ordered buffer failure, the offending buffer is + * physically logged as a fallback and the scan returns -EAGAIN. We must roll + * the transaction in this case to replenish the fallback log reservation and + * restart the scan. This process repeats until the scan completes. + */ +static int +xfs_swap_change_owner( + struct xfs_trans **tpp, + struct xfs_inode *ip, + struct xfs_inode *tmpip) +{ + int error; + struct xfs_trans *tp = *tpp; + + do { + error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino, + NULL); + /* success or fatal error */ + if (error != -EAGAIN) + break; + + error = xfs_trans_roll(tpp); + if (error) + break; + tp = *tpp; + + /* + * Redirty both inodes so they can relog and keep the log tail + * moving forward. + */ + xfs_trans_ijoin(tp, ip, 0); + xfs_trans_ijoin(tp, tmpip, 0); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE); + } while (true); + + return error; +} + int xfs_swap_extents( struct xfs_inode *ip, /* target inode */ @@ -1954,7 +1986,7 @@ xfs_swap_extents( int lock_flags; struct xfs_ifork *cowfp; uint64_t f; - int resblks; + int resblks = 0; /* * Lock the inodes against other IO, page faults and truncate to @@ -2002,11 +2034,8 @@ xfs_swap_extents( XFS_SWAP_RMAP_SPACE_RES(mp, XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK), XFS_DATA_FORK); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, - 0, 0, &tp); - } else - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, - 0, 0, &tp); + } + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); if (error) goto out_unlock; @@ -2092,6 +2121,23 @@ xfs_swap_extents( xfs_trans_log_inode(tp, tip, target_log_flags); /* + * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems + * have inode number owner values in the bmbt blocks that still refer to + * the old inode. Scan each bmbt to fix up the owner values with the + * inode number of the current inode. + */ + if (src_log_flags & XFS_ILOG_DOWNER) { + error = xfs_swap_change_owner(&tp, ip, tip); + if (error) + goto out_trans_cancel; + } + if (target_log_flags & XFS_ILOG_DOWNER) { + error = xfs_swap_change_owner(&tp, tip, ip); + if (error) + goto out_trans_cancel; + } + + /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 0cede10..0eaa81d 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -70,6 +70,7 @@ int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb); +xfs_extnum_t xfs_bmap_count_leaves(struct xfs_ifork *ifp, xfs_filblks_t *count); int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, xfs_extnum_t *nextents, xfs_filblks_t *count); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index f6a8422..e0a0af0 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -29,6 +29,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_inode.h" kmem_zone_t *xfs_buf_item_zone; @@ -322,6 +323,8 @@ xfs_buf_item_format( ASSERT((bip->bli_flags & XFS_BLI_STALE) || (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF)); + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) || + (bip->bli_flags & XFS_BLI_STALE)); /* @@ -346,16 +349,6 @@ xfs_buf_item_format( bip->bli_flags &= ~XFS_BLI_INODE_BUF; } - if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) == - XFS_BLI_ORDERED) { - /* - * The buffer has been logged just to order it. It is not being - * included in the transaction commit, so don't format it. - */ - trace_xfs_buf_item_format_ordered(bip); - return; - } - for (i = 0; i < bip->bli_format_count; i++) { xfs_buf_item_format_segment(bip, lv, &vecp, offset, &bip->bli_formats[i]); @@ -574,26 +567,20 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - bool clean; - bool aborted; - int flags; + bool aborted = !!(lip->li_flags & XFS_LI_ABORTED); + bool hold = !!(bip->bli_flags & XFS_BLI_HOLD); + bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY); +#if defined(DEBUG) || defined(XFS_WARN) + bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED); +#endif /* Clear the buffer's association with this transaction. */ bp->b_transp = NULL; /* - * If this is a transaction abort, don't return early. Instead, allow - * the brelse to happen. Normally it would be done for stale - * (cancelled) buffers at unpin time, but we'll never go through the - * pin/unpin cycle if we abort inside commit. + * The per-transaction state has been copied above so clear it from the + * bli. */ - aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false; - /* - * Before possibly freeing the buf item, copy the per-transaction state - * so we can reference it safely later after clearing it from the - * buffer log item. - */ - flags = bip->bli_flags; bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); /* @@ -601,7 +588,7 @@ xfs_buf_item_unlock( * unlock the buffer and free the buf item when the buffer is unpinned * for the last time. */ - if (flags & XFS_BLI_STALE) { + if (bip->bli_flags & XFS_BLI_STALE) { trace_xfs_buf_item_unlock_stale(bip); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); if (!aborted) { @@ -619,20 +606,11 @@ xfs_buf_item_unlock( * regardless of whether it is dirty or not. A dirty abort implies a * shutdown, anyway. * - * Ordered buffers are dirty but may have no recorded changes, so ensure - * we only release clean items here. + * The bli dirty state should match whether the blf has logged segments + * except for ordered buffers, where only the bli should be dirty. */ - clean = (flags & XFS_BLI_DIRTY) ? false : true; - if (clean) { - int i; - for (i = 0; i < bip->bli_format_count; i++) { - if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, - bip->bli_formats[i].blf_map_size)) { - clean = false; - break; - } - } - } + ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) || + (ordered && dirty && !xfs_buf_item_dirty_format(bip))); /* * Clean buffers, by definition, cannot be in the AIL. However, aborted @@ -651,11 +629,11 @@ xfs_buf_item_unlock( ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); - } else if (clean) + } else if (!dirty) xfs_buf_item_relse(bp); } - if (!(flags & XFS_BLI_HOLD)) + if (!hold) xfs_buf_relse(bp); } @@ -945,14 +923,22 @@ xfs_buf_item_log( /* - * Return 1 if the buffer has been logged or ordered in a transaction (at any - * point, not just the current transaction) and 0 if not. + * Return true if the buffer has any ranges logged/dirtied by a transaction, + * false otherwise. */ -uint -xfs_buf_item_dirty( - xfs_buf_log_item_t *bip) +bool +xfs_buf_item_dirty_format( + struct xfs_buf_log_item *bip) { - return (bip->bli_flags & XFS_BLI_DIRTY); + int i; + + for (i = 0; i < bip->bli_format_count; i++) { + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, + bip->bli_formats[i].blf_map_size)) + return true; + } + + return false; } STATIC void @@ -1054,6 +1040,31 @@ xfs_buf_do_callbacks( } } +/* + * Invoke the error state callback for each log item affected by the failed I/O. + * + * If a metadata buffer write fails with a non-permanent error, the buffer is + * eventually resubmitted and so the completion callbacks are not run. The error + * state may need to be propagated to the log items attached to the buffer, + * however, so the next AIL push of the item knows hot to handle it correctly. + */ +STATIC void +xfs_buf_do_callbacks_fail( + struct xfs_buf *bp) +{ + struct xfs_log_item *next; + struct xfs_log_item *lip = bp->b_fspriv; + struct xfs_ail *ailp = lip->li_ailp; + + spin_lock(&ailp->xa_lock); + for (; lip; lip = next) { + next = lip->li_bio_list; + if (lip->li_ops->iop_error) + lip->li_ops->iop_error(lip, bp); + } + spin_unlock(&ailp->xa_lock); +} + static bool xfs_buf_iodone_callback_error( struct xfs_buf *bp) @@ -1123,7 +1134,11 @@ xfs_buf_iodone_callback_error( if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) goto permanent_error; - /* still a transient error, higher layers will retry */ + /* + * Still a transient error, run IO completion failure callbacks and let + * the higher layers retry the buffer. + */ + xfs_buf_do_callbacks_fail(bp); xfs_buf_ioerror(bp, 0); xfs_buf_relse(bp); return true; @@ -1204,3 +1219,31 @@ xfs_buf_iodone( xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); xfs_buf_item_free(BUF_ITEM(lip)); } + +/* + * Requeue a failed buffer for writeback + * + * Return true if the buffer has been re-queued properly, false otherwise + */ +bool +xfs_buf_resubmit_failed_buffers( + struct xfs_buf *bp, + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + struct xfs_log_item *next; + + /* + * Clear XFS_LI_FAILED flag from all items before resubmit + * + * XFS_LI_FAILED set/clear is protected by xa_lock, caller this + * function already have it acquired + */ + for (; lip; lip = next) { + next = lip->li_bio_list; + xfs_clear_li_failed(lip); + } + + /* Add this buffer back to the delayed write list */ + return xfs_buf_delwri_queue(bp, buffer_list); +} diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index f7eba99..9690ce6 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -64,12 +64,15 @@ typedef struct xfs_buf_log_item { int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); -uint xfs_buf_item_dirty(xfs_buf_log_item_t *); +bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_attach_iodone(struct xfs_buf *, void(*)(struct xfs_buf *, xfs_log_item_t *), xfs_log_item_t *); void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); +bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, + struct xfs_log_item *, + struct list_head *); extern kmem_zone_t *xfs_buf_item_zone; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index fd2ef8c..cd82429 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -383,7 +383,7 @@ xfs_qm_dqalloc( xfs_trans_bhold(tp, bp); - error = xfs_defer_finish(tpp, &dfops, NULL); + error = xfs_defer_finish(tpp, &dfops); if (error) goto error1; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 2f4feb9..bd786a9 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -57,6 +57,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_AG_RESV_CRITICAL, XFS_RANDOM_DROP_WRITES, XFS_RANDOM_LOG_BAD_CRC, + XFS_RANDOM_LOG_ITEM_PIN, }; struct xfs_errortag_attr { @@ -161,6 +162,7 @@ XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES); XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); +XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -193,6 +195,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), XFS_ERRORTAG_ATTR_LIST(drop_writes), XFS_ERRORTAG_ATTR_LIST(log_bad_crc), + XFS_ERRORTAG_ATTR_LIST(log_item_pin), NULL, }; diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 7577be5..7c4bef3 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -106,7 +106,8 @@ extern void xfs_verifier_error(struct xfs_buf *bp); */ #define XFS_ERRTAG_DROP_WRITES 28 #define XFS_ERRTAG_LOG_BAD_CRC 29 -#define XFS_ERRTAG_MAX 30 +#define XFS_ERRTAG_LOG_ITEM_PIN 30 +#define XFS_ERRTAG_MAX 31 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -141,6 +142,7 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_RANDOM_AG_RESV_CRITICAL 4 #define XFS_RANDOM_DROP_WRITES 1 #define XFS_RANDOM_LOG_BAD_CRC 1 +#define XFS_RANDOM_LOG_ITEM_PIN 1 #ifdef DEBUG extern int xfs_errortag_init(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 62db8ff..ec3e44f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1011,96 +1011,67 @@ xfs_file_llseek( * page_lock (MM) * i_lock (XFS - extent map serialisation) */ - -/* - * mmap()d file has taken write protection fault and is being made writable. We - * can set the page state up correctly for a writable page, which means we can - * do correct delalloc accounting (ENOSPC checking!) and unwritten extent - * mapping. - */ -STATIC int -xfs_filemap_page_mkwrite( - struct vm_fault *vmf) +static int +__xfs_filemap_fault( + struct vm_fault *vmf, + enum page_entry_size pe_size, + bool write_fault) { struct inode *inode = file_inode(vmf->vma->vm_file); + struct xfs_inode *ip = XFS_I(inode); int ret; - trace_xfs_filemap_page_mkwrite(XFS_I(inode)); + trace_xfs_filemap_fault(ip, pe_size, write_fault); - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + if (write_fault) { + sb_start_pagefault(inode->i_sb); + file_update_time(vmf->vma->vm_file); + } + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); } else { - ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); - ret = block_page_mkwrite_return(ret); + if (write_fault) + ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); + else + ret = filemap_fault(vmf); } - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - sb_end_pagefault(inode->i_sb); + if (write_fault) + sb_end_pagefault(inode->i_sb); return ret; } -STATIC int +static int xfs_filemap_fault( struct vm_fault *vmf) { - struct inode *inode = file_inode(vmf->vma->vm_file); - int ret; - - trace_xfs_filemap_fault(XFS_I(inode)); - /* DAX can shortcut the normal fault path on write faults! */ - if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode)) - return xfs_filemap_page_mkwrite(vmf); - - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - if (IS_DAX(inode)) - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); - else - ret = filemap_fault(vmf); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - - return ret; + return __xfs_filemap_fault(vmf, PE_SIZE_PTE, + IS_DAX(file_inode(vmf->vma->vm_file)) && + (vmf->flags & FAULT_FLAG_WRITE)); } -/* - * Similar to xfs_filemap_fault(), the DAX fault path can call into here on - * both read and write faults. Hence we need to handle both cases. There is no - * ->huge_mkwrite callout for huge pages, so we have a single function here to - * handle both cases here. @flags carries the information on the type of fault - * occuring. - */ -STATIC int +static int xfs_filemap_huge_fault( struct vm_fault *vmf, enum page_entry_size pe_size) { - struct inode *inode = file_inode(vmf->vma->vm_file); - struct xfs_inode *ip = XFS_I(inode); - int ret; - - if (!IS_DAX(inode)) + if (!IS_DAX(file_inode(vmf->vma->vm_file))) return VM_FAULT_FALLBACK; - trace_xfs_filemap_huge_fault(ip); - - if (vmf->flags & FAULT_FLAG_WRITE) { - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - } - - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - - if (vmf->flags & FAULT_FLAG_WRITE) - sb_end_pagefault(inode->i_sb); + /* DAX can shortcut the normal fault path on write faults! */ + return __xfs_filemap_fault(vmf, pe_size, + (vmf->flags & FAULT_FLAG_WRITE)); +} - return ret; +static int +xfs_filemap_page_mkwrite( + struct vm_fault *vmf) +{ + return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); } /* diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0a9e698..3422711 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1124,11 +1124,11 @@ reclaim: * Because we use RCU freeing we need to ensure the inode always appears * to be reclaimed with an invalid inode number when in the free state. * We do this as early as possible under the ILOCK so that - * xfs_iflush_cluster() can be guaranteed to detect races with us here. - * By doing this, we guarantee that once xfs_iflush_cluster has locked - * XFS_ILOCK that it will see either a valid, flushable inode that will - * serialise correctly, or it will see a clean (and invalid) inode that - * it can skip. + * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to + * detect races with us here. By doing this, we guarantee that once + * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that + * it will see either a valid inode that will serialise correctly, or it + * will see an invalid inode that it can skip. */ spin_lock(&ip->i_flags_lock); ip->i_flags = XFS_IRECLAIM; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ff48f00..5599dda 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1055,7 +1055,7 @@ xfs_dir_ialloc( tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); } - code = xfs_trans_roll(&tp, NULL); + code = xfs_trans_roll(&tp); if (committed != NULL) *committed = 1; @@ -1285,7 +1285,7 @@ xfs_create( */ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -1513,7 +1513,7 @@ xfs_link( if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) { xfs_defer_cancel(&dfops); goto error_return; @@ -1607,11 +1607,12 @@ xfs_itruncate_extents( * Duplicate the transaction that has the permanent * reservation and commit the old transaction. */ - error = xfs_defer_finish(&tp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; - error = xfs_trans_roll(&tp, ip); + error = xfs_trans_roll_inode(&tp, ip); if (error) goto out; } @@ -1855,7 +1856,7 @@ xfs_inactive_ifree( * Just ignore errors at this point. There is nothing we can do except * to try to keep going. Make sure it's not a silent error. */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) { xfs_notice(mp, "%s: xfs_defer_finish returned error %d", __func__, error); @@ -2359,11 +2360,24 @@ retry: * already marked stale. If we can't lock it, back off * and retry. */ - if (ip != free_ip && - !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - rcu_read_unlock(); - delay(1); - goto retry; + if (ip != free_ip) { + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + rcu_read_unlock(); + delay(1); + goto retry; + } + + /* + * Check the inode number again in case we're + * racing with freeing in xfs_reclaim_inode(). + * See the comments in that function for more + * information as to why the initial check is + * not sufficient. + */ + if (ip->i_ino != inum + i) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + continue; + } } rcu_read_unlock(); @@ -2637,7 +2651,7 @@ xfs_remove( if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -2723,7 +2737,7 @@ xfs_finish_rename( if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_defer_finish(&tp, dfops, NULL); + error = xfs_defer_finish(&tp, dfops); if (error) { xfs_defer_cancel(dfops); xfs_trans_cancel(tp); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 013cc78..6d0f74e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -27,6 +27,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_trans_priv.h" +#include "xfs_buf_item.h" #include "xfs_log.h" @@ -475,6 +476,23 @@ xfs_inode_item_unpin( wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); } +/* + * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer + * have been failed during writeback + * + * This informs the AIL that the inode is already flush locked on the next push, + * and acquires a hold on the buffer to ensure that it isn't reclaimed before + * dirty data makes it to disk. + */ +STATIC void +xfs_inode_item_error( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode)); + xfs_set_li_failed(lip, bp); +} + STATIC uint xfs_inode_item_push( struct xfs_log_item *lip, @@ -484,13 +502,28 @@ xfs_inode_item_push( { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - struct xfs_buf *bp = NULL; + struct xfs_buf *bp = lip->li_buf; uint rval = XFS_ITEM_SUCCESS; int error; if (xfs_ipincount(ip) > 0) return XFS_ITEM_PINNED; + /* + * The buffer containing this item failed to be written back + * previously. Resubmit the buffer for IO. + */ + if (lip->li_flags & XFS_LI_FAILED) { + if (!xfs_buf_trylock(bp)) + return XFS_ITEM_LOCKED; + + if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list)) + rval = XFS_ITEM_FLUSHING; + + xfs_buf_unlock(bp); + return rval; + } + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) return XFS_ITEM_LOCKED; @@ -622,7 +655,8 @@ static const struct xfs_item_ops xfs_inode_item_ops = { .iop_unlock = xfs_inode_item_unlock, .iop_committed = xfs_inode_item_committed, .iop_push = xfs_inode_item_push, - .iop_committing = xfs_inode_item_committing + .iop_committing = xfs_inode_item_committing, + .iop_error = xfs_inode_item_error }; @@ -710,7 +744,8 @@ xfs_iflush_done( * the AIL lock. */ iip = INODE_ITEM(blip); - if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) + if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || + lip->li_flags & XFS_LI_FAILED) need_ail++; blip = next; @@ -718,7 +753,8 @@ xfs_iflush_done( /* make sure we capture the state of the initial inode. */ iip = INODE_ITEM(lip); - if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) + if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) || + lip->li_flags & XFS_LI_FAILED) need_ail++; /* @@ -739,6 +775,9 @@ xfs_iflush_done( if (INODE_ITEM(blip)->ili_logged && blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) mlip_changed |= xfs_ail_delete_one(ailp, blip); + else { + xfs_clear_li_failed(blip); + } } if (mlip_changed) { diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 9c0c7a9..5049e8a 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -931,16 +931,15 @@ xfs_ioc_fsgetxattr( return 0; } -STATIC void -xfs_set_diflags( +STATIC uint16_t +xfs_flags2diflags( struct xfs_inode *ip, unsigned int xflags) { - unsigned int di_flags; - uint64_t di_flags2; - /* can't set PREALLOC this way, just preserve it */ - di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + uint16_t di_flags = + (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + if (xflags & FS_XFLAG_IMMUTABLE) di_flags |= XFS_DIFLAG_IMMUTABLE; if (xflags & FS_XFLAG_APPEND) @@ -970,19 +969,24 @@ xfs_set_diflags( if (xflags & FS_XFLAG_EXTSIZE) di_flags |= XFS_DIFLAG_EXTSIZE; } - ip->i_d.di_flags = di_flags; - /* diflags2 only valid for v3 inodes. */ - if (ip->i_d.di_version < 3) - return; + return di_flags; +} + +STATIC uint64_t +xfs_flags2diflags2( + struct xfs_inode *ip, + unsigned int xflags) +{ + uint64_t di_flags2 = + (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); - di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); if (xflags & FS_XFLAG_DAX) di_flags2 |= XFS_DIFLAG2_DAX; if (xflags & FS_XFLAG_COWEXTSIZE) di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; - ip->i_d.di_flags2 = di_flags2; + return di_flags2; } STATIC void @@ -1008,11 +1012,12 @@ xfs_diflags_to_linux( inode->i_flags |= S_NOATIME; else inode->i_flags &= ~S_NOATIME; +#if 0 /* disabled until the flag switching races are sorted out */ if (xflags & FS_XFLAG_DAX) inode->i_flags |= S_DAX; else inode->i_flags &= ~S_DAX; - +#endif } static int @@ -1022,6 +1027,7 @@ xfs_ioctl_setattr_xflags( struct fsxattr *fa) { struct xfs_mount *mp = ip->i_mount; + uint64_t di_flags2; /* Can't change realtime flag if any extents are allocated. */ if ((ip->i_d.di_nextents || ip->i_delayed_blks) && @@ -1052,7 +1058,14 @@ xfs_ioctl_setattr_xflags( !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; - xfs_set_diflags(ip, fa->fsx_xflags); + /* diflags2 only valid for v3 inodes. */ + di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); + if (di_flags2 && ip->i_d.di_version < 3) + return -EINVAL; + + ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); + ip->i_d.di_flags2 = di_flags2; + xfs_diflags_to_linux(ip); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 813394c..79cb5b3 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -274,7 +274,7 @@ xfs_iomap_write_direct( /* * Complete the transaction */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -520,7 +520,6 @@ xfs_file_iomap_begin_delay( struct inode *inode, loff_t offset, loff_t count, - unsigned flags, struct iomap *iomap) { struct xfs_inode *ip = XFS_I(inode); @@ -784,7 +783,7 @@ xfs_iomap_write_allocate( if (error) goto trans_cancel; - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto trans_cancel; @@ -906,7 +905,7 @@ xfs_iomap_write_unwritten( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto error_on_bmapi_transaction; @@ -984,8 +983,7 @@ xfs_file_iomap_begin( if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) && !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { /* Reserve delalloc blocks for regular writeback. */ - return xfs_file_iomap_begin_delay(inode, offset, length, flags, - iomap); + return xfs_file_iomap_begin_delay(inode, offset, length, iomap); } if (need_excl_ilock(ip, flags)) { diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 469c9fa..17081c7 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -817,7 +817,7 @@ xfs_vn_setattr_nonsize( * Caution: The caller of this function is responsible for calling * setattr_prepare() or otherwise verifying the change is fine. */ -int +STATIC int xfs_setattr_size( struct xfs_inode *ip, struct iattr *iattr) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 4ebd0ba..c5107c7 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -743,10 +743,14 @@ xfs_log_mount_finish( struct xfs_mount *mp) { int error = 0; + bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY); if (mp->m_flags & XFS_MOUNT_NORECOVERY) { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); return 0; + } else if (readonly) { + /* Allow unlinked processing to proceed */ + mp->m_flags &= ~XFS_MOUNT_RDONLY; } /* @@ -757,12 +761,27 @@ xfs_log_mount_finish( * inodes. Turn it off immediately after recovery finishes * so that we don't leak the quota inodes if subsequent mount * activities fail. + * + * We let all inodes involved in redo item processing end up on + * the LRU instead of being evicted immediately so that if we do + * something to an unlinked inode, the irele won't cause + * premature truncation and freeing of the inode, which results + * in log recovery failure. We have to evict the unreferenced + * lru inodes after clearing MS_ACTIVE because we don't + * otherwise clean up the lru if there's a subsequent failure in + * xfs_mountfs, which leads to us leaking the inodes if nothing + * else (e.g. quotacheck) references the inodes before the + * mount failure occurs. */ mp->m_super->s_flags |= MS_ACTIVE; error = xlog_recover_finish(mp->m_log); if (!error) xfs_log_work_queue(mp); mp->m_super->s_flags &= ~MS_ACTIVE; + evict_inodes(mp->m_super); + + if (readonly) + mp->m_flags |= XFS_MOUNT_RDONLY; return error; } @@ -812,11 +831,14 @@ xfs_log_unmount_write(xfs_mount_t *mp) int error; /* - * Don't write out unmount record on read-only mounts. + * Don't write out unmount record on norecovery mounts or ro devices. * Or, if we are doing a forced umount (typically because of IO errors). */ - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (mp->m_flags & XFS_MOUNT_NORECOVERY || + xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); return 0; + } error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); @@ -3353,8 +3375,6 @@ maybe_sleep: */ if (iclog->ic_state & XLOG_STATE_IOERROR) return -EIO; - if (log_flushed) - *log_flushed = 1; } else { no_sleep: @@ -3458,8 +3478,6 @@ try_again: xlog_wait(&iclog->ic_prev->ic_write_wait, &log->l_icloglock); - if (log_flushed) - *log_flushed = 1; already_slept = 1; goto try_again; } @@ -3493,9 +3511,6 @@ try_again: */ if (iclog->ic_state & XLOG_STATE_IOERROR) return -EIO; - - if (log_flushed) - *log_flushed = 1; } else { /* just return */ spin_unlock(&log->l_icloglock); } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9549188..ee34899 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1029,61 +1029,106 @@ out_error: } /* - * Check the log tail for torn writes. This is required when torn writes are - * detected at the head and the head had to be walked back to a previous record. - * The tail of the previous record must now be verified to ensure the torn - * writes didn't corrupt the previous tail. + * Calculate distance from head to tail (i.e., unused space in the log). + */ +static inline int +xlog_tail_distance( + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) +{ + if (head_blk < tail_blk) + return tail_blk - head_blk; + + return tail_blk + (log->l_logBBsize - head_blk); +} + +/* + * Verify the log tail. This is particularly important when torn or incomplete + * writes have been detected near the front of the log and the head has been + * walked back accordingly. + * + * We also have to handle the case where the tail was pinned and the head + * blocked behind the tail right before a crash. If the tail had been pushed + * immediately prior to the crash and the subsequent checkpoint was only + * partially written, it's possible it overwrote the last referenced tail in the + * log with garbage. This is not a coherency problem because the tail must have + * been pushed before it can be overwritten, but appears as log corruption to + * recovery because we have no way to know the tail was updated if the + * subsequent checkpoint didn't write successfully. * - * Return an error if CRC verification fails as recovery cannot proceed. + * Therefore, CRC check the log from tail to head. If a failure occurs and the + * offending record is within max iclog bufs from the head, walk the tail + * forward and retry until a valid tail is found or corruption is detected out + * of the range of a possible overwrite. */ STATIC int xlog_verify_tail( struct xlog *log, xfs_daddr_t head_blk, - xfs_daddr_t tail_blk) + xfs_daddr_t *tail_blk, + int hsize) { struct xlog_rec_header *thead; struct xfs_buf *bp; xfs_daddr_t first_bad; - int count; int error = 0; bool wrapped; - xfs_daddr_t tmp_head; + xfs_daddr_t tmp_tail; + xfs_daddr_t orig_tail = *tail_blk; bp = xlog_get_bp(log, 1); if (!bp) return -ENOMEM; /* - * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get - * a temporary head block that points after the last possible - * concurrently written record of the tail. + * Make sure the tail points to a record (returns positive count on + * success). */ - count = xlog_seek_logrec_hdr(log, head_blk, tail_blk, - XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead, - &wrapped); - if (count < 0) { - error = count; + error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp, + &tmp_tail, &thead, &wrapped); + if (error < 0) goto out; - } - - /* - * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran - * into the actual log head. tmp_head points to the start of the record - * so update it to the actual head block. - */ - if (count < XLOG_MAX_ICLOGS + 1) - tmp_head = head_blk; + if (*tail_blk != tmp_tail) + *tail_blk = tmp_tail; /* - * We now have a tail and temporary head block that covers at least - * XLOG_MAX_ICLOGS records from the tail. We need to verify that these - * records were completely written. Run a CRC verification pass from - * tail to head and return the result. + * Run a CRC check from the tail to the head. We can't just check + * MAX_ICLOGS records past the tail because the tail may point to stale + * blocks cleared during the search for the head/tail. These blocks are + * overwritten with zero-length records and thus record count is not a + * reliable indicator of the iclog state before a crash. */ - error = xlog_do_recovery_pass(log, tmp_head, tail_blk, + first_bad = 0; + error = xlog_do_recovery_pass(log, head_blk, *tail_blk, XLOG_RECOVER_CRCPASS, &first_bad); + while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { + int tail_distance; + + /* + * Is corruption within range of the head? If so, retry from + * the next record. Otherwise return an error. + */ + tail_distance = xlog_tail_distance(log, head_blk, first_bad); + if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize)) + break; + /* skip to the next record; returns positive count on success */ + error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp, + &tmp_tail, &thead, &wrapped); + if (error < 0) + goto out; + + *tail_blk = tmp_tail; + first_bad = 0; + error = xlog_do_recovery_pass(log, head_blk, *tail_blk, + XLOG_RECOVER_CRCPASS, &first_bad); + } + + if (!error && *tail_blk != orig_tail) + xfs_warn(log->l_mp, + "Tail block (0x%llx) overwrite detected. Updated to 0x%llx", + orig_tail, *tail_blk); out: xlog_put_bp(bp); return error; @@ -1143,7 +1188,7 @@ xlog_verify_head( */ error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk, XLOG_RECOVER_CRCPASS, &first_bad); - if (error == -EFSBADCRC) { + if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { /* * We've hit a potential torn write. Reset the error and warn * about it. @@ -1183,31 +1228,12 @@ xlog_verify_head( ASSERT(0); return 0; } - - /* - * Now verify the tail based on the updated head. This is - * required because the torn writes trimmed from the head could - * have been written over the tail of a previous record. Return - * any errors since recovery cannot proceed if the tail is - * corrupt. - * - * XXX: This leaves a gap in truly robust protection from torn - * writes in the log. If the head is behind the tail, the tail - * pushes forward to create some space and then a crash occurs - * causing the writes into the previous record's tail region to - * tear, log recovery isn't able to recover. - * - * How likely is this to occur? If possible, can we do something - * more intelligent here? Is it safe to push the tail forward if - * we can determine that the tail is within the range of the - * torn write (e.g., the kernel can only overwrite the tail if - * it has actually been pushed forward)? Alternatively, could we - * somehow prevent this condition at runtime? - */ - error = xlog_verify_tail(log, *head_blk, *tail_blk); } + if (error) + return error; - return error; + return xlog_verify_tail(log, *head_blk, tail_blk, + be32_to_cpu((*rhead)->h_size)); } /* @@ -4801,12 +4827,16 @@ xlog_recover_process_intents( int error = 0; struct xfs_ail_cursor cur; struct xfs_ail *ailp; +#if defined(DEBUG) || defined(XFS_WARN) xfs_lsn_t last_lsn; +#endif ailp = log->l_ailp; spin_lock(&ailp->xa_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); +#if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); +#endif while (lip != NULL) { /* * We're done when we see something other than an intent. @@ -5218,7 +5248,7 @@ xlog_do_recovery_pass( xfs_daddr_t *first_bad) /* out: first bad log rec */ { xlog_rec_header_t *rhead; - xfs_daddr_t blk_no; + xfs_daddr_t blk_no, rblk_no; xfs_daddr_t rhead_blk; char *offset; xfs_buf_t *hbp, *dbp; @@ -5231,7 +5261,7 @@ xlog_do_recovery_pass( LIST_HEAD (buffer_list); ASSERT(head_blk != tail_blk); - rhead_blk = 0; + blk_no = rhead_blk = tail_blk; for (i = 0; i < XLOG_RHASH_SIZE; i++) INIT_HLIST_HEAD(&rhash[i]); @@ -5309,7 +5339,6 @@ xlog_do_recovery_pass( } memset(rhash, 0, sizeof(rhash)); - blk_no = rhead_blk = tail_blk; if (tail_blk > head_blk) { /* * Perform recovery around the end of the physical log. @@ -5371,9 +5400,19 @@ xlog_do_recovery_pass( bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); blk_no += hblks; - /* Read in data for log record */ - if (blk_no + bblks <= log->l_logBBsize) { - error = xlog_bread(log, blk_no, bblks, dbp, + /* + * Read the log record data in multiple reads if it + * wraps around the end of the log. Note that if the + * header already wrapped, blk_no could point past the + * end of the log. The record data is contiguous in + * that case. + */ + if (blk_no + bblks <= log->l_logBBsize || + blk_no >= log->l_logBBsize) { + /* mod blk_no in case the header wrapped and + * pushed it beyond the end of the log */ + rblk_no = do_mod(blk_no, log->l_logBBsize); + error = xlog_bread(log, rblk_no, bblks, dbp, &offset); if (error) goto bread_err2; @@ -5563,6 +5602,8 @@ xlog_do_recover( xfs_buf_t *bp; xfs_sb_t *sbp; + trace_xfs_log_recover(log, head_blk, tail_blk); + /* * First replay the images in the log. */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 15751dc..010a13a 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -31,6 +31,7 @@ #include "xfs_error.h" #include "xfs_bmap.h" #include "xfs_bmap_btree.h" +#include "xfs_bmap_util.h" #include "xfs_trans.h" #include "xfs_trans_space.h" #include "xfs_qm.h" @@ -1120,31 +1121,6 @@ xfs_qm_quotacheck_dqadjust( return 0; } -STATIC int -xfs_qm_get_rtblks( - xfs_inode_t *ip, - xfs_qcnt_t *O_rtblks) -{ - xfs_filblks_t rtblks; /* total rt blks */ - xfs_extnum_t idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t nextents; /* number of extent entries */ - int error; - - ASSERT(XFS_IS_REALTIME_INODE(ip)); - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK))) - return error; - } - rtblks = 0; - nextents = xfs_iext_count(ifp); - for (idx = 0; idx < nextents; idx++) - rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx)); - *O_rtblks = (xfs_qcnt_t)rtblks; - return 0; -} - /* * callback routine supplied to bulkstat(). Given an inumber, find its * dquots and update them to account for resources taken by that inode. @@ -1160,7 +1136,8 @@ xfs_qm_dqusage_adjust( int *res) /* result code value */ { xfs_inode_t *ip; - xfs_qcnt_t nblks, rtblks = 0; + xfs_qcnt_t nblks; + xfs_filblks_t rtblks = 0; /* total rt blks */ int error; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); @@ -1190,12 +1167,15 @@ xfs_qm_dqusage_adjust( ASSERT(ip->i_delayed_blks == 0); if (XFS_IS_REALTIME_INODE(ip)) { - /* - * Walk thru the extent list and count the realtime blocks. - */ - error = xfs_qm_get_rtblks(ip, &rtblks); - if (error) - goto error0; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto error0; + } + + xfs_bmap_count_leaves(ifp, &rtblks); } nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 96fe209..8f2e2fa 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -525,7 +525,7 @@ xfs_cui_recover( } xfs_refcount_finish_one_cleanup(tp, rcur, error); - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto abort_defer; set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index f45fbf0..3246815 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -464,7 +464,7 @@ retry: goto out_bmap_cancel; /* Finish up. */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -602,7 +602,8 @@ xfs_reflink_cancel_cow_blocks( -(long)del.br_blockcount); /* Roll the transaction */ - error = xfs_defer_finish(tpp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(tpp, &dfops); if (error) { xfs_defer_cancel(&dfops); break; @@ -791,7 +792,8 @@ xfs_reflink_end_cow( /* Remove the mapping from the CoW fork. */ xfs_bmap_del_extent_cow(ip, &idx, &got, &del); - error = xfs_defer_finish(&tp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_defer; next_extent: @@ -1152,7 +1154,8 @@ xfs_reflink_remap_extent( next_extent: /* Process all the deferred stuff. */ - error = xfs_defer_finish(&tp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_defer; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 9147219..488719d 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -810,7 +810,7 @@ xfs_growfs_rt_alloc( /* * Free any blocks freed up in the transaction, then commit. */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; error = xfs_trans_commit(tp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 38aaacd..c1c4c2e 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1220,7 +1220,7 @@ xfs_test_remount_options( tmp_mp->m_super = sb; error = xfs_parseargs(tmp_mp, options); xfs_free_fsname(tmp_mp); - kfree(tmp_mp); + kmem_free(tmp_mp); return error; } diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 23a50d7..68d3ca2 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -378,7 +378,7 @@ xfs_symlink( xfs_trans_set_sync(tp); } - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -497,7 +497,8 @@ xfs_inactive_symlink_rmt( /* * Commit the first transaction. This logs the EFI and the inode. */ - error = xfs_defer_finish(&tp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(&tp, &dfops); if (error) goto error_bmap_cancel; /* diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index bcc3cdf..bb55146 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -517,7 +517,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); @@ -689,11 +688,34 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); -DEFINE_INODE_EVENT(xfs_filemap_fault); -DEFINE_INODE_EVENT(xfs_filemap_huge_fault); -DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite); DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite); +TRACE_EVENT(xfs_filemap_fault, + TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, + bool write_fault), + TP_ARGS(ip, pe_size, write_fault), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(enum page_entry_size, pe_size) + __field(bool, write_fault) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->pe_size = pe_size; + __entry->write_fault = write_fault; + ), + TP_printk("dev %d:%d ino 0x%llx %s write_fault %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->pe_size, + { PE_SIZE_PTE, "PTE" }, + { PE_SIZE_PMD, "PMD" }, + { PE_SIZE_PUD, "PUD" }), + __entry->write_fault) +) + DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), TP_ARGS(ip, caller_ip), @@ -1963,6 +1985,24 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \ DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); +TRACE_EVENT(xfs_log_recover, + TP_PROTO(struct xlog *log, xfs_daddr_t headblk, xfs_daddr_t tailblk), + TP_ARGS(log, headblk, tailblk), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, headblk) + __field(xfs_daddr_t, tailblk) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->headblk = headblk; + __entry->tailblk = tailblk; + ), + TP_printk("dev %d:%d headblk 0x%llx tailblk 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->headblk, + __entry->tailblk) +) + TRACE_EVENT(xfs_log_recover_record, TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass), TP_ARGS(log, rhead, pass), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 2011620..a87f657 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1035,25 +1035,18 @@ xfs_trans_cancel( */ int xfs_trans_roll( - struct xfs_trans **tpp, - struct xfs_inode *dp) + struct xfs_trans **tpp) { - struct xfs_trans *trans; + struct xfs_trans *trans = *tpp; struct xfs_trans_res tres; int error; /* - * Ensure that the inode is always logged. - */ - trans = *tpp; - if (dp) - xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); - - /* * Copy the critical parameters from one trans to the next. */ tres.tr_logres = trans->t_log_res; tres.tr_logcount = trans->t_log_count; + *tpp = xfs_trans_dup(trans); /* @@ -1067,10 +1060,8 @@ xfs_trans_roll( if (error) return error; - trans = *tpp; - /* - * Reserve space in the log for th next transaction. + * Reserve space in the log for the next transaction. * This also pushes items in the "AIL", the list of logged items, * out to disk if they are taking up space at the tail of the log * that we want to use. This requires that either nothing be locked @@ -1078,14 +1069,5 @@ xfs_trans_roll( * the prior and the next transactions. */ tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - error = xfs_trans_reserve(trans, &tres, 0, 0); - /* - * Ensure that the inode is in the new transaction and locked. - */ - if (error) - return error; - - if (dp) - xfs_trans_ijoin(trans, dp, 0); - return 0; + return xfs_trans_reserve(*tpp, &tres, 0, 0); } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 6bdad6f..815b53d2 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -49,6 +49,7 @@ typedef struct xfs_log_item { struct xfs_ail *li_ailp; /* ptr to AIL */ uint li_type; /* item type */ uint li_flags; /* misc flags */ + struct xfs_buf *li_buf; /* real buffer pointer */ struct xfs_log_item *li_bio_list; /* buffer item list */ void (*li_cb)(struct xfs_buf *, struct xfs_log_item *); @@ -64,11 +65,13 @@ typedef struct xfs_log_item { } xfs_log_item_t; #define XFS_LI_IN_AIL 0x1 -#define XFS_LI_ABORTED 0x2 +#define XFS_LI_ABORTED 0x2 +#define XFS_LI_FAILED 0x4 #define XFS_LI_FLAGS \ { XFS_LI_IN_AIL, "IN_AIL" }, \ - { XFS_LI_ABORTED, "ABORTED" } + { XFS_LI_ABORTED, "ABORTED" }, \ + { XFS_LI_FAILED, "FAILED" } struct xfs_item_ops { void (*iop_size)(xfs_log_item_t *, int *, int *); @@ -79,6 +82,7 @@ struct xfs_item_ops { void (*iop_unlock)(xfs_log_item_t *); xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); + void (*iop_error)(xfs_log_item_t *, xfs_buf_t *); }; void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, @@ -208,12 +212,14 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); -void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); +bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); -void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); +void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint, + uint); +void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); void xfs_extent_free_init_defer_op(void); @@ -224,7 +230,8 @@ int xfs_trans_free_extent(struct xfs_trans *, struct xfs_efd_log_item *, xfs_fsblock_t, xfs_extlen_t, struct xfs_owner_info *); int xfs_trans_commit(struct xfs_trans *); -int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); +int xfs_trans_roll(struct xfs_trans **); +int xfs_trans_roll_inode(struct xfs_trans **, struct xfs_inode *); void xfs_trans_cancel(xfs_trans_t *); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 9056c0f..354368a 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -325,6 +325,21 @@ xfs_ail_delete( xfs_trans_ail_cursor_clear(ailp, lip); } +static inline uint +xfsaild_push_item( + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + /* + * If log item pinning is enabled, skip the push and track the item as + * pinned. This can help induce head-behind-tail conditions. + */ + if (XFS_TEST_ERROR(false, ailp->xa_mount, XFS_ERRTAG_LOG_ITEM_PIN)) + return XFS_ITEM_PINNED; + + return lip->li_ops->iop_push(lip, &ailp->xa_buf_list); +} + static long xfsaild_push( struct xfs_ail *ailp) @@ -382,7 +397,7 @@ xfsaild_push( * rely on the AIL cursor implementation to be able to deal with * the dropped lock. */ - lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list); + lock_result = xfsaild_push_item(ailp, lip); switch (lock_result) { case XFS_ITEM_SUCCESS: XFS_STATS_INC(mp, xs_push_ail_success); @@ -687,12 +702,13 @@ xfs_trans_ail_update_bulk( bool xfs_ail_delete_one( struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip) { struct xfs_log_item *mlip = xfs_ail_min(ailp); trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); xfs_ail_delete(ailp, lip); + xfs_clear_li_failed(lip); lip->li_flags &= ~XFS_LI_IN_AIL; lip->li_lsn = 0; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 86987d8..3ba7a96 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -435,7 +435,7 @@ xfs_trans_brelse(xfs_trans_t *tp, if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); - } else if (!xfs_buf_item_dirty(bip)) { + } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) { /*** ASSERT(bp->b_pincount == 0); ***/ @@ -493,25 +493,17 @@ xfs_trans_bhold_release(xfs_trans_t *tp, } /* - * This is called to mark bytes first through last inclusive of the given - * buffer as needing to be logged when the transaction is committed. - * The buffer must already be associated with the given transaction. - * - * First and last are numbers relative to the beginning of this buffer, - * so the first byte in the buffer is numbered 0 regardless of the - * value of b_blkno. + * Mark a buffer dirty in the transaction. */ void -xfs_trans_log_buf(xfs_trans_t *tp, - xfs_buf_t *bp, - uint first, - uint last) +xfs_trans_dirty_buf( + struct xfs_trans *tp, + struct xfs_buf *bp) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_fspriv; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); - ASSERT(first <= last && last < BBTOB(bp->b_length)); ASSERT(bp->b_iodone == NULL || bp->b_iodone == xfs_buf_iodone_callbacks); @@ -531,8 +523,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, bp->b_iodone = xfs_buf_iodone_callbacks; bip->bli_item.li_cb = xfs_buf_iodone; - trace_xfs_trans_log_buf(bip); - /* * If we invalidated the buffer within this transaction, then * cancel the invalidation now that we're dirtying the buffer @@ -545,17 +535,37 @@ xfs_trans_log_buf(xfs_trans_t *tp, bp->b_flags &= ~XBF_STALE; bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; } + bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; tp->t_flags |= XFS_TRANS_DIRTY; bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; +} - /* - * If we have an ordered buffer we are not logging any dirty range but - * it still needs to be marked dirty and that it has been logged. - */ - bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; - if (!(bip->bli_flags & XFS_BLI_ORDERED)) - xfs_buf_item_log(bip, first, last); +/* + * This is called to mark bytes first through last inclusive of the given + * buffer as needing to be logged when the transaction is committed. + * The buffer must already be associated with the given transaction. + * + * First and last are numbers relative to the beginning of this buffer, + * so the first byte in the buffer is numbered 0 regardless of the + * value of b_blkno. + */ +void +xfs_trans_log_buf( + struct xfs_trans *tp, + struct xfs_buf *bp, + uint first, + uint last) +{ + struct xfs_buf_log_item *bip = bp->b_fspriv; + + ASSERT(first <= last && last < BBTOB(bp->b_length)); + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED)); + + xfs_trans_dirty_buf(tp, bp); + + trace_xfs_trans_log_buf(bip); + xfs_buf_item_log(bip, first, last); } @@ -708,14 +718,13 @@ xfs_trans_inode_alloc_buf( } /* - * Mark the buffer as ordered for this transaction. This means - * that the contents of the buffer are not recorded in the transaction - * but it is tracked in the AIL as though it was. This allows us - * to record logical changes in transactions rather than the physical - * changes we make to the buffer without changing writeback ordering - * constraints of metadata buffers. + * Mark the buffer as ordered for this transaction. This means that the contents + * of the buffer are not recorded in the transaction but it is tracked in the + * AIL as though it was. This allows us to record logical changes in + * transactions rather than the physical changes we make to the buffer without + * changing writeback ordering constraints of metadata buffers. */ -void +bool xfs_trans_ordered_buf( struct xfs_trans *tp, struct xfs_buf *bp) @@ -726,8 +735,18 @@ xfs_trans_ordered_buf( ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); + if (xfs_buf_item_dirty_format(bip)) + return false; + bip->bli_flags |= XFS_BLI_ORDERED; trace_xfs_buf_item_ordered(bip); + + /* + * We don't log a dirty range of an ordered buffer but it still needs + * to be marked dirty and that it has been logged. + */ + xfs_trans_dirty_buf(tp, bp); + return true; } /* diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index dab8daa..daa761549 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -134,3 +134,17 @@ xfs_trans_log_inode( flags |= ip->i_itemp->ili_last_fields; ip->i_itemp->ili_fields |= flags; } + +int +xfs_trans_roll_inode( + struct xfs_trans **tpp, + struct xfs_inode *ip) +{ + int error; + + xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); + error = xfs_trans_roll(tpp); + if (!error) + xfs_trans_ijoin(*tpp, ip, 0); + return error; +} diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index d91706c..b317a36 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -164,4 +164,35 @@ xfs_trans_ail_copy_lsn( *dst = *src; } #endif + +static inline void +xfs_clear_li_failed( + struct xfs_log_item *lip) +{ + struct xfs_buf *bp = lip->li_buf; + + ASSERT(lip->li_flags & XFS_LI_IN_AIL); + lockdep_assert_held(&lip->li_ailp->xa_lock); + + if (lip->li_flags & XFS_LI_FAILED) { + lip->li_flags &= ~XFS_LI_FAILED; + lip->li_buf = NULL; + xfs_buf_rele(bp); + } +} + +static inline void +xfs_set_li_failed( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + lockdep_assert_held(&lip->li_ailp->xa_lock); + + if (!(lip->li_flags & XFS_LI_FAILED)) { + xfs_buf_hold(bp); + lip->li_flags |= XFS_LI_FAILED; + lip->li_buf = bp; + } +} + #endif /* __XFS_TRANS_PRIV_H__ */ |