diff options
Diffstat (limited to 'fs')
80 files changed, 1331 insertions, 776 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 3585636..6406f89 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -242,7 +242,8 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) } kfree(wnames); fid_out: - v9fs_fid_add(dentry, fid); + if (!IS_ERR(fid)) + v9fs_fid_add(dentry, fid); err_out: up_read(&v9ses->rename_sem); return fid; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 16c8a2a..899f168 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -292,9 +292,11 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) fid = filp->private_data; P9_DPRINTK(P9_DEBUG_VFS, - "inode: %p filp: %p fid: %d\n", inode, filp, fid->fid); + "v9fs_dir_release: inode: %p filp: %p fid: %d\n", + inode, filp, fid ? fid->fid : -1); filemap_write_and_wait(inode->i_mapping); - p9_client_clunk(fid); + if (fid) + p9_client_clunk(fid); return 0; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index c7c23ea..9e670d5 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -730,7 +730,10 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int mode, P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } - dentry->d_op = &v9fs_cached_dentry_operations; + if (v9ses->cache) + dentry->d_op = &v9fs_cached_dentry_operations; + else + dentry->d_op = &v9fs_dentry_operations; d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) @@ -1128,6 +1131,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb); generic_fillattr(dentry->d_inode, stat); + p9stat_free(st); kfree(st); return 0; } @@ -1489,6 +1493,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) retval = strnlen(buffer, buflen); done: + p9stat_free(st); kfree(st); return retval; } @@ -1942,7 +1947,7 @@ static const struct inode_operations v9fs_dir_inode_operations_dotu = { .unlink = v9fs_vfs_unlink, .mkdir = v9fs_vfs_mkdir, .rmdir = v9fs_vfs_rmdir, - .mknod = v9fs_vfs_mknod_dotl, + .mknod = v9fs_vfs_mknod, .rename = v9fs_vfs_rename, .getattr = v9fs_vfs_getattr, .setattr = v9fs_vfs_setattr, diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index f931107..1d12ba0 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -122,6 +122,10 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, fid = v9fs_session_init(v9ses, dev_name, data); if (IS_ERR(fid)) { retval = PTR_ERR(fid); + /* + * we need to call session_close to tear down some + * of the data structure setup by session_init + */ goto close_session; } @@ -144,7 +148,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, retval = -ENOMEM; goto release_sb; } - sb->s_root = root; if (v9fs_proto_dotl(v9ses)) { @@ -152,7 +155,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); if (IS_ERR(st)) { retval = PTR_ERR(st); - goto clunk_fid; + goto release_sb; } v9fs_stat2inode_dotl(st, root->d_inode); @@ -162,7 +165,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, st = p9_client_stat(fid); if (IS_ERR(st)) { retval = PTR_ERR(st); - goto clunk_fid; + goto release_sb; } root->d_inode->i_ino = v9fs_qid2ino(&st->qid); @@ -174,19 +177,24 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, v9fs_fid_add(root, fid); -P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); + P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); simple_set_mnt(mnt, sb); return 0; clunk_fid: p9_client_clunk(fid); - close_session: v9fs_session_close(v9ses); kfree(v9ses); return retval; - release_sb: + /* + * we will do the session_close and root dentry release + * in the below call. But we need to clunk fid, because we haven't + * attached the fid to dentry so it won't get clunked + * automatically. + */ + p9_client_clunk(fid); deactivate_locked_super(sb); return retval; } diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index a7528b9..fd0cc0b 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -724,7 +724,7 @@ static int __init init_misc_binfmt(void) { int err = register_filesystem(&bm_fs_type); if (!err) { - err = register_binfmt(&misc_format); + err = insert_binfmt(&misc_format); if (err) unregister_filesystem(&bm_fs_type); } diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 612a5c3..4d0ff5e 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -413,10 +413,10 @@ int bio_integrity_prep(struct bio *bio) /* Allocate kernel buffer for protection data */ len = sectors * blk_integrity_tuple_size(bi); - buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp); + buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); - return -EIO; + return -ENOMEM; } end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 5598a0d..4cfce1e 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -87,7 +87,7 @@ static int ceph_set_page_dirty(struct page *page) /* dirty the head */ spin_lock(&inode->i_lock); - if (ci->i_wrbuffer_ref_head == 0) + if (ci->i_head_snapc == NULL) ci->i_head_snapc = ceph_get_snap_context(snapc); ++ci->i_wrbuffer_ref_head; if (ci->i_wrbuffer_ref == 0) @@ -105,13 +105,7 @@ static int ceph_set_page_dirty(struct page *page) spin_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(!PageUptodate(page)); - - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - task_io_account_write(PAGE_CACHE_SIZE); - } + account_page_dirtied(page, page->mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); @@ -352,7 +346,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, break; } } - if (!snapc && ci->i_head_snapc) { + if (!snapc && ci->i_wrbuffer_ref_head) { snapc = ceph_get_snap_context(ci->i_head_snapc); dout(" head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index 582e0b2..a2d002c 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -376,7 +376,7 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) th = get_ticket_handler(ac, service); - if (!th) { + if (IS_ERR(th)) { *pneed |= service; continue; } @@ -399,6 +399,9 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, struct ceph_x_ticket_handler *th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); + if (IS_ERR(th)) + return PTR_ERR(th); + ceph_x_validate_tickets(ac, &need); dout("build_request want %x have %x need %x\n", @@ -450,7 +453,6 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, return -ERANGE; head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY); - BUG_ON(!th); ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer); if (ret) return ret; @@ -505,7 +507,8 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, case CEPHX_GET_PRINCIPAL_SESSION_KEY: th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); - BUG_ON(!th); + if (IS_ERR(th)) + return PTR_ERR(th); ret = ceph_x_proc_ticket_reply(ac, &th->session_key, buf + sizeof(*head), end); break; @@ -563,8 +566,8 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, void *end = p + sizeof(au->reply_buf); th = get_ticket_handler(ac, au->service); - if (!th) - return -EIO; /* hrm! */ + if (IS_ERR(th)) + return PTR_ERR(th); ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply)); if (ret < 0) return ret; @@ -626,7 +629,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, struct ceph_x_ticket_handler *th; th = get_ticket_handler(ac, peer_type); - if (th && !IS_ERR(th)) + if (!IS_ERR(th)) remove_ticket_handler(ac, th); } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7bf182b..a2069b6 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1082,6 +1082,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, gid_t gid; struct ceph_mds_session *session; u64 xattr_version = 0; + struct ceph_buffer *xattr_blob = NULL; int delayed = 0; u64 flush_tid = 0; int i; @@ -1142,6 +1143,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, for (i = 0; i < CEPH_CAP_BITS; i++) if (flushing & (1 << i)) ci->i_cap_flush_tid[i] = flush_tid; + + follows = ci->i_head_snapc->seq; + } else { + follows = 0; } keep = cap->implemented; @@ -1155,14 +1160,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, mtime = inode->i_mtime; atime = inode->i_atime; time_warp_seq = ci->i_time_warp_seq; - follows = ci->i_snap_realm->cached_context->seq; uid = inode->i_uid; gid = inode->i_gid; mode = inode->i_mode; - if (dropping & CEPH_CAP_XATTR_EXCL) { + if (flushing & CEPH_CAP_XATTR_EXCL) { __ceph_build_xattrs_blob(ci); - xattr_version = ci->i_xattrs.version + 1; + xattr_blob = ci->i_xattrs.blob; + xattr_version = ci->i_xattrs.version; } spin_unlock(&inode->i_lock); @@ -1170,9 +1175,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, size, max_size, &mtime, &atime, time_warp_seq, - uid, gid, mode, - xattr_version, - (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL, + uid, gid, mode, xattr_version, xattr_blob, follows); if (ret < 0) { dout("error sending cap msg, must requeue %p\n", inode); @@ -1282,7 +1285,7 @@ retry: &capsnap->mtime, &capsnap->atime, capsnap->time_warp_seq, capsnap->uid, capsnap->gid, capsnap->mode, - 0, NULL, + capsnap->xattr_version, capsnap->xattr_blob, capsnap->follows); next_follows = capsnap->follows + 1; @@ -1332,7 +1335,11 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; if (was == 0) { - dout(" inode %p now dirty\n", &ci->vfs_inode); + if (!ci->i_head_snapc) + ci->i_head_snapc = ceph_get_snap_context( + ci->i_snap_realm->cached_context); + dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode, + ci->i_head_snapc); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); list_add(&ci->i_dirty_item, &mdsc->cap_dirty); @@ -2190,7 +2197,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, if (ci->i_head_snapc == snapc) { ci->i_wrbuffer_ref_head -= nr; - if (!ci->i_wrbuffer_ref_head) { + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { + BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } @@ -2483,6 +2492,11 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, dout(" inode %p now clean\n", inode); BUG_ON(!list_empty(&ci->i_dirty_item)); drop = 1; + if (ci->i_wrbuffer_ref_head == 0) { + BUG_ON(!ci->i_head_snapc); + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } } else { BUG_ON(list_empty(&ci->i_dirty_item)); } diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 360c4f2..6fd8b20 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -171,6 +171,8 @@ static int mdsc_show(struct seq_file *s, void *p) } else if (req->r_dentry) { path = ceph_mdsc_build_path(req->r_dentry, &pathlen, &pathbase, 0); + if (IS_ERR(path)) + path = NULL; spin_lock(&req->r_dentry->d_lock); seq_printf(s, " #%llx/%.*s (%s)", ceph_ino(req->r_dentry->d_parent->d_inode), @@ -187,6 +189,8 @@ static int mdsc_show(struct seq_file *s, void *p) if (req->r_old_dentry) { path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen, &pathbase, 0); + if (IS_ERR(path)) + path = NULL; spin_lock(&req->r_old_dentry->d_lock); seq_printf(s, " #%llx/%.*s (%s)", ceph_ino(req->r_old_dentry->d_parent->d_inode), diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 67bbb41..6e4f43f 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -46,7 +46,7 @@ int ceph_init_dentry(struct dentry *dentry) else dentry->d_op = &ceph_snap_dentry_ops; - di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS); + di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); if (!di) return -ENOMEM; /* oh well */ diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 5d893d3..e7cca414 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -677,6 +677,7 @@ static int fill_inode(struct inode *inode, if (ci->i_files == 0 && ci->i_subdirs == 0 && ceph_snap(inode) == CEPH_NOSNAP && (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { dout(" marking %p complete (empty)\n", inode); ci->i_ceph_flags |= CEPH_I_COMPLETE; @@ -1229,11 +1230,11 @@ retry_lookup: in = dn->d_inode; } else { in = ceph_get_inode(parent->d_sb, vino); - if (in == NULL) { + if (IS_ERR(in)) { dout("new_inode badness\n"); d_delete(dn); dput(dn); - err = -ENOMEM; + err = PTR_ERR(in); goto out; } dn = splice_dentry(dn, in, NULL); diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index ae85af0..ff4e753 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -82,7 +82,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) length = fl->fl_end - fl->fl_start + 1; err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, - (u64)fl->fl_pid, (u64)fl->fl_nspid, + (u64)fl->fl_pid, + (u64)(unsigned long)fl->fl_nspid, lock_cmd, fl->fl_start, length, wait); if (!err) { @@ -92,7 +93,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) /* undo! This should only happen if the kernel detects * local deadlock. */ ceph_lock_message(CEPH_LOCK_FCNTL, op, file, - (u64)fl->fl_pid, (u64)fl->fl_nspid, + (u64)fl->fl_pid, + (u64)(unsigned long)fl->fl_nspid, CEPH_LOCK_UNLOCK, fl->fl_start, length, 0); dout("got %d on posix_lock_file, undid lock", err); @@ -132,7 +134,8 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) length = fl->fl_end - fl->fl_start + 1; err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, - file, (u64)fl->fl_pid, (u64)fl->fl_nspid, + file, (u64)fl->fl_pid, + (u64)(unsigned long)fl->fl_nspid, lock_cmd, fl->fl_start, length, wait); if (!err) { @@ -141,7 +144,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, file, (u64)fl->fl_pid, - (u64)fl->fl_nspid, + (u64)(unsigned long)fl->fl_nspid, CEPH_LOCK_UNLOCK, fl->fl_start, length, 0); dout("got %d on flock_lock_file_wait, undid lock", err); @@ -235,7 +238,8 @@ int lock_to_ceph_filelock(struct file_lock *lock, cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); cephlock->client = cpu_to_le64(0); cephlock->pid = cpu_to_le64(lock->fl_pid); - cephlock->pid_namespace = cpu_to_le64((u64)lock->fl_nspid); + cephlock->pid_namespace = + cpu_to_le64((u64)(unsigned long)lock->fl_nspid); switch (lock->fl_type) { case F_RDLCK: diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a75ddbf..f091b13 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -560,6 +560,13 @@ static void __unregister_request(struct ceph_mds_client *mdsc, * * Called under mdsc->mutex. */ +struct dentry *get_nonsnap_parent(struct dentry *dentry) +{ + while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + dentry = dentry->d_parent; + return dentry; +} + static int __choose_mds(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { @@ -590,14 +597,29 @@ static int __choose_mds(struct ceph_mds_client *mdsc, if (req->r_inode) { inode = req->r_inode; } else if (req->r_dentry) { - if (req->r_dentry->d_inode) { + struct inode *dir = req->r_dentry->d_parent->d_inode; + + if (dir->i_sb != mdsc->client->sb) { + /* not this fs! */ + inode = req->r_dentry->d_inode; + } else if (ceph_snap(dir) != CEPH_NOSNAP) { + /* direct snapped/virtual snapdir requests + * based on parent dir inode */ + struct dentry *dn = + get_nonsnap_parent(req->r_dentry->d_parent); + inode = dn->d_inode; + dout("__choose_mds using nonsnap parent %p\n", inode); + } else if (req->r_dentry->d_inode) { + /* dentry target */ inode = req->r_dentry->d_inode; } else { - inode = req->r_dentry->d_parent->d_inode; + /* dir + name */ + inode = dir; hash = req->r_dentry->d_name.hash; is_hash = true; } } + dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, (int)hash, mode); if (!inode) @@ -2208,7 +2230,7 @@ static void handle_session(struct ceph_mds_session *session, pr_info("mds%d reconnect denied\n", session->s_mds); remove_session_caps(session); wake = 1; /* for good measure */ - complete_all(&mdsc->session_close_waiters); + wake_up_all(&mdsc->session_close_wq); kick_requests(mdsc, mds); break; @@ -2302,7 +2324,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0); if (IS_ERR(path)) { err = PTR_ERR(path); - BUG_ON(err); + goto out_dput; } } else { path = NULL; @@ -2310,7 +2332,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, } err = ceph_pagelist_encode_string(pagelist, path, pathlen); if (err) - goto out; + goto out_free; spin_lock(&inode->i_lock); cap->seq = 0; /* reset cap seq */ @@ -2354,8 +2376,9 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, unlock_kernel(); } -out: +out_free: kfree(path); +out_dput: dput(dentry); return err; } @@ -2876,7 +2899,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) return -ENOMEM; init_completion(&mdsc->safe_umount_waiters); - init_completion(&mdsc->session_close_waiters); + init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); mdsc->sessions = NULL; mdsc->max_sessions = 0; @@ -3021,6 +3044,23 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); } +/* + * true if all sessions are closed, or we force unmount + */ +bool done_closing_sessions(struct ceph_mds_client *mdsc) +{ + int i, n = 0; + + if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) + return true; + + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) + if (mdsc->sessions[i]) + n++; + mutex_unlock(&mdsc->mutex); + return n == 0; +} /* * called after sb is ro. @@ -3029,45 +3069,32 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) { struct ceph_mds_session *session; int i; - int n; struct ceph_client *client = mdsc->client; - unsigned long started, timeout = client->mount_args->mount_timeout * HZ; + unsigned long timeout = client->mount_args->mount_timeout * HZ; dout("close_sessions\n"); - mutex_lock(&mdsc->mutex); - /* close sessions */ - started = jiffies; - while (time_before(jiffies, started + timeout)) { - dout("closing sessions\n"); - n = 0; - for (i = 0; i < mdsc->max_sessions; i++) { - session = __ceph_lookup_mds_session(mdsc, i); - if (!session) - continue; - mutex_unlock(&mdsc->mutex); - mutex_lock(&session->s_mutex); - __close_session(mdsc, session); - mutex_unlock(&session->s_mutex); - ceph_put_mds_session(session); - mutex_lock(&mdsc->mutex); - n++; - } - if (n == 0) - break; - - if (client->mount_state == CEPH_MOUNT_SHUTDOWN) - break; - - dout("waiting for sessions to close\n"); + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + session = __ceph_lookup_mds_session(mdsc, i); + if (!session) + continue; mutex_unlock(&mdsc->mutex); - wait_for_completion_timeout(&mdsc->session_close_waiters, - timeout); + mutex_lock(&session->s_mutex); + __close_session(mdsc, session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); mutex_lock(&mdsc->mutex); } + mutex_unlock(&mdsc->mutex); + + dout("waiting for sessions to close\n"); + wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc), + timeout); /* tear down remaining sessions */ + mutex_lock(&mdsc->mutex); for (i = 0; i < mdsc->max_sessions; i++) { if (mdsc->sessions[i]) { session = get_session(mdsc->sessions[i]); @@ -3080,9 +3107,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) mutex_lock(&mdsc->mutex); } } - WARN_ON(!list_empty(&mdsc->cap_delay_list)); - mutex_unlock(&mdsc->mutex); ceph_cleanup_empty_realms(mdsc); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ab7e89f..c98267c 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -234,7 +234,8 @@ struct ceph_mds_client { struct mutex mutex; /* all nested structures */ struct ceph_mdsmap *mdsmap; - struct completion safe_umount_waiters, session_close_waiters; + struct completion safe_umount_waiters; + wait_queue_head_t session_close_wq; struct list_head waiting_for_map; struct ceph_mds_session **sessions; /* NULL for mds if no session */ diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index bed6391..dfced1d 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -661,7 +661,7 @@ static int __send_request(struct ceph_osd_client *osdc, reqhead->reassert_version = req->r_reassert_version; req->r_stamp = jiffies; - list_move_tail(&osdc->req_lru, &req->r_req_lru_item); + list_move_tail(&req->r_req_lru_item, &osdc->req_lru); ceph_msg_get(req->r_request); /* send consumes a ref */ ceph_con_send(&req->r_osd->o_con, req->r_request); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index c0b26b6..4868b9d 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -435,7 +435,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) { struct inode *inode = &ci->vfs_inode; struct ceph_cap_snap *capsnap; - int used; + int used, dirty; capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); if (!capsnap) { @@ -445,6 +445,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) spin_lock(&inode->i_lock); used = __ceph_caps_used(ci); + dirty = __ceph_caps_dirty(ci); if (__ceph_have_pending_cap_snap(ci)) { /* there is no point in queuing multiple "pending" cap_snaps, as no new writes are allowed to start when pending, so any @@ -452,11 +453,15 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) cap_snap. lucky us. */ dout("queue_cap_snap %p already pending\n", inode); kfree(capsnap); - } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) { + } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) || + (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| + CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) { struct ceph_snap_context *snapc = ci->i_head_snapc; + dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode, + capsnap, snapc); igrab(inode); - + atomic_set(&capsnap->nref, 1); capsnap->ci = ci; INIT_LIST_HEAD(&capsnap->ci_item); @@ -464,15 +469,21 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) capsnap->follows = snapc->seq - 1; capsnap->issued = __ceph_caps_issued(ci, NULL); - capsnap->dirty = __ceph_caps_dirty(ci); + capsnap->dirty = dirty; capsnap->mode = inode->i_mode; capsnap->uid = inode->i_uid; capsnap->gid = inode->i_gid; - /* fixme? */ - capsnap->xattr_blob = NULL; - capsnap->xattr_len = 0; + if (dirty & CEPH_CAP_XATTR_EXCL) { + __ceph_build_xattrs_blob(ci); + capsnap->xattr_blob = + ceph_buffer_get(ci->i_xattrs.blob); + capsnap->xattr_version = ci->i_xattrs.version; + } else { + capsnap->xattr_blob = NULL; + capsnap->xattr_version = 0; + } /* dirty page count moved from _head to this cap_snap; all subsequent writes page dirties occur _after_ this @@ -480,7 +491,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) capsnap->dirty_pages = ci->i_wrbuffer_ref_head; ci->i_wrbuffer_ref_head = 0; capsnap->context = snapc; - ci->i_head_snapc = NULL; + ci->i_head_snapc = + ceph_get_snap_context(ci->i_snap_realm->cached_context); + dout(" new snapc is %p\n", ci->i_head_snapc); list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); if (used & CEPH_CAP_FILE_WR) { @@ -539,6 +552,41 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, return 1; /* caller may want to ceph_flush_snaps */ } +/* + * Queue cap_snaps for snap writeback for this realm and its children. + * Called under snap_rwsem, so realm topology won't change. + */ +static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) +{ + struct ceph_inode_info *ci; + struct inode *lastinode = NULL; + struct ceph_snap_realm *child; + + dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); + + spin_lock(&realm->inodes_with_caps_lock); + list_for_each_entry(ci, &realm->inodes_with_caps, + i_snap_realm_item) { + struct inode *inode = igrab(&ci->vfs_inode); + if (!inode) + continue; + spin_unlock(&realm->inodes_with_caps_lock); + if (lastinode) + iput(lastinode); + lastinode = inode; + ceph_queue_cap_snap(ci); + spin_lock(&realm->inodes_with_caps_lock); + } + spin_unlock(&realm->inodes_with_caps_lock); + if (lastinode) + iput(lastinode); + + dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino); + list_for_each_entry(child, &realm->children, child_item) + queue_realm_cap_snaps(child); + + dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); +} /* * Parse and apply a snapblob "snap trace" from the MDS. This specifies @@ -589,29 +637,8 @@ more: * * ...unless it's a snap deletion! */ - if (!deletion) { - struct ceph_inode_info *ci; - struct inode *lastinode = NULL; - - spin_lock(&realm->inodes_with_caps_lock); - list_for_each_entry(ci, &realm->inodes_with_caps, - i_snap_realm_item) { - struct inode *inode = igrab(&ci->vfs_inode); - if (!inode) - continue; - spin_unlock(&realm->inodes_with_caps_lock); - if (lastinode) - iput(lastinode); - lastinode = inode; - ceph_queue_cap_snap(ci); - spin_lock(&realm->inodes_with_caps_lock); - } - spin_unlock(&realm->inodes_with_caps_lock); - if (lastinode) - iput(lastinode); - dout("update_snap_trace cap_snaps queued\n"); - } - + if (!deletion) + queue_realm_cap_snaps(realm); } else { dout("update_snap_trace %llx %p seq %lld unchanged\n", realm->ino, realm, realm->seq); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 2482d69..c33897a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -216,8 +216,7 @@ struct ceph_cap_snap { uid_t uid; gid_t gid; - void *xattr_blob; - int xattr_len; + struct ceph_buffer *xattr_blob; u64 xattr_version; u64 size; @@ -229,8 +228,11 @@ struct ceph_cap_snap { static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) { - if (atomic_dec_and_test(&capsnap->nref)) + if (atomic_dec_and_test(&capsnap->nref)) { + if (capsnap->xattr_blob) + ceph_buffer_put(capsnap->xattr_blob); kfree(capsnap); + } } /* @@ -342,7 +344,8 @@ struct ceph_inode_info { unsigned i_cap_exporting_issued; struct ceph_cap_reservation i_cap_migration_resv; struct list_head i_cap_snaps; /* snapped state pending flush to mds */ - struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */ + struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or + dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */ int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 097a265..9578af6 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -485,6 +485,7 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob; ci->i_xattrs.prealloc_blob = NULL; ci->i_xattrs.dirty = false; + ci->i_xattrs.version++; } } diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h index 6506382..7fe6b52 100644 --- a/fs/cifs/cifs_unicode.h +++ b/fs/cifs/cifs_unicode.h @@ -30,6 +30,8 @@ * This is a compressed table of upper and lower case conversion. * */ +#ifndef _CIFS_UNICODE_H +#define _CIFS_UNICODE_H #include <asm/byteorder.h> #include <linux/types.h> @@ -67,8 +69,8 @@ extern const struct UniCaseRange CifsUniUpperRange[]; #endif /* UNIUPR_NOUPPER */ #ifndef UNIUPR_NOLOWER -extern signed char UniLowerTable[512]; -extern struct UniCaseRange UniLowerRange[]; +extern signed char CifsUniLowerTable[512]; +extern const struct UniCaseRange CifsUniLowerRange[]; #endif /* UNIUPR_NOLOWER */ #ifdef __KERNEL__ @@ -337,15 +339,15 @@ UniStrupr(register wchar_t *upin) * UniTolower: Convert a unicode character to lower case */ static inline wchar_t -UniTolower(wchar_t uc) +UniTolower(register wchar_t uc) { - register struct UniCaseRange *rp; + register const struct UniCaseRange *rp; - if (uc < sizeof(UniLowerTable)) { + if (uc < sizeof(CifsUniLowerTable)) { /* Latin characters */ - return uc + UniLowerTable[uc]; /* Use base tables */ + return uc + CifsUniLowerTable[uc]; /* Use base tables */ } else { - rp = UniLowerRange; /* Use range tables */ + rp = CifsUniLowerRange; /* Use range tables */ while (rp->start) { if (uc < rp->start) /* Before start of range */ return uc; /* Uppercase = input */ @@ -374,3 +376,5 @@ UniStrlwr(register wchar_t *upin) } #endif + +#endif /* _CIFS_UNICODE_H */ diff --git a/fs/cifs/cifs_uniupr.h b/fs/cifs/cifs_uniupr.h index 18a9d97..0ac7c5a 100644 --- a/fs/cifs/cifs_uniupr.h +++ b/fs/cifs/cifs_uniupr.h @@ -140,7 +140,7 @@ const struct UniCaseRange CifsUniUpperRange[] = { /* * Latin lower case */ -static signed char CifsUniLowerTable[512] = { +signed char CifsUniLowerTable[512] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ @@ -242,12 +242,12 @@ static signed char UniCaseRangeLff20[27] = { /* * Lower Case Range */ -static const struct UniCaseRange CifsUniLowerRange[] = { - 0x0380, 0x03ab, UniCaseRangeL0380, - 0x0400, 0x042f, UniCaseRangeL0400, - 0x0490, 0x04cb, UniCaseRangeL0490, - 0x1e00, 0x1ff7, UniCaseRangeL1e00, - 0xff20, 0xff3a, UniCaseRangeLff20, - 0, 0, 0 +const struct UniCaseRange CifsUniLowerRange[] = { + {0x0380, 0x03ab, UniCaseRangeL0380}, + {0x0400, 0x042f, UniCaseRangeL0400}, + {0x0490, 0x04cb, UniCaseRangeL0490}, + {0x1e00, 0x1ff7, UniCaseRangeL1e00}, + {0xff20, 0xff3a, UniCaseRangeLff20}, + {0} }; #endif diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 847628d..35042d8 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -223,63 +223,6 @@ int cifs_calculate_mac_key(struct mac_key *key, const char *rn, return 0; } -int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *ses, - const struct nls_table *nls_info) -{ - char temp_hash[16]; - struct HMACMD5Context ctx; - char *ucase_buf; - __le16 *unicode_buf; - unsigned int i, user_name_len, dom_name_len; - - if (ses == NULL) - return -EINVAL; - - E_md4hash(ses->password, temp_hash); - - hmac_md5_init_limK_to_64(temp_hash, 16, &ctx); - user_name_len = strlen(ses->userName); - if (user_name_len > MAX_USERNAME_SIZE) - return -EINVAL; - if (ses->domainName == NULL) - return -EINVAL; /* BB should we use CIFS_LINUX_DOM */ - dom_name_len = strlen(ses->domainName); - if (dom_name_len > MAX_USERNAME_SIZE) - return -EINVAL; - - ucase_buf = kmalloc((MAX_USERNAME_SIZE+1), GFP_KERNEL); - if (ucase_buf == NULL) - return -ENOMEM; - unicode_buf = kmalloc((MAX_USERNAME_SIZE+1)*4, GFP_KERNEL); - if (unicode_buf == NULL) { - kfree(ucase_buf); - return -ENOMEM; - } - - for (i = 0; i < user_name_len; i++) - ucase_buf[i] = nls_info->charset2upper[(int)ses->userName[i]]; - ucase_buf[i] = 0; - user_name_len = cifs_strtoUCS(unicode_buf, ucase_buf, - MAX_USERNAME_SIZE*2, nls_info); - unicode_buf[user_name_len] = 0; - user_name_len++; - - for (i = 0; i < dom_name_len; i++) - ucase_buf[i] = nls_info->charset2upper[(int)ses->domainName[i]]; - ucase_buf[i] = 0; - dom_name_len = cifs_strtoUCS(unicode_buf+user_name_len, ucase_buf, - MAX_USERNAME_SIZE*2, nls_info); - - unicode_buf[user_name_len + dom_name_len] = 0; - hmac_md5_update((const unsigned char *) unicode_buf, - (user_name_len+dom_name_len)*2, &ctx); - - hmac_md5_final(ses->server->ntlmv2_hash, &ctx); - kfree(ucase_buf); - kfree(unicode_buf); - return 0; -} - #ifdef CONFIG_CIFS_WEAK_PW_HASH void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, char *lnm_session_key) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 1f54508..1d60c65 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -87,8 +87,9 @@ extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); extern int decode_negTokenInit(unsigned char *security_blob, int length, struct TCP_Server_Info *server); extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); +extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port); extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, - unsigned short int port); + const unsigned short int port); extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr); extern void header_assemble(struct smb_hdr *, char /* command */ , const struct cifsTconInfo *, int /* length of @@ -365,8 +366,6 @@ extern int cifs_verify_signature(struct smb_hdr *, __u32 expected_sequence_number); extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn, const char *pass); -extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *, - const struct nls_table *); extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *); extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *, const struct nls_table *); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 95c2ea6..67dad54 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -400,7 +400,9 @@ incomplete_rcv: cFYI(1, "call to reconnect done"); csocket = server->ssocket; continue; - } else if ((length == -ERESTARTSYS) || (length == -EAGAIN)) { + } else if (length == -ERESTARTSYS || + length == -EAGAIN || + length == -EINTR) { msleep(1); /* minimum sleep to prevent looping allowing socket to clear and app threads to set tcpStatus CifsNeedReconnect if server hung */ @@ -414,18 +416,6 @@ incomplete_rcv: } else continue; } else if (length <= 0) { - if (server->tcpStatus == CifsNew) { - cFYI(1, "tcp session abend after SMBnegprot"); - /* some servers kill the TCP session rather than - returning an SMB negprot error, in which - case reconnecting here is not going to help, - and so simply return error to mount */ - break; - } - if (!try_to_freeze() && (length == -EINTR)) { - cFYI(1, "cifsd thread killed"); - break; - } cFYI(1, "Reconnect after unexpected peek error %d", length); cifs_reconnect(server); @@ -466,27 +456,19 @@ incomplete_rcv: an error on SMB negprot response */ cFYI(1, "Negative RFC1002 Session Response Error 0x%x)", pdu_length); - if (server->tcpStatus == CifsNew) { - /* if nack on negprot (rather than - ret of smb negprot error) reconnecting - not going to help, ret error to mount */ - break; - } else { - /* give server a second to - clean up before reconnect attempt */ - msleep(1000); - /* always try 445 first on reconnect - since we get NACK on some if we ever - connected to port 139 (the NACK is - since we do not begin with RFC1001 - session initialize frame) */ - server->addr.sockAddr.sin_port = - htons(CIFS_PORT); - cifs_reconnect(server); - csocket = server->ssocket; - wake_up(&server->response_q); - continue; - } + /* give server a second to clean up */ + msleep(1000); + /* always try 445 first on reconnect since we get NACK + * on some if we ever connected to port 139 (the NACK + * is since we do not begin with RFC1001 session + * initialize frame) + */ + cifs_set_port((struct sockaddr *) + &server->addr.sockAddr, CIFS_PORT); + cifs_reconnect(server); + csocket = server->ssocket; + wake_up(&server->response_q); + continue; } else if (temp != (char) 0) { cERROR(1, "Unknown RFC 1002 frame"); cifs_dump_mem(" Received Data: ", (char *)smb_buffer, @@ -522,8 +504,7 @@ incomplete_rcv: total_read += length) { length = kernel_recvmsg(csocket, &smb_msg, &iov, 1, pdu_length - total_read, 0); - if ((server->tcpStatus == CifsExiting) || - (length == -EINTR)) { + if (server->tcpStatus == CifsExiting) { /* then will exit */ reconnect = 2; break; @@ -534,8 +515,9 @@ incomplete_rcv: /* Now we will reread sock */ reconnect = 1; break; - } else if ((length == -ERESTARTSYS) || - (length == -EAGAIN)) { + } else if (length == -ERESTARTSYS || + length == -EAGAIN || + length == -EINTR) { msleep(1); /* minimum sleep to prevent looping, allowing socket to clear and app threads to set tcpStatus @@ -1673,7 +1655,9 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol) MAX_USERNAME_SIZE)) continue; if (strlen(vol->username) != 0 && - strncmp(ses->password, vol->password, + ses->password != NULL && + strncmp(ses->password, + vol->password ? vol->password : "", MAX_PASSWORD_SIZE)) continue; } diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 578d88c..f9ed075 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -305,8 +305,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, full_path = build_path_from_dentry(direntry); if (full_path == NULL) { rc = -ENOMEM; - FreeXid(xid); - return rc; + goto cifs_create_out; } if (oplockEnabled) @@ -365,9 +364,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); if (buf == NULL) { - kfree(full_path); - FreeXid(xid); - return -ENOMEM; + rc = -ENOMEM; + goto cifs_create_out; } /* @@ -496,6 +494,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, struct cifsTconInfo *pTcon; char *full_path = NULL; struct inode *newinode = NULL; + int oplock = 0; + u16 fileHandle; + FILE_ALL_INFO *buf = NULL; + unsigned int bytes_written; + struct win_dev *pdev; if (!old_valid_dev(device_number)) return -EINVAL; @@ -506,9 +509,12 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, pTcon = cifs_sb->tcon; full_path = build_path_from_dentry(direntry); - if (full_path == NULL) + if (full_path == NULL) { rc = -ENOMEM; - else if (pTcon->unix_ext) { + goto mknod_out; + } + + if (pTcon->unix_ext) { struct cifs_unix_set_info_args args = { .mode = mode & ~current_umask(), .ctime = NO_CHANGE_64, @@ -527,87 +533,78 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) + goto mknod_out; - if (!rc) { - rc = cifs_get_inode_info_unix(&newinode, full_path, + rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb, xid); - if (pTcon->nocase) - direntry->d_op = &cifs_ci_dentry_ops; - else - direntry->d_op = &cifs_dentry_ops; - if (rc == 0) - d_instantiate(direntry, newinode); - } - } else { - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { - int oplock = 0; - u16 fileHandle; - FILE_ALL_INFO *buf; + if (pTcon->nocase) + direntry->d_op = &cifs_ci_dentry_ops; + else + direntry->d_op = &cifs_dentry_ops; - cFYI(1, "sfu compat create special file"); + if (rc == 0) + d_instantiate(direntry, newinode); + goto mknod_out; + } - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (buf == NULL) { - kfree(full_path); - rc = -ENOMEM; - FreeXid(xid); - return rc; - } + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) + goto mknod_out; - rc = CIFSSMBOpen(xid, pTcon, full_path, - FILE_CREATE, /* fail if exists */ - GENERIC_WRITE /* BB would - WRITE_OWNER | WRITE_DAC be better? */, - /* Create a file and set the - file attribute to SYSTEM */ - CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, - &fileHandle, &oplock, buf, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - - /* BB FIXME - add handling for backlevel servers - which need legacy open and check for all - calls to SMBOpen for fallback to SMBLeagcyOpen */ - if (!rc) { - /* BB Do not bother to decode buf since no - local inode yet to put timestamps in, - but we can reuse it safely */ - unsigned int bytes_written; - struct win_dev *pdev; - pdev = (struct win_dev *)buf; - if (S_ISCHR(mode)) { - memcpy(pdev->type, "IntxCHR", 8); - pdev->major = - cpu_to_le64(MAJOR(device_number)); - pdev->minor = - cpu_to_le64(MINOR(device_number)); - rc = CIFSSMBWrite(xid, pTcon, - fileHandle, - sizeof(struct win_dev), - 0, &bytes_written, (char *)pdev, - NULL, 0); - } else if (S_ISBLK(mode)) { - memcpy(pdev->type, "IntxBLK", 8); - pdev->major = - cpu_to_le64(MAJOR(device_number)); - pdev->minor = - cpu_to_le64(MINOR(device_number)); - rc = CIFSSMBWrite(xid, pTcon, - fileHandle, - sizeof(struct win_dev), - 0, &bytes_written, (char *)pdev, - NULL, 0); - } /* else if(S_ISFIFO */ - CIFSSMBClose(xid, pTcon, fileHandle); - d_drop(direntry); - } - kfree(buf); - /* add code here to set EAs */ - } + + cFYI(1, "sfu compat create special file"); + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (buf == NULL) { + kfree(full_path); + rc = -ENOMEM; + FreeXid(xid); + return rc; } + /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */ + rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE, + GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, + &fileHandle, &oplock, buf, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) + goto mknod_out; + + /* BB Do not bother to decode buf since no local inode yet to put + * timestamps in, but we can reuse it safely */ + + pdev = (struct win_dev *)buf; + if (S_ISCHR(mode)) { + memcpy(pdev->type, "IntxCHR", 8); + pdev->major = + cpu_to_le64(MAJOR(device_number)); + pdev->minor = + cpu_to_le64(MINOR(device_number)); + rc = CIFSSMBWrite(xid, pTcon, + fileHandle, + sizeof(struct win_dev), + 0, &bytes_written, (char *)pdev, + NULL, 0); + } else if (S_ISBLK(mode)) { + memcpy(pdev->type, "IntxBLK", 8); + pdev->major = + cpu_to_le64(MAJOR(device_number)); + pdev->minor = + cpu_to_le64(MINOR(device_number)); + rc = CIFSSMBWrite(xid, pTcon, + fileHandle, + sizeof(struct win_dev), + 0, &bytes_written, (char *)pdev, + NULL, 0); + } /* else if (S_ISFIFO) */ + CIFSSMBClose(xid, pTcon, fileHandle); + d_drop(direntry); + + /* FIXME: add code here to set EAs */ + +mknod_out: kfree(full_path); + kfree(buf); FreeXid(xid); return rc; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index db11fde..de748c6 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -242,8 +242,7 @@ int cifs_open(struct inode *inode, struct file *file) full_path = build_path_from_dentry(file->f_path.dentry); if (full_path == NULL) { rc = -ENOMEM; - FreeXid(xid); - return rc; + goto out; } cFYI(1, "inode = 0x%p file flags are 0x%x for %s", diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 4bc47e5..93f77d4 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -834,7 +834,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) xid, NULL); if (!inode) - return ERR_PTR(-ENOMEM); + return ERR_PTR(rc); #ifdef CONFIG_CIFS_FSCACHE /* populate tcon->resource_id */ @@ -1462,29 +1462,18 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry, { char *fromName = NULL; char *toName = NULL; - struct cifs_sb_info *cifs_sb_source; - struct cifs_sb_info *cifs_sb_target; + struct cifs_sb_info *cifs_sb; struct cifsTconInfo *tcon; FILE_UNIX_BASIC_INFO *info_buf_source = NULL; FILE_UNIX_BASIC_INFO *info_buf_target; int xid, rc, tmprc; - cifs_sb_target = CIFS_SB(target_dir->i_sb); - cifs_sb_source = CIFS_SB(source_dir->i_sb); - tcon = cifs_sb_source->tcon; + cifs_sb = CIFS_SB(source_dir->i_sb); + tcon = cifs_sb->tcon; xid = GetXid(); /* - * BB: this might be allowed if same server, but different share. - * Consider adding support for this - */ - if (tcon != cifs_sb_target->tcon) { - rc = -EXDEV; - goto cifs_rename_exit; - } - - /* * we already have the rename sem so we do not need to * grab it again here to protect the path integrity */ @@ -1519,17 +1508,16 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry, info_buf_target = info_buf_source + 1; tmprc = CIFSSMBUnixQPathInfo(xid, tcon, fromName, info_buf_source, - cifs_sb_source->local_nls, - cifs_sb_source->mnt_cifs_flags & + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (tmprc != 0) goto unlink_target; - tmprc = CIFSSMBUnixQPathInfo(xid, tcon, - toName, info_buf_target, - cifs_sb_target->local_nls, - /* remap based on source sb */ - cifs_sb_source->mnt_cifs_flags & + tmprc = CIFSSMBUnixQPathInfo(xid, tcon, toName, + info_buf_target, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (tmprc == 0 && (info_buf_source->UniqueId == diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index f978511..9aad47a 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -206,26 +206,30 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len) } int -cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, - const unsigned short int port) +cifs_set_port(struct sockaddr *addr, const unsigned short int port) { - if (!cifs_convert_address(dst, src, len)) - return 0; - - switch (dst->sa_family) { + switch (addr->sa_family) { case AF_INET: - ((struct sockaddr_in *)dst)->sin_port = htons(port); + ((struct sockaddr_in *)addr)->sin_port = htons(port); break; case AF_INET6: - ((struct sockaddr_in6 *)dst)->sin6_port = htons(port); + ((struct sockaddr_in6 *)addr)->sin6_port = htons(port); break; default: return 0; } - return 1; } +int +cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, + const unsigned short int port) +{ + if (!cifs_convert_address(dst, src, len)) + return 0; + return cifs_set_port(dst, port); +} + /***************************************************************************** convert a NT status code to a dos class/code *****************************************************************************/ diff --git a/fs/direct-io.c b/fs/direct-io.c index 51f270b..48d74c7 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -634,7 +634,7 @@ static int dio_send_cur_page(struct dio *dio) int ret = 0; if (dio->bio) { - loff_t cur_offset = dio->block_in_file << dio->blkbits; + loff_t cur_offset = dio->cur_page_fs_offset; loff_t bio_next_offset = dio->logical_offset_in_bio + dio->bio->bi_size; @@ -659,7 +659,7 @@ static int dio_send_cur_page(struct dio *dio) * Submit now if the underlying fs is about to perform a * metadata read */ - if (dio->boundary) + else if (dio->boundary) dio_bio_submit(dio); } diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index a2e3b56..cbadc1b 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1793,7 +1793,7 @@ struct kmem_cache *ecryptfs_key_tfm_cache; static struct list_head key_tfm_list; struct mutex key_tfm_list_mutex; -int ecryptfs_init_crypto(void) +int __init ecryptfs_init_crypto(void) { mutex_init(&key_tfm_list_mutex); INIT_LIST_HEAD(&key_tfm_list); @@ -2169,7 +2169,6 @@ int ecryptfs_encrypt_and_encode_filename( (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE + encoded_name_no_prefix_size); (*encoded_name)[(*encoded_name_size)] = '\0'; - (*encoded_name_size)++; } else { rc = -EOPNOTSUPP; } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 6c55113..3fbc942 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -349,7 +349,7 @@ out: /** * ecryptfs_new_lower_dentry - * @ename: The name of the new dentry. + * @name: The name of the new dentry. * @lower_dir_dentry: Parent directory of the new dentry. * @nd: nameidata from last lookup. * @@ -386,20 +386,19 @@ ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry, * ecryptfs_lookup_one_lower * @ecryptfs_dentry: The eCryptfs dentry that we are looking up * @lower_dir_dentry: lower parent directory + * @name: lower file name * * Get the lower dentry from vfs. If lower dentry does not exist yet, * create it. */ static struct dentry * ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry, - struct dentry *lower_dir_dentry) + struct dentry *lower_dir_dentry, struct qstr *name) { struct nameidata nd; struct vfsmount *lower_mnt; - struct qstr *name; int err; - name = &ecryptfs_dentry->d_name; lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt( ecryptfs_dentry->d_parent)); err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name , 0, &nd); @@ -434,6 +433,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, size_t encrypted_and_encoded_name_size; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; struct dentry *lower_dir_dentry, *lower_dentry; + struct qstr lower_name; int rc = 0; ecryptfs_dentry->d_op = &ecryptfs_dops; @@ -444,9 +444,17 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, goto out_d_drop; } lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); - + lower_name.name = ecryptfs_dentry->d_name.name; + lower_name.len = ecryptfs_dentry->d_name.len; + lower_name.hash = ecryptfs_dentry->d_name.hash; + if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { + rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, + &lower_name); + if (rc < 0) + goto out_d_drop; + } lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry, - lower_dir_dentry); + lower_dir_dentry, &lower_name); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned " @@ -471,8 +479,17 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, "filename; rc = [%d]\n", __func__, rc); goto out_d_drop; } + lower_name.name = encrypted_and_encoded_name; + lower_name.len = encrypted_and_encoded_name_size; + lower_name.hash = full_name_hash(lower_name.name, lower_name.len); + if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { + rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, + &lower_name); + if (rc < 0) + goto out_d_drop; + } lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry, - lower_dir_dentry); + lower_dir_dentry, &lower_name); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned " diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 89c5476..73811cf 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -515,6 +515,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, if (!s) { printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + rc = -ENOMEM; goto out; } s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; @@ -806,6 +807,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, if (!s) { printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + rc = -ENOMEM; goto out; } s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index d8c3a37..0851ab6 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -86,7 +86,7 @@ out: return 0; } -int ecryptfs_init_kthread(void) +int __init ecryptfs_init_kthread(void) { int rc = 0; diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index bcb68c0..ab22480 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -473,7 +473,7 @@ sleep: return rc; } -int ecryptfs_init_messaging(void) +int __init ecryptfs_init_messaging(void) { int i; int rc = 0; diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 3745f61..00208c3 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -500,7 +500,7 @@ static struct miscdevice ecryptfs_miscdev = { * * Returns zero on success; non-zero otherwise */ -int ecryptfs_init_ecryptfs_miscdev(void) +int __init ecryptfs_init_ecryptfs_miscdev(void) { int rc; @@ -376,6 +376,9 @@ static int count(const char __user * const __user * argv, int max) argv++; if (i++ >= max) return -E2BIG; + + if (fatal_signal_pending(current)) + return -ERESTARTNOHAND; cond_resched(); } } @@ -419,6 +422,12 @@ static int copy_strings(int argc, const char __user *const __user *argv, while (len > 0) { int offset, bytes_to_copy; + if (fatal_signal_pending(current)) { + ret = -ERESTARTNOHAND; + goto out; + } + cond_resched(); + offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; @@ -594,6 +603,11 @@ int setup_arg_pages(struct linux_binprm *bprm, #else stack_top = arch_align_stack(stack_top); stack_top = PAGE_ALIGN(stack_top); + + if (unlikely(stack_top < mmap_min_addr) || + unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) + return -ENOMEM; + stack_shift = vma->vm_end - stack_top; bprm->p -= stack_shift; @@ -769,11 +769,15 @@ EXPORT_SYMBOL(kill_fasync); static int __init fcntl_init(void) { - /* please add new bits here to ensure allocation uniqueness */ - BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( + /* + * Please add new bits here to ensure allocation uniqueness. + * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY + * is defined as O_NONBLOCK on some platforms and not on others. + */ + BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | - O_TRUNC | O_APPEND | O_NONBLOCK | + O_TRUNC | O_APPEND | /* O_NONBLOCK | */ __O_SYNC | O_DSYNC | FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | O_NOATIME | O_CLOEXEC | diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7d9d06b..81e086d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -808,7 +808,7 @@ int bdi_writeback_thread(void *data) wb->last_active = jiffies; set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&bdi->work_list)) { + if (!list_empty(&bdi->work_list) || kthread_should_stop()) { __set_current_state(TASK_RUNNING); continue; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 69ad053..d367af1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -276,7 +276,7 @@ static void flush_bg_queue(struct fuse_conn *fc) * Called with fc->lock, unlocks it */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) -__releases(&fc->lock) +__releases(fc->lock) { void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; @@ -306,8 +306,8 @@ __releases(&fc->lock) static void wait_answer_interruptible(struct fuse_conn *fc, struct fuse_req *req) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { if (signal_pending(current)) return; @@ -325,8 +325,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req) } static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { if (!fc->no_interrupt) { /* Any signal may interrupt this */ @@ -905,8 +905,8 @@ static int request_pending(struct fuse_conn *fc) /* Wait until a request is available on the pending list */ static void request_wait(struct fuse_conn *fc) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { DECLARE_WAITQUEUE(wait, current); @@ -934,7 +934,7 @@ __acquires(&fc->lock) */ static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs, size_t nbytes, struct fuse_req *req) -__releases(&fc->lock) +__releases(fc->lock) { struct fuse_in_header ih; struct fuse_interrupt_in arg; @@ -1720,8 +1720,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) * This function releases and reacquires fc->lock */ static void end_requests(struct fuse_conn *fc, struct list_head *head) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { while (!list_empty(head)) { struct fuse_req *req; @@ -1744,8 +1744,8 @@ __acquires(&fc->lock) * locked). */ static void end_io_requests(struct fuse_conn *fc) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { while (!list_empty(&fc->io)) { struct fuse_req *req = @@ -1769,6 +1769,16 @@ __acquires(&fc->lock) } } +static void end_queued_requests(struct fuse_conn *fc) +__releases(fc->lock) +__acquires(fc->lock) +{ + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + end_requests(fc, &fc->pending); + end_requests(fc, &fc->processing); +} + /* * Abort all requests. * @@ -1795,8 +1805,7 @@ void fuse_abort_conn(struct fuse_conn *fc) fc->connected = 0; fc->blocked = 0; end_io_requests(fc); - end_requests(fc, &fc->pending); - end_requests(fc, &fc->processing); + end_queued_requests(fc); wake_up_all(&fc->waitq); wake_up_all(&fc->blocked_waitq); kill_fasync(&fc->fasync, SIGIO, POLL_IN); @@ -1811,8 +1820,9 @@ int fuse_dev_release(struct inode *inode, struct file *file) if (fc) { spin_lock(&fc->lock); fc->connected = 0; - end_requests(fc, &fc->pending); - end_requests(fc, &fc->processing); + fc->blocked = 0; + end_queued_requests(fc); + wake_up_all(&fc->blocked_waitq); spin_unlock(&fc->lock); fuse_conn_put(fc); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 147c1f7..c822458 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1144,8 +1144,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) /* Called under fc->lock, may release and reacquire it */ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { struct fuse_inode *fi = get_fuse_inode(req->inode); loff_t size = i_size_read(req->inode); @@ -1183,8 +1183,8 @@ __acquires(&fc->lock) * Called with fc->lock */ void fuse_flush_writepages(struct inode *inode) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); diff --git a/fs/minix/namei.c b/fs/minix/namei.c index e20ee85..f3f3578 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -115,7 +115,7 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode) inode_inc_link_count(dir); - inode = minix_new_inode(dir, mode, &err); + inode = minix_new_inode(dir, S_IFDIR | mode, &err); if (!inode) goto out_dir; diff --git a/fs/namespace.c b/fs/namespace.c index de402eb..a72eaab 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1484,13 +1484,30 @@ out_unlock: } /* + * Sanity check the flags to change_mnt_propagation. + */ + +static int flags_to_propagation_type(int flags) +{ + int type = flags & ~MS_REC; + + /* Fail if any non-propagation flags are set */ + if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) + return 0; + /* Only one propagation flag should be set */ + if (!is_power_of_2(type)) + return 0; + return type; +} + +/* * recursively change the type of the mountpoint. */ static int do_change_type(struct path *path, int flag) { struct vfsmount *m, *mnt = path->mnt; int recurse = flag & MS_REC; - int type = flag & ~MS_REC; + int type; int err = 0; if (!capable(CAP_SYS_ADMIN)) @@ -1499,6 +1516,10 @@ static int do_change_type(struct path *path, int flag) if (path->dentry != path->mnt->mnt_root) return -EINVAL; + type = flags_to_propagation_type(flag); + if (!type) + return -EINVAL; + down_write(&namespace_sem); if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2e73571..cf0d2ff 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -440,7 +440,7 @@ test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) { static int nfs4_access_to_omode(u32 access) { - switch (access) { + switch (access & NFS4_SHARE_ACCESS_BOTH) { case NFS4_SHARE_ACCESS_READ: return O_RDONLY; case NFS4_SHARE_ACCESS_WRITE: @@ -2450,14 +2450,13 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, static __be32 nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) { - u32 op_share_access, new_access; + u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK; + bool new_access; __be32 status; - set_access(&new_access, stp->st_access_bmap); - new_access = (~new_access) & open->op_share_access & ~NFS4_SHARE_WANT_MASK; - + new_access = !test_bit(op_share_access, &stp->st_access_bmap); if (new_access) { - status = nfs4_get_vfs_file(rqstp, fp, cur_fh, new_access); + status = nfs4_get_vfs_file(rqstp, fp, cur_fh, op_share_access); if (status) return status; } @@ -2470,7 +2469,6 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c return status; } /* remember the open */ - op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK; __set_bit(op_share_access, &stp->st_access_bmap); __set_bit(open->op_share_deny, &stp->st_deny_bmap); @@ -2983,7 +2981,6 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, *filpp = find_readable_file(stp->st_file); else *filpp = find_writeable_file(stp->st_file); - BUG_ON(!*filpp); /* assured by check_openmode */ } } status = nfs_ok; @@ -3561,7 +3558,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfs4_stateowner *open_sop = NULL; struct nfs4_stateowner *lock_sop = NULL; struct nfs4_stateid *lock_stp; - struct file *filp; + struct nfs4_file *fp; + struct file *filp = NULL; struct file_lock file_lock; struct file_lock conflock; __be32 status = 0; @@ -3591,7 +3589,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * lock stateid. */ struct nfs4_stateid *open_stp = NULL; - struct nfs4_file *fp; status = nfserr_stale_clientid; if (!nfsd4_has_session(cstate) && @@ -3634,6 +3631,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; lock_sop = lock->lk_replay_owner; + fp = lock_stp->st_file; } /* lock->lk_replay_owner and lock_stp have been created or found */ @@ -3648,13 +3646,19 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (lock->lk_type) { case NFS4_READ_LT: case NFS4_READW_LT: - filp = find_readable_file(lock_stp->st_file); + if (find_readable_file(lock_stp->st_file)) { + nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_READ); + filp = find_readable_file(lock_stp->st_file); + } file_lock.fl_type = F_RDLCK; cmd = F_SETLK; break; case NFS4_WRITE_LT: case NFS4_WRITEW_LT: - filp = find_writeable_file(lock_stp->st_file); + if (find_writeable_file(lock_stp->st_file)) { + nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_WRITE); + filp = find_writeable_file(lock_stp->st_file); + } file_lock.fl_type = F_WRLCK; cmd = F_SETLK; break; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 7731a75..322518c 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -363,23 +363,23 @@ struct nfs4_file { * at all? */ static inline struct file *find_writeable_file(struct nfs4_file *f) { - if (f->fi_fds[O_RDWR]) - return f->fi_fds[O_RDWR]; - return f->fi_fds[O_WRONLY]; + if (f->fi_fds[O_WRONLY]) + return f->fi_fds[O_WRONLY]; + return f->fi_fds[O_RDWR]; } static inline struct file *find_readable_file(struct nfs4_file *f) { - if (f->fi_fds[O_RDWR]) - return f->fi_fds[O_RDWR]; - return f->fi_fds[O_RDONLY]; + if (f->fi_fds[O_RDONLY]) + return f->fi_fds[O_RDONLY]; + return f->fi_fds[O_RDWR]; } static inline struct file *find_any_file(struct nfs4_file *f) { if (f->fi_fds[O_RDWR]) return f->fi_fds[O_RDWR]; - else if (f->fi_fds[O_RDWR]) + else if (f->fi_fds[O_WRONLY]) return f->fi_fds[O_WRONLY]; else return f->fi_fds[O_RDONLY]; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 96360a8..661a6cf 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -2033,15 +2033,17 @@ out: __be32 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access) { - struct path path = { - .mnt = fhp->fh_export->ex_path.mnt, - .dentry = fhp->fh_dentry, - }; __be32 err; err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access); - if (!err && vfs_statfs(&path, stat)) - err = nfserr_io; + if (!err) { + struct path path = { + .mnt = fhp->fh_export->ex_path.mnt, + .dentry = fhp->fh_dentry, + }; + if (vfs_statfs(&path, stat)) + err = nfserr_io; + } return err; } diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 4317f17..ba7c10c 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -446,6 +446,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) nilfs_mdt_destroy(nilfs->ns_cpfile); nilfs_mdt_destroy(nilfs->ns_sufile); nilfs_mdt_destroy(nilfs->ns_dat); + nilfs_mdt_destroy(nilfs->ns_gc_dat); failed: nilfs_clear_recovery_info(&ri); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 756566f..85366c7 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -165,9 +165,6 @@ static bool fanotify_should_send_event(struct fsnotify_group *group, "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, inode_mark, vfsmnt_mark, event_mask, data, data_type); - pr_debug("%s: group=%p vfsmount_mark=%p inode_mark=%p mask=%x\n", - __func__, group, vfsmnt_mark, inode_mark, event_mask); - /* sorry, fanotify only gives a damn about files and dirs */ if (!S_ISREG(to_tell->i_mode) && !S_ISDIR(to_tell->i_mode)) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 032b837..5ed8e58 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -195,6 +195,14 @@ static int prepare_for_access_response(struct fsnotify_group *group, re->fd = fd; mutex_lock(&group->fanotify_data.access_mutex); + + if (group->fanotify_data.bypass_perm) { + mutex_unlock(&group->fanotify_data.access_mutex); + kmem_cache_free(fanotify_response_event_cache, re); + event->response = FAN_ALLOW; + return 0; + } + list_add_tail(&re->list, &group->fanotify_data.access_list); mutex_unlock(&group->fanotify_data.access_mutex); @@ -364,9 +372,28 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t static int fanotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; + struct fanotify_response_event *re, *lre; pr_debug("%s: file=%p group=%p\n", __func__, file, group); +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + mutex_lock(&group->fanotify_data.access_mutex); + + group->fanotify_data.bypass_perm = true; + + list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) { + pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group, + re, re->event); + + list_del_init(&re->list); + re->event->response = FAN_ALLOW; + + kmem_cache_free(fanotify_response_event_cache, re); + } + mutex_unlock(&group->fanotify_data.access_mutex); + + wake_up(&group->fanotify_data.access_waitq); +#endif /* matches the fanotify_init->fsnotify_alloc_group */ fsnotify_put_group(group); @@ -614,7 +641,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) __func__, flags, event_f_flags); if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; if (flags & ~FAN_ALL_INIT_FLAGS) return -EINVAL; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 3970392..3680242 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -148,13 +148,14 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt, const unsigned char *file_name, struct fsnotify_event **event) { - struct fsnotify_group *group = inode_mark->group; - __u32 inode_test_mask = (mask & ~FS_EVENT_ON_CHILD); - __u32 vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD); + struct fsnotify_group *group = NULL; + __u32 inode_test_mask = 0; + __u32 vfsmount_test_mask = 0; - pr_debug("%s: group=%p to_tell=%p mnt=%p mark=%p mask=%x data=%p" - " data_is=%d cookie=%d event=%p\n", __func__, group, to_tell, - mnt, inode_mark, mask, data, data_is, cookie, *event); + if (unlikely(!inode_mark && !vfsmount_mark)) { + BUG(); + return 0; + } /* clear ignored on inode modification */ if (mask & FS_MODIFY) { @@ -168,18 +169,29 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt, /* does the inode mark tell us to do something? */ if (inode_mark) { + group = inode_mark->group; + inode_test_mask = (mask & ~FS_EVENT_ON_CHILD); inode_test_mask &= inode_mark->mask; inode_test_mask &= ~inode_mark->ignored_mask; } /* does the vfsmount_mark tell us to do something? */ if (vfsmount_mark) { + vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD); + group = vfsmount_mark->group; vfsmount_test_mask &= vfsmount_mark->mask; vfsmount_test_mask &= ~vfsmount_mark->ignored_mask; if (inode_mark) vfsmount_test_mask &= ~inode_mark->ignored_mask; } + pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p" + " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" + " data=%p data_is=%d cookie=%d event=%p\n", + __func__, group, to_tell, mnt, mask, inode_mark, + inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, + data_is, cookie, *event); + if (!inode_test_mask && !vfsmount_test_mask) return 0; @@ -207,13 +219,12 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt, int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const unsigned char *file_name, u32 cookie) { - struct hlist_node *inode_node, *vfsmount_node; + struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; struct fsnotify_group *inode_group, *vfsmount_group; struct fsnotify_event *event = NULL; struct vfsmount *mnt; int idx, ret = 0; - bool used_inode = false, used_vfsmount = false; /* global tests shouldn't care about events on child only the specific event */ __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); @@ -238,57 +249,50 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, (test_mask & to_tell->i_fsnotify_mask)) inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, &fsnotify_mark_srcu); - else - inode_node = NULL; - if (mnt) { - if ((mask & FS_MODIFY) || - (test_mask & mnt->mnt_fsnotify_mask)) - vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first, - &fsnotify_mark_srcu); - else - vfsmount_node = NULL; - } else { - mnt = NULL; - vfsmount_node = NULL; + if (mnt && ((mask & FS_MODIFY) || + (test_mask & mnt->mnt_fsnotify_mask))) { + vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first, + &fsnotify_mark_srcu); + inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, + &fsnotify_mark_srcu); } while (inode_node || vfsmount_node) { + inode_group = vfsmount_group = NULL; + if (inode_node) { inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), struct fsnotify_mark, i.i_list); inode_group = inode_mark->group; - } else - inode_group = (void *)-1; + } if (vfsmount_node) { vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu), struct fsnotify_mark, m.m_list); vfsmount_group = vfsmount_mark->group; - } else - vfsmount_group = (void *)-1; + } - if (inode_group < vfsmount_group) { + if (inode_group > vfsmount_group) { /* handle inode */ send_to_group(to_tell, NULL, inode_mark, NULL, mask, data, data_is, cookie, file_name, &event); - used_inode = true; - } else if (vfsmount_group < inode_group) { + /* we didn't use the vfsmount_mark */ + vfsmount_group = NULL; + } else if (vfsmount_group > inode_group) { send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data, data_is, cookie, file_name, &event); - used_vfsmount = true; + inode_group = NULL; } else { send_to_group(to_tell, mnt, inode_mark, vfsmount_mark, mask, data, data_is, cookie, file_name, &event); - used_vfsmount = true; - used_inode = true; } - if (used_inode) + if (inode_group) inode_node = srcu_dereference(inode_node->next, &fsnotify_mark_srcu); - if (used_vfsmount) + if (vfsmount_group) vfsmount_node = srcu_dereference(vfsmount_node->next, &fsnotify_mark_srcu); } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 215e12c..592fae5 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6672,7 +6672,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, last_page_bytes = PAGE_ALIGN(end); index = start >> PAGE_CACHE_SHIFT; do { - pages[numpages] = grab_cache_page(mapping, index); + pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS); if (!pages[numpages]) { ret = -ENOMEM; mlog_errno(ret); diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index ec6d123..c7ee03c 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -439,7 +439,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize, ocfs2_blockcheck_inc_failure(stats); mlog(ML_ERROR, - "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", + "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n", (unsigned int)check.bc_crc32e, (unsigned int)crc); /* Ok, try ECC fixups */ @@ -453,7 +453,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize, goto out; } - mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", + mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n", (unsigned int)check.bc_crc32e, (unsigned int)crc); rc = -EIO; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 81296b4..9a03c15 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -36,6 +36,7 @@ #include <linux/writeback.h> #include <linux/falloc.h> #include <linux/quotaops.h> +#include <linux/blkdev.h> #define MLOG_MASK_PREFIX ML_INODE #include <cluster/masklog.h> @@ -190,8 +191,16 @@ static int ocfs2_sync_file(struct file *file, int datasync) if (err) goto bail; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { + /* + * We still have to flush drive's caches to get data to the + * platter + */ + if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, + NULL, BLKDEV_IFL_WAIT); goto bail; + } journal = osb->journal->j_journal; err = jbd2_journal_force_commit(journal); @@ -774,7 +783,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); BUG_ON(abs_from & (inode->i_blkbits - 1)); - page = grab_cache_page(mapping, index); + page = find_or_create_page(mapping, index, GFP_NOFS); if (!page) { ret = -ENOMEM; mlog_errno(ret); @@ -2329,7 +2338,7 @@ out_dio: BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || - ((file->f_flags & O_DIRECT) && has_refcount)) { + ((file->f_flags & O_DIRECT) && !direct_io)) { ret = filemap_fdatawrite_range(file->f_mapping, pos, pos + count - 1); if (ret < 0) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 0492464..eece3e0 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -488,7 +488,11 @@ static int ocfs2_read_locked_inode(struct inode *inode, OCFS2_BH_IGNORE_CACHE); } else { status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); - if (!status) + /* + * If buffer is in jbd, then its checksum may not have been + * computed as yet. + */ + if (!status && !buffer_jbd(bh)) status = ocfs2_validate_inode_block(osb->sb, bh); } if (status < 0) { diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index af2b8fe..4c18f4a 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -74,9 +74,11 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, /* * Another node might have truncated while we were waiting on * cluster locks. + * We don't check size == 0 before the shift. This is borrowed + * from do_generic_file_read. */ - last_index = size >> PAGE_CACHE_SHIFT; - if (page->index > last_index) { + last_index = (size - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!size || page->index > last_index)) { ret = -EINVAL; goto out; } @@ -107,7 +109,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, * because the "write" would invalidate their data. */ if (page->index == last_index) - len = size & ~PAGE_CACHE_MASK; + len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, &fsdata, di_bh, page); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index f171b51..a00dda2 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -472,32 +472,23 @@ leave: return status; } -static int ocfs2_mknod_locked(struct ocfs2_super *osb, - struct inode *dir, - struct inode *inode, - dev_t dev, - struct buffer_head **new_fe_bh, - struct buffer_head *parent_fe_bh, - handle_t *handle, - struct ocfs2_alloc_context *inode_ac) +static int __ocfs2_mknod_locked(struct inode *dir, + struct inode *inode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *inode_ac, + u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit) { int status = 0; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_dinode *fe = NULL; struct ocfs2_extent_list *fel; - u64 suballoc_loc, fe_blkno = 0; - u16 suballoc_bit; u16 feat; *new_fe_bh = NULL; - status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh, - inode_ac, &suballoc_loc, - &suballoc_bit, &fe_blkno); - if (status < 0) { - mlog_errno(status); - goto leave; - } - /* populate as many fields early on as possible - many of * these are used by the support functions here and in * callers. */ @@ -591,6 +582,34 @@ leave: return status; } +static int ocfs2_mknod_locked(struct ocfs2_super *osb, + struct inode *dir, + struct inode *inode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *inode_ac) +{ + int status = 0; + u64 suballoc_loc, fe_blkno = 0; + u16 suballoc_bit; + + *new_fe_bh = NULL; + + status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh, + inode_ac, &suballoc_loc, + &suballoc_bit, &fe_blkno); + if (status < 0) { + mlog_errno(status); + return status; + } + + return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, + parent_fe_bh, handle, inode_ac, + fe_blkno, suballoc_loc, suballoc_bit); +} + static int ocfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) @@ -1852,61 +1871,117 @@ bail: return status; } -static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, - struct inode **ret_orphan_dir, - u64 blkno, - char *name, - struct ocfs2_dir_lookup_result *lookup) +static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb, + struct inode **ret_orphan_dir, + struct buffer_head **ret_orphan_dir_bh) { struct inode *orphan_dir_inode; struct buffer_head *orphan_dir_bh = NULL; - int status = 0; - - status = ocfs2_blkno_stringify(blkno, name); - if (status < 0) { - mlog_errno(status); - return status; - } + int ret = 0; orphan_dir_inode = ocfs2_get_system_file_inode(osb, ORPHAN_DIR_SYSTEM_INODE, osb->slot_num); if (!orphan_dir_inode) { - status = -ENOENT; - mlog_errno(status); - return status; + ret = -ENOENT; + mlog_errno(ret); + return ret; } mutex_lock(&orphan_dir_inode->i_mutex); - status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); - if (status < 0) { - mlog_errno(status); - goto leave; + ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (ret < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + + mlog_errno(ret); + return ret; } - status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, - orphan_dir_bh, name, - OCFS2_ORPHAN_NAMELEN, lookup); - if (status < 0) { - ocfs2_inode_unlock(orphan_dir_inode, 1); + *ret_orphan_dir = orphan_dir_inode; + *ret_orphan_dir_bh = orphan_dir_bh; - mlog_errno(status); - goto leave; + return 0; +} + +static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, + struct buffer_head *orphan_dir_bh, + u64 blkno, + char *name, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb); + + ret = ocfs2_blkno_stringify(blkno, name); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, + orphan_dir_bh, name, + OCFS2_ORPHAN_NAMELEN, lookup); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + return 0; +} + +/** + * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for + * insertion of an orphan. + * @osb: ocfs2 file system + * @ret_orphan_dir: Orphan dir inode - returned locked! + * @blkno: Actual block number of the inode to be inserted into orphan dir. + * @lookup: dir lookup result, to be passed back into functions like + * ocfs2_orphan_add + * + * Returns zero on success and the ret_orphan_dir, name and lookup + * fields will be populated. + * + * Returns non-zero on failure. + */ +static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, + struct inode **ret_orphan_dir, + u64 blkno, + char *name, + struct ocfs2_dir_lookup_result *lookup) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + int ret = 0; + + ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode, + &orphan_dir_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh, + blkno, name, lookup); + if (ret < 0) { + mlog_errno(ret); + goto out; } *ret_orphan_dir = orphan_dir_inode; -leave: - if (status) { +out: + brelse(orphan_dir_bh); + + if (ret) { + ocfs2_inode_unlock(orphan_dir_inode, 1); mutex_unlock(&orphan_dir_inode->i_mutex); iput(orphan_dir_inode); } - brelse(orphan_dir_bh); - - mlog_exit(status); - return status; + mlog_exit(ret); + return ret; } static int ocfs2_orphan_add(struct ocfs2_super *osb, @@ -2053,6 +2128,99 @@ leave: return status; } +/** + * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to recieve a newly + * allocated file. This is different from the typical 'add to orphan dir' + * operation in that the inode does not yet exist. This is a problem because + * the orphan dir stringifies the inode block number to come up with it's + * dirent. Obviously if the inode does not yet exist we have a chicken and egg + * problem. This function works around it by calling deeper into the orphan + * and suballoc code than other callers. Use this only by necessity. + * @dir: The directory which this inode will ultimately wind up under - not the + * orphan dir! + * @dir_bh: buffer_head the @dir inode block + * @orphan_name: string of length (CFS2_ORPHAN_NAMELEN + 1). Will be filled + * with the string to be used for orphan dirent. Pass back to the orphan dir + * code. + * @ret_orphan_dir: orphan dir inode returned to be passed back into orphan + * dir code. + * @ret_di_blkno: block number where the new inode will be allocated. + * @orphan_insert: Dir insert context to be passed back into orphan dir code. + * @ret_inode_ac: Inode alloc context to be passed back to the allocator. + * + * Returns zero on success and the ret_orphan_dir, name and lookup + * fields will be populated. + * + * Returns non-zero on failure. + */ +static int ocfs2_prep_new_orphaned_file(struct inode *dir, + struct buffer_head *dir_bh, + char *orphan_name, + struct inode **ret_orphan_dir, + u64 *ret_di_blkno, + struct ocfs2_dir_lookup_result *orphan_insert, + struct ocfs2_alloc_context **ret_inode_ac) +{ + int ret; + u64 di_blkno; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct inode *orphan_dir = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + + ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir, &orphan_dir_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + /* reserve an inode spot */ + ret = ocfs2_reserve_new_inode(osb, &inode_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_new_inode_loc(dir, dir_bh, inode_ac, + &di_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh, + di_blkno, orphan_name, orphan_insert); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + +out: + if (ret == 0) { + *ret_orphan_dir = orphan_dir; + *ret_di_blkno = di_blkno; + *ret_inode_ac = inode_ac; + /* + * orphan_name and orphan_insert are already up to + * date via prepare_orphan_dir + */ + } else { + /* Unroll reserve_new_inode* */ + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + + /* Unroll orphan dir locking */ + mutex_unlock(&orphan_dir->i_mutex); + ocfs2_inode_unlock(orphan_dir, 1); + iput(orphan_dir); + } + + brelse(orphan_dir_bh); + + return 0; +} + int ocfs2_create_inode_in_orphan(struct inode *dir, int mode, struct inode **new_inode) @@ -2068,6 +2236,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, struct buffer_head *new_di_bh = NULL; struct ocfs2_alloc_context *inode_ac = NULL; struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + u64 uninitialized_var(di_blkno), suballoc_loc; + u16 suballoc_bit; status = ocfs2_inode_lock(dir, &parent_di_bh, 1); if (status < 0) { @@ -2076,20 +2246,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, return status; } - /* - * We give the orphan dir the root blkno to fake an orphan name, - * and allocate enough space for our insertion. - */ - status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, - osb->root_blkno, - orphan_name, &orphan_insert); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - /* reserve an inode spot */ - status = ocfs2_reserve_new_inode(osb, &inode_ac); + status = ocfs2_prep_new_orphaned_file(dir, parent_di_bh, + orphan_name, &orphan_dir, + &di_blkno, &orphan_insert, &inode_ac); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -2116,17 +2275,20 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, goto leave; did_quota_inode = 1; - inode->i_nlink = 0; - /* do the real work now. */ - status = ocfs2_mknod_locked(osb, dir, inode, - 0, &new_di_bh, parent_di_bh, handle, - inode_ac); + status = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac, + &suballoc_loc, + &suballoc_bit, di_blkno); if (status < 0) { mlog_errno(status); goto leave; } - status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name); + inode->i_nlink = 0; + /* do the real work now. */ + status = __ocfs2_mknod_locked(dir, inode, + 0, &new_di_bh, parent_di_bh, handle, + inode_ac, di_blkno, suballoc_loc, + suballoc_bit); if (status < 0) { mlog_errno(status); goto leave; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 73a11cc..0afeda831 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2960,7 +2960,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, if (map_end & (PAGE_CACHE_SIZE - 1)) to = map_end & (PAGE_CACHE_SIZE - 1); - page = grab_cache_page(mapping, page_index); + page = find_or_create_page(mapping, page_index, GFP_NOFS); /* * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page @@ -3179,7 +3179,8 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb, if (map_end > end) map_end = end; - page = grab_cache_page(context->inode->i_mapping, page_index); + page = find_or_create_page(context->inode->i_mapping, + page_index, GFP_NOFS); BUG_ON(!page); wait_on_page_writeback(page); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index a8e6a95..8a286f5 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -57,11 +57,28 @@ struct ocfs2_suballoc_result { u64 sr_bg_blkno; /* The bg we allocated from. Set to 0 when a block group is contiguous. */ + u64 sr_bg_stable_blkno; /* + * Doesn't change, always + * set to target block + * group descriptor + * block. + */ u64 sr_blkno; /* The first allocated block */ unsigned int sr_bit_offset; /* The bit in the bg */ unsigned int sr_bits; /* How many bits we claimed */ }; +static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res) +{ + if (res->sr_blkno == 0) + return 0; + + if (res->sr_bg_blkno) + return res->sr_bg_blkno; + + return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset); +} + static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); @@ -138,6 +155,10 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) brelse(ac->ac_bh); ac->ac_bh = NULL; ac->ac_resv = NULL; + if (ac->ac_find_loc_priv) { + kfree(ac->ac_find_loc_priv); + ac->ac_find_loc_priv = NULL; + } } void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) @@ -1678,6 +1699,15 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, if (!ret) ocfs2_bg_discontig_fix_result(ac, gd, res); + /* + * sr_bg_blkno might have been changed by + * ocfs2_bg_discontig_fix_result + */ + res->sr_bg_stable_blkno = group_bh->b_blocknr; + + if (ac->ac_find_loc_only) + goto out_loc_only; + ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, res->sr_bits, le16_to_cpu(gd->bg_chain)); @@ -1691,6 +1721,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, if (ret < 0) mlog_errno(ret); +out_loc_only: *bits_left = le16_to_cpu(gd->bg_free_bits_count); out: @@ -1708,7 +1739,6 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, { int status; u16 chain; - u32 tmp_used; u64 next_group; struct inode *alloc_inode = ac->ac_inode; struct buffer_head *group_bh = NULL; @@ -1770,6 +1800,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, if (!status) ocfs2_bg_discontig_fix_result(ac, bg, res); + /* + * sr_bg_blkno might have been changed by + * ocfs2_bg_discontig_fix_result + */ + res->sr_bg_stable_blkno = group_bh->b_blocknr; /* * Keep track of previous block descriptor read. When @@ -1796,22 +1831,17 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, } } - /* Ok, claim our bits now: set the info on dinode, chainlist - * and then the group */ - status = ocfs2_journal_access_di(handle, - INODE_CACHE(alloc_inode), - ac->ac_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { + if (ac->ac_find_loc_only) + goto out_loc_only; + + status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, + ac->ac_bh, res->sr_bits, + chain); + if (status) { mlog_errno(status); goto bail; } - tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); - fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used); - le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits); - ocfs2_journal_dirty(handle, ac->ac_bh); - status = ocfs2_block_group_set_bits(handle, alloc_inode, bg, @@ -1826,6 +1856,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, (unsigned long long)le64_to_cpu(fe->i_blkno)); +out_loc_only: *bits_left = le16_to_cpu(bg->bg_free_bits_count); bail: brelse(group_bh); @@ -1845,6 +1876,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, int status; u16 victim, i; u16 bits_left = 0; + u64 hint = ac->ac_last_group; struct ocfs2_chain_list *cl; struct ocfs2_dinode *fe; @@ -1872,7 +1904,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, goto bail; } - res->sr_bg_blkno = ac->ac_last_group; + res->sr_bg_blkno = hint; if (res->sr_bg_blkno) { /* Attempt to short-circuit the usual search mechanism * by jumping straight to the most recently used @@ -1896,8 +1928,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); - if (!status) + if (!status) { + hint = ocfs2_group_from_res(res); goto set_hint; + } if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; @@ -1920,8 +1954,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, ac->ac_chain = i; status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); - if (!status) + if (!status) { + hint = ocfs2_group_from_res(res); break; + } if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; @@ -1936,7 +1972,7 @@ set_hint: if (bits_left < min_bits) ac->ac_last_group = 0; else - ac->ac_last_group = res->sr_bg_blkno; + ac->ac_last_group = hint; } bail: @@ -2016,6 +2052,136 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir, OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; } +int ocfs2_find_new_inode_loc(struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *fe_blkno) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_suballoc_result *res; + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_given != 0); + BUG_ON(ac->ac_bits_wanted != 1); + BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); + + res = kzalloc(sizeof(*res), GFP_NOFS); + if (res == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); + + /* + * The handle started here is for chain relink. Alternatively, + * we could just disable relink for these calls. + */ + handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + /* + * This will instruct ocfs2_claim_suballoc_bits and + * ocfs2_search_one_group to search but save actual allocation + * for later. + */ + ac->ac_find_loc_only = 1; + + ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ac->ac_find_loc_priv = res; + *fe_blkno = res->sr_blkno; + +out: + if (handle) + ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); + + if (ret) + kfree(res); + + return ret; +} + +int ocfs2_claim_new_inode_at_loc(handle_t *handle, + struct inode *dir, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 di_blkno) +{ + int ret; + u16 chain; + struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv; + struct buffer_head *bg_bh = NULL; + struct ocfs2_group_desc *bg; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data; + + /* + * Since di_blkno is being passed back in, we check for any + * inconsistencies which may have happened between + * calls. These are code bugs as di_blkno is not expected to + * change once returned from ocfs2_find_new_inode_loc() + */ + BUG_ON(res->sr_blkno != di_blkno); + + ret = ocfs2_read_group_descriptor(ac->ac_inode, di, + res->sr_bg_stable_blkno, &bg_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + bg = (struct ocfs2_group_desc *) bg_bh->b_data; + chain = le16_to_cpu(bg->bg_chain); + + ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle, + ac->ac_bh, res->sr_bits, + chain); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_block_group_set_bits(handle, + ac->ac_inode, + bg, + bg_bh, + res->sr_bit_offset, + res->sr_bits); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, + (unsigned long long)di_blkno); + + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); + + BUG_ON(res->sr_bits != 1); + + *suballoc_loc = res->sr_bg_blkno; + *suballoc_bit = res->sr_bit_offset; + ac->ac_bits_given++; + ocfs2_save_inode_ac_group(dir, ac); + +out: + brelse(bg_bh); + + return ret; +} + int ocfs2_claim_new_inode(handle_t *handle, struct inode *dir, struct buffer_head *parent_fe_bh, @@ -2567,7 +2733,8 @@ out: * suballoc_bit. */ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, - u16 *suballoc_slot, u16 *suballoc_bit) + u16 *suballoc_slot, u64 *group_blkno, + u16 *suballoc_bit) { int status; struct buffer_head *inode_bh = NULL; @@ -2604,6 +2771,8 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); if (suballoc_bit) *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); + if (group_blkno) + *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc); bail: brelse(inode_bh); @@ -2621,7 +2790,8 @@ bail: */ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, struct inode *suballoc, - struct buffer_head *alloc_bh, u64 blkno, + struct buffer_head *alloc_bh, + u64 group_blkno, u64 blkno, u16 bit, int *res) { struct ocfs2_dinode *alloc_di; @@ -2642,10 +2812,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, goto bail; } - if (alloc_di->i_suballoc_loc) - bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc); - else - bg_blkno = ocfs2_which_suballoc_group(blkno, bit); + bg_blkno = group_blkno ? group_blkno : + ocfs2_which_suballoc_group(blkno, bit); status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno, &group_bh); if (status < 0) { @@ -2680,6 +2848,7 @@ bail: int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) { int status; + u64 group_blkno = 0; u16 suballoc_bit = 0, suballoc_slot = 0; struct inode *inode_alloc_inode; struct buffer_head *alloc_bh = NULL; @@ -2687,7 +2856,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) mlog_entry("blkno: %llu", (unsigned long long)blkno); status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, - &suballoc_bit); + &group_blkno, &suballoc_bit); if (status < 0) { mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); goto bail; @@ -2715,7 +2884,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) } status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, - blkno, suballoc_bit, res); + group_blkno, blkno, suballoc_bit, res); if (status < 0) mlog(ML_ERROR, "test suballoc bit failed %d\n", status); diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index a017dd3..b8afabf 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -56,6 +56,9 @@ struct ocfs2_alloc_context { u64 ac_max_block; /* Highest block number to allocate. 0 is is the same as ~0 - unlimited */ + int ac_find_loc_only; /* hack for reflink operation ordering */ + struct ocfs2_suballoc_result *ac_find_loc_priv; /* */ + struct ocfs2_alloc_reservation *ac_resv; }; @@ -197,4 +200,22 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, struct ocfs2_alloc_context **meta_ac); int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res); + + + +/* + * The following two interfaces are for ocfs2_create_inode_in_orphan(). + */ +int ocfs2_find_new_inode_loc(struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *fe_blkno); + +int ocfs2_claim_new_inode_at_loc(handle_t *handle, + struct inode *dir, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 di_blkno); + #endif /* _CHAINALLOC_H_ */ diff --git a/fs/proc/page.c b/fs/proc/page.c index 180cf5a..3b8b4566 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -146,7 +146,7 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); #endif -#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR +#ifdef CONFIG_ARCH_USES_PG_UNCACHED u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); #endif diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 439fc1f..271afc4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -224,7 +224,8 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) /* We don't show the stack guard page in /proc/maps */ start = vma->vm_start; if (vma->vm_flags & VM_GROWSDOWN) - start += PAGE_SIZE; + if (!vma_stack_continue(vma->vm_prev, vma->vm_start)) + start += PAGE_SIZE; seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", start, diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1b27b56..da3fefe 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -340,7 +340,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file) char *p; p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file)); - if (p) + if (!IS_ERR(p)) memmove(last_sysfs_file, p, strlen(p) + 1); /* need attr_sd for attr and ops, its parent for kobj */ diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 15412fe..b552f81 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -852,8 +852,8 @@ xfs_convert_page( SetPageUptodate(page); if (count) { - wbc->nr_to_write--; - if (wbc->nr_to_write <= 0) + if (--wbc->nr_to_write <= 0 && + wbc->sync_mode == WB_SYNC_NONE) done = 1; } xfs_start_page_writeback(page, !page_dirty, count); @@ -1068,7 +1068,7 @@ xfs_vm_writepage( * by themselves. */ if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) - goto out_fail; + goto redirty; /* * We need a transaction if there are delalloc or unwritten buffers @@ -1080,7 +1080,7 @@ xfs_vm_writepage( */ xfs_count_page_state(page, &delalloc, &unwritten); if ((current->flags & PF_FSTRANS) && (delalloc || unwritten)) - goto out_fail; + goto redirty; /* Is this page beyond the end of the file? */ offset = i_size_read(inode); @@ -1245,12 +1245,15 @@ error: if (iohead) xfs_cancel_ioend(iohead); + if (err == -EAGAIN) + goto redirty; + xfs_aops_discard_page(page); ClearPageUptodate(page); unlock_page(page); return err; -out_fail: +redirty: redirty_page_for_writepage(wbc, page); unlock_page(page); return 0; diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index ea79072..286e36e 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -440,12 +440,7 @@ _xfs_buf_find( ASSERT(btp == bp->b_target); if (bp->b_file_offset == range_base && bp->b_buffer_length == range_length) { - /* - * If we look at something, bring it to the - * front of the list for next time. - */ atomic_inc(&bp->b_hold); - list_move(&bp->b_hash_list, &hash->bh_list); goto found; } } @@ -1443,8 +1438,7 @@ xfs_alloc_bufhash( { unsigned int i; - btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ - btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; + btp->bt_hashshift = external ? 3 : 12; /* 8 or 4096 buckets */ btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t)); for (i = 0; i < (1 << btp->bt_hashshift); i++) { @@ -1938,7 +1932,8 @@ xfs_buf_init(void) if (!xfs_buf_zone) goto out; - xfslogd_workqueue = create_workqueue("xfslogd"); + xfslogd_workqueue = alloc_workqueue("xfslogd", + WQ_RESCUER | WQ_HIGHPRI, 1); if (!xfslogd_workqueue) goto out_free_buf_zone; diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index d072e5f..2a05614 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -137,7 +137,6 @@ typedef struct xfs_buftarg { size_t bt_smask; /* per device buffer hash table */ - uint bt_hashmask; uint bt_hashshift; xfs_bufhash_t *bt_hash; diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 237f5ff..3b9e626 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -785,6 +785,8 @@ xfs_ioc_fsgetxattr( { struct fsxattr fa; + memset(&fa, 0, sizeof(struct fsxattr)); + xfs_ilock(ip, XFS_ILOCK_SHARED); fa.fsx_xflags = xfs_ip2xflags(ip); fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; @@ -907,6 +909,13 @@ xfs_ioctl_setattr( return XFS_ERROR(EIO); /* + * Disallow 32bit project ids because on-disk structure + * is 16bit only. + */ + if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1)) + return XFS_ERROR(EINVAL); + + /* * If disk quotas is on, we make sure that the dquots do exist on disk, * before we start any other transactions. Trying to do this later * is messy. We don't care to take a readlock to look at the ids diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 68be25d..b1fc2a6 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -664,7 +664,7 @@ xfs_vn_fiemap( fieinfo->fi_extents_max + 1; bm.bmv_count = min_t(__s32, bm.bmv_count, (PAGE_SIZE * 16 / sizeof(struct getbmapx))); - bm.bmv_iflags = BMV_IF_PREALLOC; + bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) bm.bmv_iflags |= BMV_IF_ATTRFORK; if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 15c35b6..a4e0797 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1226,6 +1226,7 @@ xfs_fs_statfs( struct xfs_inode *ip = XFS_I(dentry->d_inode); __uint64_t fakeinos, id; xfs_extlen_t lsize; + __int64_t ffree; statp->f_type = XFS_SB_MAGIC; statp->f_namelen = MAXNAMELEN - 1; @@ -1249,7 +1250,11 @@ xfs_fs_statfs( statp->f_files = min_t(typeof(statp->f_files), statp->f_files, mp->m_maxicount); - statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); + + /* make sure statp->f_ffree does not underflow */ + ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); + statp->f_ffree = max_t(__int64_t, ffree, 0); + spin_unlock(&mp->m_sb_lock); if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) || @@ -1402,7 +1407,7 @@ xfs_fs_freeze( xfs_save_resvblks(mp); xfs_quiesce_attr(mp); - return -xfs_fs_log_dummy(mp); + return -xfs_fs_log_dummy(mp, SYNC_WAIT); } STATIC int diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index dfcbd98..d59c4a6 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -34,6 +34,7 @@ #include "xfs_inode_item.h" #include "xfs_quota.h" #include "xfs_trace.h" +#include "xfs_fsops.h" #include <linux/kthread.h> #include <linux/freezer.h> @@ -341,38 +342,6 @@ xfs_sync_attr( } STATIC int -xfs_commit_dummy_trans( - struct xfs_mount *mp, - uint flags) -{ - struct xfs_inode *ip = mp->m_rootip; - struct xfs_trans *tp; - int error; - - /* - * Put a dummy transaction in the log to tell recovery - * that all others are OK. - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); - error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, ip); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_trans_commit(tp, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - /* the log force ensures this transaction is pushed to disk */ - xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); - return error; -} - -STATIC int xfs_sync_fsdata( struct xfs_mount *mp) { @@ -432,7 +401,7 @@ xfs_quiesce_data( /* mark the log as covered if needed */ if (xfs_log_need_covered(mp)) - error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT); + error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); /* flush data-only devices */ if (mp->m_rtdev_targp) @@ -563,7 +532,7 @@ xfs_flush_inodes( /* * Every sync period we need to unpin all items, reclaim inodes and sync * disk quotas. We might need to cover the log to indicate that the - * filesystem is idle. + * filesystem is idle and not frozen. */ STATIC void xfs_sync_worker( @@ -577,8 +546,9 @@ xfs_sync_worker( xfs_reclaim_inodes(mp, 0); /* dgc: errors ignored here */ error = xfs_qm_sync(mp, SYNC_TRYLOCK); - if (xfs_log_need_covered(mp)) - error = xfs_commit_dummy_trans(mp, 0); + if (mp->m_super->s_frozen == SB_UNFROZEN && + xfs_log_need_covered(mp)) + error = xfs_fs_log_dummy(mp, 0); } mp->m_sync_seq++; wake_up(&mp->m_wait_single_sync_task); diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 23f14e5..f90dadd 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5533,12 +5533,24 @@ xfs_getbmap( map[i].br_startblock)) goto out_free_map; - nexleft--; bmv->bmv_offset = out[cur_ext].bmv_offset + out[cur_ext].bmv_length; bmv->bmv_length = max_t(__int64_t, 0, bmvend - bmv->bmv_offset); + + /* + * In case we don't want to return the hole, + * don't increase cur_ext so that we can reuse + * it in the next loop. + */ + if ((iflags & BMV_IF_NO_HOLES) && + map[i].br_startblock == HOLESTARTBLOCK) { + memset(&out[cur_ext], 0, sizeof(out[cur_ext])); + continue; + } + + nexleft--; bmv->bmv_entries++; cur_ext++; } diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 7cf7220..87c2e9d 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -114,8 +114,10 @@ struct getbmapx { #define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */ #define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ #define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */ +#define BMV_IF_NO_HOLES 0x10 /* Do not return holes */ #define BMV_IF_VALID \ - (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC) + (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \ + BMV_IF_DELALLOC|BMV_IF_NO_HOLES) /* bmv_oflags values - returned for each non-header segment */ #define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index dbca5f5..43b1d56 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -604,31 +604,36 @@ out: return 0; } +/* + * Dump a transaction into the log that contains no real change. This is needed + * to be able to make the log dirty or stamp the current tail LSN into the log + * during the covering operation. + * + * We cannot use an inode here for this - that will push dirty state back up + * into the VFS and then periodic inode flushing will prevent log covering from + * making progress. Hence we log a field in the superblock instead. + */ int xfs_fs_log_dummy( - xfs_mount_t *mp) + xfs_mount_t *mp, + int flags) { xfs_trans_t *tp; - xfs_inode_t *ip; int error; tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); - error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); + error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, + XFS_DEFAULT_LOG_COUNT); if (error) { xfs_trans_cancel(tp, 0); return error; } - ip = mp->m_rootip; - xfs_ilock(ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, ip); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; + /* log the UUID because it is an unchanging field */ + xfs_mod_sb(tp, XFS_SB_UUID); + if (flags & SYNC_WAIT) + xfs_trans_set_sync(tp); + return xfs_trans_commit(tp, 0); } int diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 88435e0..a786c52 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); -extern int xfs_fs_log_dummy(xfs_mount_t *mp); +extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index abf80ae..5371d2d 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1213,7 +1213,6 @@ xfs_imap_lookup( struct xfs_inobt_rec_incore rec; struct xfs_btree_cur *cur; struct xfs_buf *agbp; - xfs_agino_t startino; int error; int i; @@ -1227,13 +1226,13 @@ xfs_imap_lookup( } /* - * derive and lookup the exact inode record for the given agino. If the - * record cannot be found, then it's an invalid inode number and we - * should abort. + * Lookup the inode record for the given agino. If the record cannot be + * found, then it's an invalid inode number and we should abort. Once + * we have a record, we need to ensure it contains the inode number + * we are looking up. */ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); - startino = agino & ~(XFS_IALLOC_INODES(mp) - 1); - error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (!error) { if (i) error = xfs_inobt_get_rec(cur, &rec, &i); @@ -1246,6 +1245,11 @@ xfs_imap_lookup( if (error) return error; + /* check that the returned record contains the required inode */ + if (rec.ir_startino > agino || + rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino) + return EINVAL; + /* for untrusted inodes check it is allocated first */ if ((flags & XFS_IGET_UNTRUSTED) && (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 68415cb..34798f3 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1914,6 +1914,11 @@ xfs_iunlink_remove( return 0; } +/* + * A big issue when freeing the inode cluster is is that we _cannot_ skip any + * inodes that are in memory - they all must be marked stale and attached to + * the cluster buffer. + */ STATIC void xfs_ifree_cluster( xfs_inode_t *free_ip, @@ -1945,8 +1950,6 @@ xfs_ifree_cluster( } for (j = 0; j < nbufs; j++, inum += ninodes) { - int found = 0; - blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), XFS_INO_TO_AGBNO(mp, inum)); @@ -1965,7 +1968,9 @@ xfs_ifree_cluster( /* * Walk the inodes already attached to the buffer and mark them * stale. These will all have the flush locks held, so an - * in-memory inode walk can't lock them. + * in-memory inode walk can't lock them. By marking them all + * stale first, we will not attempt to lock them in the loop + * below as the XFS_ISTALE flag will be set. */ lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); while (lip) { @@ -1977,11 +1982,11 @@ xfs_ifree_cluster( &iip->ili_flush_lsn, &iip->ili_item.li_lsn); xfs_iflags_set(iip->ili_inode, XFS_ISTALE); - found++; } lip = lip->li_bio_list; } + /* * For each inode in memory attempt to add it to the inode * buffer and set it up for being staled on buffer IO @@ -1993,6 +1998,7 @@ xfs_ifree_cluster( * even trying to lock them. */ for (i = 0; i < ninodes; i++) { +retry: read_lock(&pag->pag_ici_lock); ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, (inum + i))); @@ -2003,38 +2009,36 @@ xfs_ifree_cluster( continue; } - /* don't try to lock/unlock the current inode */ + /* + * Don't try to lock/unlock the current inode, but we + * _cannot_ skip the other inodes that we did not find + * in the list attached to the buffer and are not + * already marked stale. If we can't lock it, back off + * and retry. + */ if (ip != free_ip && !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { read_unlock(&pag->pag_ici_lock); - continue; + delay(1); + goto retry; } read_unlock(&pag->pag_ici_lock); - if (!xfs_iflock_nowait(ip)) { - if (ip != free_ip) - xfs_iunlock(ip, XFS_ILOCK_EXCL); - continue; - } - + xfs_iflock(ip); xfs_iflags_set(ip, XFS_ISTALE); - if (xfs_inode_clean(ip)) { - ASSERT(ip != free_ip); - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - continue; - } + /* + * we don't need to attach clean inodes or those only + * with unlogged changes (which we throw away, anyway). + */ iip = ip->i_itemp; - if (!iip) { - /* inode with unlogged changes only */ + if (!iip || xfs_inode_clean(ip)) { ASSERT(ip != free_ip); ip->i_update_core = 0; xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); continue; } - found++; iip->ili_last_fields = iip->ili_format.ilf_fields; iip->ili_format.ilf_fields = 0; @@ -2049,8 +2053,7 @@ xfs_ifree_cluster( xfs_iunlock(ip, XFS_ILOCK_EXCL); } - if (found) - xfs_trans_stale_inode_buf(tp, bp); + xfs_trans_stale_inode_buf(tp, bp); xfs_trans_binval(tp, bp); } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 925d572..33f718f 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3015,7 +3015,8 @@ _xfs_log_force( XFS_STATS_INC(xs_log_force); - xlog_cil_push(log, 1); + if (log->l_cilp) + xlog_cil_force(log); spin_lock(&log->l_icloglock); @@ -3167,7 +3168,7 @@ _xfs_log_force_lsn( XFS_STATS_INC(xs_log_force); if (log->l_cilp) { - lsn = xlog_cil_push_lsn(log, lsn); + lsn = xlog_cil_force_lsn(log, lsn); if (lsn == NULLCOMMITLSN) return 0; } @@ -3724,7 +3725,7 @@ xfs_log_force_umount( * call below. */ if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) - xlog_cil_push(log, 1); + xlog_cil_force(log); /* * We must hold both the GRANT lock and the LOG lock, diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 31e4ea2..ed575fb 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -68,6 +68,7 @@ xlog_cil_init( ctx->sequence = 1; ctx->cil = cil; cil->xc_ctx = ctx; + cil->xc_current_sequence = ctx->sequence; cil->xc_log = log; log->l_cilp = cil; @@ -269,15 +270,10 @@ xlog_cil_insert( static void xlog_cil_format_items( struct log *log, - struct xfs_log_vec *log_vector, - struct xlog_ticket *ticket, - xfs_lsn_t *start_lsn) + struct xfs_log_vec *log_vector) { struct xfs_log_vec *lv; - if (start_lsn) - *start_lsn = log->l_cilp->xc_ctx->sequence; - ASSERT(log_vector); for (lv = log_vector; lv; lv = lv->lv_next) { void *ptr; @@ -301,9 +297,24 @@ xlog_cil_format_items( ptr += vec->i_len; } ASSERT(ptr == lv->lv_buf + lv->lv_buf_len); + } +} + +static void +xlog_cil_insert_items( + struct log *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *ticket, + xfs_lsn_t *start_lsn) +{ + struct xfs_log_vec *lv; + + if (start_lsn) + *start_lsn = log->l_cilp->xc_ctx->sequence; + ASSERT(log_vector); + for (lv = log_vector; lv; lv = lv->lv_next) xlog_cil_insert(log, ticket, lv->lv_item, lv); - } } static void @@ -321,80 +332,6 @@ xlog_cil_free_logvec( } /* - * Commit a transaction with the given vector to the Committed Item List. - * - * To do this, we need to format the item, pin it in memory if required and - * account for the space used by the transaction. Once we have done that we - * need to release the unused reservation for the transaction, attach the - * transaction to the checkpoint context so we carry the busy extents through - * to checkpoint completion, and then unlock all the items in the transaction. - * - * For more specific information about the order of operations in - * xfs_log_commit_cil() please refer to the comments in - * xfs_trans_commit_iclog(). - * - * Called with the context lock already held in read mode to lock out - * background commit, returns without it held once background commits are - * allowed again. - */ -int -xfs_log_commit_cil( - struct xfs_mount *mp, - struct xfs_trans *tp, - struct xfs_log_vec *log_vector, - xfs_lsn_t *commit_lsn, - int flags) -{ - struct log *log = mp->m_log; - int log_flags = 0; - int push = 0; - - if (flags & XFS_TRANS_RELEASE_LOG_RES) - log_flags = XFS_LOG_REL_PERM_RESERV; - - if (XLOG_FORCED_SHUTDOWN(log)) { - xlog_cil_free_logvec(log_vector); - return XFS_ERROR(EIO); - } - - /* lock out background commit */ - down_read(&log->l_cilp->xc_ctx_lock); - xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn); - - /* check we didn't blow the reservation */ - if (tp->t_ticket->t_curr_res < 0) - xlog_print_tic_res(log->l_mp, tp->t_ticket); - - /* attach the transaction to the CIL if it has any busy extents */ - if (!list_empty(&tp->t_busy)) { - spin_lock(&log->l_cilp->xc_cil_lock); - list_splice_init(&tp->t_busy, - &log->l_cilp->xc_ctx->busy_extents); - spin_unlock(&log->l_cilp->xc_cil_lock); - } - - tp->t_commit_lsn = *commit_lsn; - xfs_log_done(mp, tp->t_ticket, NULL, log_flags); - xfs_trans_unreserve_and_mod_sb(tp); - - /* check for background commit before unlock */ - if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) - push = 1; - up_read(&log->l_cilp->xc_ctx_lock); - - /* - * We need to push CIL every so often so we don't cache more than we - * can fit in the log. The limit really is that a checkpoint can't be - * more than half the log (the current checkpoint is not allowed to - * overwrite the previous checkpoint), but commit latency and memory - * usage limit this to a smaller size in most cases. - */ - if (push) - xlog_cil_push(log, 0); - return 0; -} - -/* * Mark all items committed and clear busy extents. We free the log vector * chains in a separate pass so that we unpin the log items as quickly as * possible. @@ -427,13 +364,23 @@ xlog_cil_committed( } /* - * Push the Committed Item List to the log. If the push_now flag is not set, - * then it is a background flush and so we can chose to ignore it. + * Push the Committed Item List to the log. If @push_seq flag is zero, then it + * is a background flush and so we can chose to ignore it. Otherwise, if the + * current sequence is the same as @push_seq we need to do a flush. If + * @push_seq is less than the current sequence, then it has already been + * flushed and we don't need to do anything - the caller will wait for it to + * complete if necessary. + * + * @push_seq is a value rather than a flag because that allows us to do an + * unlocked check of the sequence number for a match. Hence we can allows log + * forces to run racily and not issue pushes for the same sequence twice. If we + * get a race between multiple pushes for the same sequence they will block on + * the first one and then abort, hence avoiding needless pushes. */ -int +STATIC int xlog_cil_push( struct log *log, - int push_now) + xfs_lsn_t push_seq) { struct xfs_cil *cil = log->l_cilp; struct xfs_log_vec *lv; @@ -453,12 +400,14 @@ xlog_cil_push( if (!cil) return 0; + ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence); + new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); /* lock out transaction commit, but don't block on background push */ if (!down_write_trylock(&cil->xc_ctx_lock)) { - if (!push_now) + if (!push_seq) goto out_free_ticket; down_write(&cil->xc_ctx_lock); } @@ -469,7 +418,11 @@ xlog_cil_push( goto out_skip; /* check for spurious background flush */ - if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + goto out_skip; + + /* check for a previously pushed seqeunce */ + if (push_seq < cil->xc_ctx->sequence) goto out_skip; /* @@ -515,6 +468,13 @@ xlog_cil_push( cil->xc_ctx = new_ctx; /* + * mirror the new sequence into the cil structure so that we can do + * unlocked checks against the current sequence in log forces without + * risking deferencing a freed context pointer. + */ + cil->xc_current_sequence = new_ctx->sequence; + + /* * The switch is now done, so we can drop the context lock and move out * of a shared context. We can't just go straight to the commit record, * though - we need to synchronise with previous and future commits so @@ -626,6 +586,102 @@ out_abort: } /* + * Commit a transaction with the given vector to the Committed Item List. + * + * To do this, we need to format the item, pin it in memory if required and + * account for the space used by the transaction. Once we have done that we + * need to release the unused reservation for the transaction, attach the + * transaction to the checkpoint context so we carry the busy extents through + * to checkpoint completion, and then unlock all the items in the transaction. + * + * For more specific information about the order of operations in + * xfs_log_commit_cil() please refer to the comments in + * xfs_trans_commit_iclog(). + * + * Called with the context lock already held in read mode to lock out + * background commit, returns without it held once background commits are + * allowed again. + */ +int +xfs_log_commit_cil( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_log_vec *log_vector, + xfs_lsn_t *commit_lsn, + int flags) +{ + struct log *log = mp->m_log; + int log_flags = 0; + int push = 0; + + if (flags & XFS_TRANS_RELEASE_LOG_RES) + log_flags = XFS_LOG_REL_PERM_RESERV; + + if (XLOG_FORCED_SHUTDOWN(log)) { + xlog_cil_free_logvec(log_vector); + return XFS_ERROR(EIO); + } + + /* + * do all the hard work of formatting items (including memory + * allocation) outside the CIL context lock. This prevents stalling CIL + * pushes when we are low on memory and a transaction commit spends a + * lot of time in memory reclaim. + */ + xlog_cil_format_items(log, log_vector); + + /* lock out background commit */ + down_read(&log->l_cilp->xc_ctx_lock); + xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn); + + /* check we didn't blow the reservation */ + if (tp->t_ticket->t_curr_res < 0) + xlog_print_tic_res(log->l_mp, tp->t_ticket); + + /* attach the transaction to the CIL if it has any busy extents */ + if (!list_empty(&tp->t_busy)) { + spin_lock(&log->l_cilp->xc_cil_lock); + list_splice_init(&tp->t_busy, + &log->l_cilp->xc_ctx->busy_extents); + spin_unlock(&log->l_cilp->xc_cil_lock); + } + + tp->t_commit_lsn = *commit_lsn; + xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + xfs_trans_unreserve_and_mod_sb(tp); + + /* + * Once all the items of the transaction have been copied to the CIL, + * the items can be unlocked and freed. + * + * This needs to be done before we drop the CIL context lock because we + * have to update state in the log items and unlock them before they go + * to disk. If we don't, then the CIL checkpoint can race with us and + * we can run checkpoint completion before we've updated and unlocked + * the log items. This affects (at least) processing of stale buffers, + * inodes and EFIs. + */ + xfs_trans_free_items(tp, *commit_lsn, 0); + + /* check for background commit before unlock */ + if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) + push = 1; + + up_read(&log->l_cilp->xc_ctx_lock); + + /* + * We need to push CIL every so often so we don't cache more than we + * can fit in the log. The limit really is that a checkpoint can't be + * more than half the log (the current checkpoint is not allowed to + * overwrite the previous checkpoint), but commit latency and memory + * usage limit this to a smaller size in most cases. + */ + if (push) + xlog_cil_push(log, 0); + return 0; +} + +/* * Conditionally push the CIL based on the sequence passed in. * * We only need to push if we haven't already pushed the sequence @@ -639,39 +695,34 @@ out_abort: * commit lsn is there. It'll be empty, so this is broken for now. */ xfs_lsn_t -xlog_cil_push_lsn( +xlog_cil_force_lsn( struct log *log, - xfs_lsn_t push_seq) + xfs_lsn_t sequence) { struct xfs_cil *cil = log->l_cilp; struct xfs_cil_ctx *ctx; xfs_lsn_t commit_lsn = NULLCOMMITLSN; -restart: - down_write(&cil->xc_ctx_lock); - ASSERT(push_seq <= cil->xc_ctx->sequence); - - /* check to see if we need to force out the current context */ - if (push_seq == cil->xc_ctx->sequence) { - up_write(&cil->xc_ctx_lock); - xlog_cil_push(log, 1); - goto restart; - } + ASSERT(sequence <= cil->xc_current_sequence); + + /* + * check to see if we need to force out the current context. + * xlog_cil_push() handles racing pushes for the same sequence, + * so no need to deal with it here. + */ + if (sequence == cil->xc_current_sequence) + xlog_cil_push(log, sequence); /* * See if we can find a previous sequence still committing. - * We can drop the flush lock as soon as we have the cil lock - * because we are now only comparing contexts protected by - * the cil lock. - * * We need to wait for all previous sequence commits to complete * before allowing the force of push_seq to go ahead. Hence block * on commits for those as well. */ +restart: spin_lock(&cil->xc_cil_lock); - up_write(&cil->xc_ctx_lock); list_for_each_entry(ctx, &cil->xc_committing, committing) { - if (ctx->sequence > push_seq) + if (ctx->sequence > sequence) continue; if (!ctx->commit_lsn) { /* @@ -681,7 +732,7 @@ restart: sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); goto restart; } - if (ctx->sequence != push_seq) + if (ctx->sequence != sequence) continue; /* found it! */ commit_lsn = ctx->commit_lsn; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 8c07261..ced52b9 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -422,6 +422,7 @@ struct xfs_cil { struct rw_semaphore xc_ctx_lock; struct list_head xc_committing; sv_t xc_commit_wait; + xfs_lsn_t xc_current_sequence; }; /* @@ -562,8 +563,16 @@ int xlog_cil_init(struct log *log); void xlog_cil_init_post_recovery(struct log *log); void xlog_cil_destroy(struct log *log); -int xlog_cil_push(struct log *log, int push_now); -xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence); +/* + * CIL force routines + */ +xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence); + +static inline void +xlog_cil_force(struct log *log) +{ + xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); +} /* * Unmount record type is used as a pseudo transaction type for the ticket. diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index fdca741..1c47eda 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1167,7 +1167,7 @@ xfs_trans_del_item( * Unlock all of the items of a transaction and free all the descriptors * of that transaction. */ -STATIC void +void xfs_trans_free_items( struct xfs_trans *tp, xfs_lsn_t commit_lsn, @@ -1653,9 +1653,6 @@ xfs_trans_commit_cil( return error; current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - - /* xfs_trans_free_items() unlocks them first */ - xfs_trans_free_items(tp, *commit_lsn, 0); xfs_trans_free(tp); return 0; } diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index e2d93d8..62da86c 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -25,7 +25,8 @@ struct xfs_trans; void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *); - +void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, + int flags); void xfs_trans_item_committed(struct xfs_log_item *lip, xfs_lsn_t commit_lsn, int aborted); void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 66d585c..4c7c7bf 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2299,15 +2299,22 @@ xfs_alloc_file_space( e = allocatesize_fsb; } + /* + * The transaction reservation is limited to a 32-bit block + * count, hence we need to limit the number of blocks we are + * trying to reserve to avoid an overflow. We can't allocate + * more than @nimaps extents, and an extent is limited on disk + * to MAXEXTLEN (21 bits), so use that to enforce the limit. + */ + resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); if (unlikely(rt)) { - resrtextents = qblocks = (uint)(e - s); + resrtextents = qblocks = resblks; resrtextents /= mp->m_sb.sb_rextsize; resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); quota_flag = XFS_QMOPT_RES_RTBLKS; } else { resrtextents = 0; - resblks = qblocks = \ - XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s)); + resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks); quota_flag = XFS_QMOPT_RES_REGBLKS; } |