diff options
Diffstat (limited to 'fs')
240 files changed, 5744 insertions, 3207 deletions
diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 45b7fc4..532acae 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -12,6 +12,8 @@ kafs-objs := \ cell.o \ cmservice.o \ dir.o \ + dir_edit.o \ + dynroot.o \ file.o \ flock.o \ fsclient.o \ diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c index fd9f28b..7587fb6 100644 --- a/fs/afs/addr_list.c +++ b/fs/afs/addr_list.c @@ -121,7 +121,7 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, p = text; do { struct sockaddr_rxrpc *srx = &alist->addrs[alist->nr_addrs]; - char tdelim = delim; + const char *q, *stop; if (*p == delim) { p++; @@ -130,28 +130,33 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len, if (*p == '[') { p++; - tdelim = ']'; + q = memchr(p, ']', end - p); + } else { + for (q = p; q < end; q++) + if (*q == '+' || *q == delim) + break; } - if (in4_pton(p, end - p, + if (in4_pton(p, q - p, (u8 *)&srx->transport.sin6.sin6_addr.s6_addr32[3], - tdelim, &p)) { + -1, &stop)) { srx->transport.sin6.sin6_addr.s6_addr32[0] = 0; srx->transport.sin6.sin6_addr.s6_addr32[1] = 0; srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff); - } else if (in6_pton(p, end - p, + } else if (in6_pton(p, q - p, srx->transport.sin6.sin6_addr.s6_addr, - tdelim, &p)) { + -1, &stop)) { /* Nothing to do */ } else { goto bad_address; } - if (tdelim == ']') { - if (p == end || *p != ']') - goto bad_address; + if (stop != q) + goto bad_address; + + p = q; + if (q < end && *q == ']') p++; - } if (p < end) { if (*p == '+') { @@ -243,9 +248,9 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) xport == a->sin6_port) return; if (xdr == a->sin6_addr.s6_addr32[3] && - xport < a->sin6_port) + (u16 __force)xport < (u16 __force)a->sin6_port) break; - if (xdr < a->sin6_addr.s6_addr32[3]) + if ((u32 __force)xdr < (u32 __force)a->sin6_addr.s6_addr32[3]) break; } @@ -280,7 +285,7 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) xport == a->sin6_port) return; if (diff == 0 && - xport < a->sin6_port) + (u16 __force)xport < (u16 __force)a->sin6_port) break; if (diff < 0) break; diff --git a/fs/afs/afs.h b/fs/afs/afs.h index b94d0ed..b4ff1f7 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -67,10 +67,14 @@ typedef enum { } afs_callback_type_t; struct afs_callback { - struct afs_fid fid; /* file identifier */ - unsigned version; /* callback version */ - unsigned expiry; /* time at which expires */ - afs_callback_type_t type; /* type of callback */ + unsigned version; /* Callback version */ + unsigned expiry; /* Time at which expires */ + afs_callback_type_t type; /* Type of callback */ +}; + +struct afs_callback_break { + struct afs_fid fid; /* File identifier */ + struct afs_callback cb; /* Callback details */ }; #define AFSCBMAX 50 /* maximum callbacks transferred per bulk op */ @@ -123,21 +127,20 @@ typedef u32 afs_access_t; * AFS file status information */ struct afs_file_status { - unsigned if_version; /* interface version */ -#define AFS_FSTATUS_VERSION 1 + u64 size; /* file size */ + afs_dataversion_t data_version; /* current data version */ + time_t mtime_client; /* last time client changed data */ + time_t mtime_server; /* last time server changed data */ + unsigned abort_code; /* Abort if bulk-fetching this failed */ afs_file_type_t type; /* file type */ unsigned nlink; /* link count */ - u64 size; /* file size */ - afs_dataversion_t data_version; /* current data version */ u32 author; /* author ID */ - kuid_t owner; /* owner ID */ - kgid_t group; /* group ID */ + u32 owner; /* owner ID */ + u32 group; /* group ID */ afs_access_t caller_access; /* access rights for authenticated caller */ afs_access_t anon_access; /* access rights for unauthenticated caller */ umode_t mode; /* UNIX mode */ - time_t mtime_client; /* last time client changed data */ - time_t mtime_server; /* last time server changed data */ s32 lock_count; /* file lock count (0=UNLK -1=WRLCK +ve=#RDLCK */ }; diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h index d47b6d0..ddfa88a 100644 --- a/fs/afs/afs_fs.h +++ b/fs/afs/afs_fs.h @@ -31,10 +31,12 @@ enum AFS_FS_Operations { FSGETVOLUMEINFO = 148, /* AFS Get information about a volume */ FSGETVOLUMESTATUS = 149, /* AFS Get volume status information */ FSGETROOTVOLUME = 151, /* AFS Get root volume name */ + FSBULKSTATUS = 155, /* AFS Fetch multiple file statuses */ FSSETLOCK = 156, /* AFS Request a file lock */ FSEXTENDLOCK = 157, /* AFS Extend a file lock */ FSRELEASELOCK = 158, /* AFS Release a file lock */ FSLOOKUP = 161, /* AFS lookup file in directory */ + FSINLINEBULKSTATUS = 65536, /* AFS Fetch multiple file statuses with inline errors */ FSFETCHDATA64 = 65537, /* AFS Fetch file data */ FSSTOREDATA64 = 65538, /* AFS Store file data */ FSGIVEUPALLCALLBACKS = 65539, /* AFS Give up all outstanding callbacks on a server */ diff --git a/fs/afs/callback.c b/fs/afs/callback.c index f4291b5..571437d 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -23,36 +23,55 @@ /* * Set up an interest-in-callbacks record for a volume on a server and * register it with the server. - * - Called with volume->server_sem held. + * - Called with vnode->io_lock held. */ int afs_register_server_cb_interest(struct afs_vnode *vnode, - struct afs_server_entry *entry) + struct afs_server_list *slist, + unsigned int index) { - struct afs_cb_interest *cbi = entry->cb_interest, *vcbi, *new, *x; + struct afs_server_entry *entry = &slist->servers[index]; + struct afs_cb_interest *cbi, *vcbi, *new, *old; struct afs_server *server = entry->server; again: + if (vnode->cb_interest && + likely(vnode->cb_interest == entry->cb_interest)) + return 0; + + read_lock(&slist->lock); + cbi = afs_get_cb_interest(entry->cb_interest); + read_unlock(&slist->lock); + vcbi = vnode->cb_interest; if (vcbi) { - if (vcbi == cbi) + if (vcbi == cbi) { + afs_put_cb_interest(afs_v2net(vnode), cbi); return 0; + } + /* Use a new interest in the server list for the same server + * rather than an old one that's still attached to a vnode. + */ if (cbi && vcbi->server == cbi->server) { write_seqlock(&vnode->cb_lock); - vnode->cb_interest = afs_get_cb_interest(cbi); + old = vnode->cb_interest; + vnode->cb_interest = cbi; write_sequnlock(&vnode->cb_lock); - afs_put_cb_interest(afs_v2net(vnode), cbi); + afs_put_cb_interest(afs_v2net(vnode), old); return 0; } + /* Re-use the one attached to the vnode. */ if (!cbi && vcbi->server == server) { - afs_get_cb_interest(vcbi); - x = cmpxchg(&entry->cb_interest, cbi, vcbi); - if (x != cbi) { - cbi = x; - afs_put_cb_interest(afs_v2net(vnode), vcbi); + write_lock(&slist->lock); + if (entry->cb_interest) { + write_unlock(&slist->lock); + afs_put_cb_interest(afs_v2net(vnode), cbi); goto again; } + + entry->cb_interest = cbi; + write_unlock(&slist->lock); return 0; } } @@ -72,13 +91,16 @@ again: list_add_tail(&new->cb_link, &server->cb_interests); write_unlock(&server->cb_break_lock); - x = cmpxchg(&entry->cb_interest, cbi, new); - if (x == cbi) { + write_lock(&slist->lock); + if (!entry->cb_interest) { + entry->cb_interest = afs_get_cb_interest(new); cbi = new; + new = NULL; } else { - cbi = x; - afs_put_cb_interest(afs_v2net(vnode), new); + cbi = afs_get_cb_interest(entry->cb_interest); } + write_unlock(&slist->lock); + afs_put_cb_interest(afs_v2net(vnode), new); } ASSERT(cbi); @@ -88,35 +110,18 @@ again: */ write_seqlock(&vnode->cb_lock); - vnode->cb_interest = afs_get_cb_interest(cbi); + old = vnode->cb_interest; + vnode->cb_interest = cbi; vnode->cb_s_break = cbi->server->cb_s_break; + vnode->cb_v_break = vnode->volume->cb_v_break; clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); write_sequnlock(&vnode->cb_lock); + afs_put_cb_interest(afs_v2net(vnode), old); return 0; } /* - * Set a vnode's interest on a server. - */ -void afs_set_cb_interest(struct afs_vnode *vnode, struct afs_cb_interest *cbi) -{ - struct afs_cb_interest *old_cbi = NULL; - - if (vnode->cb_interest == cbi) - return; - - write_seqlock(&vnode->cb_lock); - if (vnode->cb_interest != cbi) { - afs_get_cb_interest(cbi); - old_cbi = vnode->cb_interest; - vnode->cb_interest = cbi; - } - write_sequnlock(&vnode->cb_lock); - afs_put_cb_interest(afs_v2net(vnode), cbi); -} - -/* * Remove an interest on a server. */ void afs_put_cb_interest(struct afs_net *net, struct afs_cb_interest *cbi) @@ -150,6 +155,7 @@ void afs_break_callback(struct afs_vnode *vnode) write_seqlock(&vnode->cb_lock); + clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { vnode->cb_break++; afs_clear_permits(vnode); @@ -190,13 +196,24 @@ static void afs_break_one_callback(struct afs_server *server, if (cbi->vid != fid->vid) continue; - data.volume = NULL; - data.fid = *fid; - inode = ilookup5_nowait(cbi->sb, fid->vnode, afs_iget5_test, &data); - if (inode) { - vnode = AFS_FS_I(inode); - afs_break_callback(vnode); - iput(inode); + if (fid->vnode == 0 && fid->unique == 0) { + /* The callback break applies to an entire volume. */ + struct afs_super_info *as = AFS_FS_S(cbi->sb); + struct afs_volume *volume = as->volume; + + write_lock(&volume->cb_break_lock); + volume->cb_v_break++; + write_unlock(&volume->cb_break_lock); + } else { + data.volume = NULL; + data.fid = *fid; + inode = ilookup5_nowait(cbi->sb, fid->vnode, + afs_iget5_test, &data); + if (inode) { + vnode = AFS_FS_I(inode); + afs_break_callback(vnode); + iput(inode); + } } } @@ -207,21 +224,23 @@ static void afs_break_one_callback(struct afs_server *server, * allow the fileserver to break callback promises */ void afs_break_callbacks(struct afs_server *server, size_t count, - struct afs_callback callbacks[]) + struct afs_callback_break *callbacks) { _enter("%p,%zu,", server, count); ASSERT(server != NULL); ASSERTCMP(count, <=, AFSCBMAX); + /* TODO: Sort the callback break list by volume ID */ + for (; count > 0; callbacks++, count--) { _debug("- Fid { vl=%08x n=%u u=%u } CB { v=%u x=%u t=%u }", callbacks->fid.vid, callbacks->fid.vnode, callbacks->fid.unique, - callbacks->version, - callbacks->expiry, - callbacks->type + callbacks->cb.version, + callbacks->cb.expiry, + callbacks->cb.type ); afs_break_one_callback(server, &callbacks->fid); } diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 4235a05..fdf4c36 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -18,7 +18,7 @@ #include <keys/rxrpc-type.h> #include "internal.h" -unsigned __read_mostly afs_cell_gc_delay = 10; +static unsigned __read_mostly afs_cell_gc_delay = 10; static void afs_manage_cell(struct work_struct *); @@ -75,7 +75,7 @@ struct afs_cell *afs_lookup_cell_rcu(struct afs_net *net, cell = rcu_dereference_raw(net->ws_cell); if (cell) { afs_get_cell(cell); - continue; + break; } ret = -EDESTADDRREQ; continue; @@ -130,6 +130,8 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, _leave(" = -ENAMETOOLONG"); return ERR_PTR(-ENAMETOOLONG); } + if (namelen == 5 && memcmp(name, "@cell", 5) == 0) + return ERR_PTR(-EINVAL); _enter("%*.*s,%s", namelen, namelen, name, vllist); @@ -334,8 +336,8 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) return PTR_ERR(new_root); } - set_bit(AFS_CELL_FL_NO_GC, &new_root->flags); - afs_get_cell(new_root); + if (!test_and_set_bit(AFS_CELL_FL_NO_GC, &new_root->flags)) + afs_get_cell(new_root); /* install the new cell */ write_seqlock(&net->cells_lock); @@ -411,7 +413,7 @@ static void afs_cell_destroy(struct rcu_head *rcu) ASSERTCMP(atomic_read(&cell->usage), ==, 0); - afs_put_addrlist(cell->vl_addrs); + afs_put_addrlist(rcu_access_pointer(cell->vl_addrs)); key_put(cell->anonymous_key); kfree(cell); diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 41e277f..c332c95 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -133,21 +133,10 @@ bool afs_cm_incoming_call(struct afs_call *call) } /* - * clean up a cache manager call + * Clean up a cache manager call. */ static void afs_cm_destructor(struct afs_call *call) { - _enter(""); - - /* Break the callbacks here so that we do it after the final ACK is - * received. The step number here must match the final number in - * afs_deliver_cb_callback(). - */ - if (call->unmarshall == 5) { - ASSERT(call->cm_server && call->count && call->request); - afs_break_callbacks(call->cm_server, call->count, call->request); - } - kfree(call->buffer); call->buffer = NULL; } @@ -161,14 +150,14 @@ static void SRXAFSCB_CallBack(struct work_struct *work) _enter(""); - /* be sure to send the reply *before* attempting to spam the AFS server - * with FSFetchStatus requests on the vnodes with broken callbacks lest - * the AFS server get into a vicious cycle of trying to break further - * callbacks because it hadn't received completion of the CBCallBack op - * yet */ - afs_send_empty_reply(call); + /* We need to break the callbacks before sending the reply as the + * server holds up change visibility till it receives our reply so as + * to maintain cache coherency. + */ + if (call->cm_server) + afs_break_callbacks(call->cm_server, call->count, call->request); - afs_break_callbacks(call->cm_server, call->count, call->request); + afs_send_empty_reply(call); afs_put_call(call); _leave(""); } @@ -178,9 +167,8 @@ static void SRXAFSCB_CallBack(struct work_struct *work) */ static int afs_deliver_cb_callback(struct afs_call *call) { + struct afs_callback_break *cb; struct sockaddr_rxrpc srx; - struct afs_callback *cb; - struct afs_server *server; __be32 *bp; int ret, loop; @@ -201,7 +189,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) call->count = ntohl(call->tmp); _debug("FID count: %u", call->count); if (call->count > AFSCBMAX) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); call->buffer = kmalloc(call->count * 3 * 4, GFP_KERNEL); if (!call->buffer) @@ -218,7 +206,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) _debug("unmarshall FID array"); call->request = kcalloc(call->count, - sizeof(struct afs_callback), + sizeof(struct afs_callback_break), GFP_KERNEL); if (!call->request) return -ENOMEM; @@ -229,7 +217,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) cb->fid.vid = ntohl(*bp++); cb->fid.vnode = ntohl(*bp++); cb->fid.unique = ntohl(*bp++); - cb->type = AFSCM_CB_UNTYPED; + cb->cb.type = AFSCM_CB_UNTYPED; } call->offset = 0; @@ -245,7 +233,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) call->count2 = ntohl(call->tmp); _debug("CB count: %u", call->count2); if (call->count2 != call->count && call->count2 != 0) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); call->offset = 0; call->unmarshall++; @@ -260,22 +248,13 @@ static int afs_deliver_cb_callback(struct afs_call *call) cb = call->request; bp = call->buffer; for (loop = call->count2; loop > 0; loop--, cb++) { - cb->version = ntohl(*bp++); - cb->expiry = ntohl(*bp++); - cb->type = ntohl(*bp++); + cb->cb.version = ntohl(*bp++); + cb->cb.expiry = ntohl(*bp++); + cb->cb.type = ntohl(*bp++); } call->offset = 0; call->unmarshall++; - - /* Record that the message was unmarshalled successfully so - * that the call destructor can know do the callback breaking - * work, even if the final ACK isn't received. - * - * If the step number changes, then afs_cm_destructor() must be - * updated also. - */ - call->unmarshall++; case 5: break; } @@ -286,10 +265,9 @@ static int afs_deliver_cb_callback(struct afs_call *call) /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); - server = afs_find_server(call->net, &srx); - if (!server) - return -ENOTCONN; - call->cm_server = server; + call->cm_server = afs_find_server(call->net, &srx); + if (!call->cm_server) + trace_afs_cm_no_server(call, &srx); return afs_queue_call_work(call); } @@ -303,7 +281,8 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work) _enter("{%p}", call->cm_server); - afs_init_callback_state(call->cm_server); + if (call->cm_server) + afs_init_callback_state(call->cm_server); afs_send_empty_reply(call); afs_put_call(call); _leave(""); @@ -315,7 +294,6 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work) static int afs_deliver_cb_init_call_back_state(struct afs_call *call) { struct sockaddr_rxrpc srx; - struct afs_server *server; int ret; _enter(""); @@ -328,10 +306,9 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call) /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - server = afs_find_server(call->net, &srx); - if (!server) - return -ENOTCONN; - call->cm_server = server; + call->cm_server = afs_find_server(call->net, &srx); + if (!call->cm_server) + trace_afs_cm_no_server(call, &srx); return afs_queue_call_work(call); } @@ -341,8 +318,6 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call) */ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) { - struct sockaddr_rxrpc srx; - struct afs_server *server; struct afs_uuid *r; unsigned loop; __be32 *b; @@ -398,11 +373,11 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); - server = afs_find_server(call->net, &srx); - if (!server) - return -ENOTCONN; - call->cm_server = server; + rcu_read_lock(); + call->cm_server = afs_find_server_by_uuid(call->net, call->request); + rcu_read_unlock(); + if (!call->cm_server) + trace_afs_cm_no_server_u(call, call->request); return afs_queue_call_work(call); } @@ -500,9 +475,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) b = call->buffer; r = call->request; - r->time_low = ntohl(b[0]); - r->time_mid = ntohl(b[1]); - r->time_hi_and_version = ntohl(b[2]); + r->time_low = b[0]; + r->time_mid = htons(ntohl(b[1])); + r->time_hi_and_version = htons(ntohl(b[2])); r->clock_seq_hi_and_reserved = ntohl(b[3]); r->clock_seq_low = ntohl(b[4]); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index ba2b458..7d623008 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1,6 +1,6 @@ /* dir.c: AFS filesystem directory handling * - * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2002, 2018 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -10,27 +10,26 @@ */ #include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/pagemap.h> +#include <linux/swap.h> #include <linux/ctype.h> #include <linux/sched.h> -#include <linux/dns_resolver.h> +#include <linux/task_io_accounting_ops.h> #include "internal.h" +#include "xdr_fs.h" static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); -static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags); static int afs_dir_open(struct inode *inode, struct file *file); static int afs_readdir(struct file *file, struct dir_context *ctx); static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); static int afs_d_delete(const struct dentry *dentry); -static void afs_d_release(struct dentry *dentry); -static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, +static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); +static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, + loff_t fpos, u64 ino, unsigned dtype); static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl); static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); @@ -43,6 +42,14 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); +static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags); +static void afs_dir_invalidatepage(struct page *page, unsigned int offset, + unsigned int length); + +static int afs_dir_set_page_dirty(struct page *page) +{ + BUG(); /* This should never happen. */ +} const struct file_operations afs_dir_file_operations = { .open = afs_dir_open, @@ -67,15 +74,10 @@ const struct inode_operations afs_dir_inode_operations = { .listxattr = afs_listxattr, }; -const struct file_operations afs_dynroot_file_operations = { - .open = dcache_dir_open, - .release = dcache_dir_close, - .iterate_shared = dcache_readdir, - .llseek = dcache_dir_lseek, -}; - -const struct inode_operations afs_dynroot_inode_operations = { - .lookup = afs_dynroot_lookup, +const struct address_space_operations afs_dir_aops = { + .set_page_dirty = afs_dir_set_page_dirty, + .releasepage = afs_dir_releasepage, + .invalidatepage = afs_dir_invalidatepage, }; const struct dentry_operations afs_fs_dentry_operations = { @@ -85,91 +87,38 @@ const struct dentry_operations afs_fs_dentry_operations = { .d_automount = afs_d_automount, }; -#define AFS_DIR_HASHTBL_SIZE 128 -#define AFS_DIR_DIRENT_SIZE 32 -#define AFS_DIRENT_PER_BLOCK 64 - -union afs_dirent { - struct { - uint8_t valid; - uint8_t unused[1]; - __be16 hash_next; - __be32 vnode; - __be32 unique; - uint8_t name[16]; - uint8_t overflow[4]; /* if any char of the name (inc - * NUL) reaches here, consume - * the next dirent too */ - } u; - uint8_t extended_name[32]; -}; - -/* AFS directory page header (one at the beginning of every 2048-byte chunk) */ -struct afs_dir_pagehdr { - __be16 npages; - __be16 magic; -#define AFS_DIR_MAGIC htons(1234) - uint8_t nentries; - uint8_t bitmap[8]; - uint8_t pad[19]; -}; - -/* directory block layout */ -union afs_dir_block { - - struct afs_dir_pagehdr pagehdr; - - struct { - struct afs_dir_pagehdr pagehdr; - uint8_t alloc_ctrs[128]; - /* dir hash table */ - uint16_t hashtable[AFS_DIR_HASHTBL_SIZE]; - } hdr; - - union afs_dirent dirents[AFS_DIRENT_PER_BLOCK]; -}; - -/* layout on a linux VM page */ -struct afs_dir_page { - union afs_dir_block blocks[PAGE_SIZE / sizeof(union afs_dir_block)]; +struct afs_lookup_one_cookie { + struct dir_context ctx; + struct qstr name; + bool found; + struct afs_fid fid; }; struct afs_lookup_cookie { - struct dir_context ctx; - struct afs_fid fid; - struct qstr name; - int found; + struct dir_context ctx; + struct qstr name; + bool found; + bool one_only; + unsigned short nr_fids; + struct afs_file_status *statuses; + struct afs_callback *callbacks; + struct afs_fid fids[50]; }; /* * check that a directory page is valid */ -bool afs_dir_check_page(struct inode *dir, struct page *page) +static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page, + loff_t i_size) { - struct afs_dir_page *dbuf; - struct afs_vnode *vnode = AFS_FS_I(dir); - loff_t latter, i_size, off; + struct afs_xdr_dir_page *dbuf; + loff_t latter, off; int tmp, qty; -#if 0 - /* check the page count */ - qty = desc.size / sizeof(dbuf->blocks[0]); - if (qty == 0) - goto error; - - if (page->index == 0 && qty != ntohs(dbuf->blocks[0].pagehdr.npages)) { - printk("kAFS: %s(%lu): wrong number of dir blocks %d!=%hu\n", - __func__, dir->i_ino, qty, - ntohs(dbuf->blocks[0].pagehdr.npages)); - goto error; - } -#endif - /* Determine how many magic numbers there should be in this page, but * we must take care because the directory may change size under us. */ off = page_offset(page); - i_size = i_size_read(dir); if (i_size <= off) goto checked; @@ -178,112 +127,222 @@ bool afs_dir_check_page(struct inode *dir, struct page *page) qty = PAGE_SIZE; else qty = latter; - qty /= sizeof(union afs_dir_block); + qty /= sizeof(union afs_xdr_dir_block); /* check them */ - dbuf = page_address(page); + dbuf = kmap(page); for (tmp = 0; tmp < qty; tmp++) { - if (dbuf->blocks[tmp].pagehdr.magic != AFS_DIR_MAGIC) { + if (dbuf->blocks[tmp].hdr.magic != AFS_DIR_MAGIC) { printk("kAFS: %s(%lx): bad magic %d/%d is %04hx\n", - __func__, dir->i_ino, tmp, qty, - ntohs(dbuf->blocks[tmp].pagehdr.magic)); - trace_afs_dir_check_failed(vnode, off, i_size); + __func__, dvnode->vfs_inode.i_ino, tmp, qty, + ntohs(dbuf->blocks[tmp].hdr.magic)); + trace_afs_dir_check_failed(dvnode, off, i_size); + kunmap(page); goto error; } + + /* Make sure each block is NUL terminated so we can reasonably + * use string functions on it. The filenames in the page + * *should* be NUL-terminated anyway. + */ + ((u8 *)&dbuf->blocks[tmp])[AFS_DIR_BLOCK_SIZE - 1] = 0; } + kunmap(page); + checked: - SetPageChecked(page); + afs_stat_v(dvnode, n_read_dir); return true; error: - SetPageError(page); return false; } /* - * discard a page cached in the pagecache + * open an AFS directory file */ -static inline void afs_dir_put_page(struct page *page) +static int afs_dir_open(struct inode *inode, struct file *file) { - kunmap(page); - unlock_page(page); - put_page(page); + _enter("{%lu}", inode->i_ino); + + BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048); + BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32); + + if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(inode)->flags)) + return -ENOENT; + + return afs_open(inode, file); } /* - * get a page into the pagecache + * Read the directory into the pagecache in one go, scrubbing the previous + * contents. The list of pages is returned, pinning them so that they don't + * get reclaimed during the iteration. */ -static struct page *afs_dir_get_page(struct inode *dir, unsigned long index, - struct key *key) +static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) + __acquires(&dvnode->validate_lock) { - struct page *page; - _enter("{%lu},%lu", dir->i_ino, index); - - page = read_cache_page(dir->i_mapping, index, afs_page_filler, key); - if (!IS_ERR(page)) { - lock_page(page); - kmap(page); - if (unlikely(!PageChecked(page))) { - if (PageError(page)) - goto fail; - } + struct afs_read *req; + loff_t i_size; + int nr_pages, nr_inline, i, n; + int ret = -ENOMEM; + +retry: + i_size = i_size_read(&dvnode->vfs_inode); + if (i_size < 2048) + return ERR_PTR(-EIO); + if (i_size > 2048 * 1024) + return ERR_PTR(-EFBIG); + + _enter("%llu", i_size); + + /* Get a request record to hold the page list. We want to hold it + * inline if we can, but we don't want to make an order 1 allocation. + */ + nr_pages = (i_size + PAGE_SIZE - 1) / PAGE_SIZE; + nr_inline = nr_pages; + if (nr_inline > (PAGE_SIZE - sizeof(*req)) / sizeof(struct page *)) + nr_inline = 0; + + req = kzalloc(sizeof(*req) + sizeof(struct page *) * nr_inline, + GFP_KERNEL); + if (!req) + return ERR_PTR(-ENOMEM); + + refcount_set(&req->usage, 1); + req->nr_pages = nr_pages; + req->actual_len = i_size; /* May change */ + req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */ + req->data_version = dvnode->status.data_version; /* May change */ + if (nr_inline > 0) { + req->pages = req->array; + } else { + req->pages = kcalloc(nr_pages, sizeof(struct page *), + GFP_KERNEL); + if (!req->pages) + goto error; } - return page; -fail: - afs_dir_put_page(page); - _leave(" = -EIO"); - return ERR_PTR(-EIO); -} + /* Get a list of all the pages that hold or will hold the directory + * content. We need to fill in any gaps that we might find where the + * memory reclaimer has been at work. If there are any gaps, we will + * need to reread the entire directory contents. + */ + i = 0; + do { + n = find_get_pages_contig(dvnode->vfs_inode.i_mapping, i, + req->nr_pages - i, + req->pages + i); + _debug("find %u at %u/%u", n, i, req->nr_pages); + if (n == 0) { + gfp_t gfp = dvnode->vfs_inode.i_mapping->gfp_mask; + + if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_stat_v(dvnode, n_inval); + + ret = -ENOMEM; + req->pages[i] = __page_cache_alloc(gfp); + if (!req->pages[i]) + goto error; + ret = add_to_page_cache_lru(req->pages[i], + dvnode->vfs_inode.i_mapping, + i, gfp); + if (ret < 0) + goto error; + + set_page_private(req->pages[i], 1); + SetPagePrivate(req->pages[i]); + unlock_page(req->pages[i]); + i++; + } else { + i += n; + } + } while (i < req->nr_pages); -/* - * open an AFS directory file - */ -static int afs_dir_open(struct inode *inode, struct file *file) -{ - _enter("{%lu}", inode->i_ino); + /* If we're going to reload, we need to lock all the pages to prevent + * races. + */ + ret = -ERESTARTSYS; + if (down_read_killable(&dvnode->validate_lock) < 0) + goto error; - BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); - BUILD_BUG_ON(sizeof(union afs_dirent) != 32); + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + goto success; - if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(inode)->flags)) - return -ENOENT; + up_read(&dvnode->validate_lock); + if (down_write_killable(&dvnode->validate_lock) < 0) + goto error; - return afs_open(inode, file); + if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { + ret = afs_fetch_data(dvnode, key, req); + if (ret < 0) + goto error_unlock; + + task_io_account_read(PAGE_SIZE * req->nr_pages); + + if (req->len < req->file_size) + goto content_has_grown; + + /* Validate the data we just read. */ + ret = -EIO; + for (i = 0; i < req->nr_pages; i++) + if (!afs_dir_check_page(dvnode, req->pages[i], + req->actual_len)) + goto error_unlock; + + // TODO: Trim excess pages + + set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags); + } + + downgrade_write(&dvnode->validate_lock); +success: + return req; + +error_unlock: + up_write(&dvnode->validate_lock); +error: + afs_put_read(req); + _leave(" = %d", ret); + return ERR_PTR(ret); + +content_has_grown: + up_write(&dvnode->validate_lock); + afs_put_read(req); + goto retry; } /* * deal with one block in an AFS directory */ static int afs_dir_iterate_block(struct dir_context *ctx, - union afs_dir_block *block, + union afs_xdr_dir_block *block, unsigned blkoff) { - union afs_dirent *dire; + union afs_xdr_dirent *dire; unsigned offset, next, curr; size_t nlen; int tmp; _enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block); - curr = (ctx->pos - blkoff) / sizeof(union afs_dirent); + curr = (ctx->pos - blkoff) / sizeof(union afs_xdr_dirent); /* walk through the block, an entry at a time */ - for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries; - offset < AFS_DIRENT_PER_BLOCK; + for (offset = (blkoff == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS); + offset < AFS_DIR_SLOTS_PER_BLOCK; offset = next ) { next = offset + 1; /* skip entries marked unused in the bitmap */ - if (!(block->pagehdr.bitmap[offset / 8] & + if (!(block->hdr.bitmap[offset / 8] & (1 << (offset % 8)))) { _debug("ENT[%zu.%u]: unused", - blkoff / sizeof(union afs_dir_block), offset); + blkoff / sizeof(union afs_xdr_dir_block), offset); if (offset >= curr) ctx->pos = blkoff + - next * sizeof(union afs_dirent); + next * sizeof(union afs_xdr_dirent); continue; } @@ -291,34 +350,34 @@ static int afs_dir_iterate_block(struct dir_context *ctx, dire = &block->dirents[offset]; nlen = strnlen(dire->u.name, sizeof(*block) - - offset * sizeof(union afs_dirent)); + offset * sizeof(union afs_xdr_dirent)); _debug("ENT[%zu.%u]: %s %zu \"%s\"", - blkoff / sizeof(union afs_dir_block), offset, + blkoff / sizeof(union afs_xdr_dir_block), offset, (offset < curr ? "skip" : "fill"), nlen, dire->u.name); /* work out where the next possible entry is */ - for (tmp = nlen; tmp > 15; tmp -= sizeof(union afs_dirent)) { - if (next >= AFS_DIRENT_PER_BLOCK) { + for (tmp = nlen; tmp > 15; tmp -= sizeof(union afs_xdr_dirent)) { + if (next >= AFS_DIR_SLOTS_PER_BLOCK) { _debug("ENT[%zu.%u]:" " %u travelled beyond end dir block" " (len %u/%zu)", - blkoff / sizeof(union afs_dir_block), + blkoff / sizeof(union afs_xdr_dir_block), offset, next, tmp, nlen); return -EIO; } - if (!(block->pagehdr.bitmap[next / 8] & + if (!(block->hdr.bitmap[next / 8] & (1 << (next % 8)))) { _debug("ENT[%zu.%u]:" " %u unmarked extension (len %u/%zu)", - blkoff / sizeof(union afs_dir_block), + blkoff / sizeof(union afs_xdr_dir_block), offset, next, tmp, nlen); return -EIO; } _debug("ENT[%zu.%u]: ext %u/%zu", - blkoff / sizeof(union afs_dir_block), + blkoff / sizeof(union afs_xdr_dir_block), next, tmp, nlen); next++; } @@ -330,13 +389,14 @@ static int afs_dir_iterate_block(struct dir_context *ctx, /* found the next entry */ if (!dir_emit(ctx, dire->u.name, nlen, ntohl(dire->u.vnode), - ctx->actor == afs_lookup_filldir ? + (ctx->actor == afs_lookup_filldir || + ctx->actor == afs_lookup_one_filldir)? ntohl(dire->u.unique) : DT_UNKNOWN)) { _leave(" = 0 [full]"); return 0; } - ctx->pos = blkoff + next * sizeof(union afs_dirent); + ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent); } _leave(" = 1 [more]"); @@ -349,8 +409,10 @@ static int afs_dir_iterate_block(struct dir_context *ctx, static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, struct key *key) { - union afs_dir_block *dblock; - struct afs_dir_page *dbuf; + struct afs_vnode *dvnode = AFS_FS_I(dir); + struct afs_xdr_dir_page *dbuf; + union afs_xdr_dir_block *dblock; + struct afs_read *req; struct page *page; unsigned blkoff, limit; int ret; @@ -362,45 +424,54 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, return -ESTALE; } + req = afs_read_dir(dvnode, key); + if (IS_ERR(req)) + return PTR_ERR(req); + /* round the file position up to the next entry boundary */ - ctx->pos += sizeof(union afs_dirent) - 1; - ctx->pos &= ~(sizeof(union afs_dirent) - 1); + ctx->pos += sizeof(union afs_xdr_dirent) - 1; + ctx->pos &= ~(sizeof(union afs_xdr_dirent) - 1); /* walk through the blocks in sequence */ ret = 0; - while (ctx->pos < dir->i_size) { - blkoff = ctx->pos & ~(sizeof(union afs_dir_block) - 1); + while (ctx->pos < req->actual_len) { + blkoff = ctx->pos & ~(sizeof(union afs_xdr_dir_block) - 1); - /* fetch the appropriate page from the directory */ - page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key); - if (IS_ERR(page)) { - ret = PTR_ERR(page); + /* Fetch the appropriate page from the directory and re-add it + * to the LRU. + */ + page = req->pages[blkoff / PAGE_SIZE]; + if (!page) { + ret = -EIO; break; } + mark_page_accessed(page); limit = blkoff & ~(PAGE_SIZE - 1); - dbuf = page_address(page); + dbuf = kmap(page); /* deal with the individual blocks stashed on this page */ do { dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) / - sizeof(union afs_dir_block)]; + sizeof(union afs_xdr_dir_block)]; ret = afs_dir_iterate_block(ctx, dblock, blkoff); if (ret != 1) { - afs_dir_put_page(page); + kunmap(page); goto out; } - blkoff += sizeof(union afs_dir_block); + blkoff += sizeof(union afs_xdr_dir_block); } while (ctx->pos < dir->i_size && blkoff < limit); - afs_dir_put_page(page); + kunmap(page); ret = 0; } out: + up_read(&dvnode->validate_lock); + afs_put_read(req); _leave(" = %d", ret); return ret; } @@ -414,23 +485,23 @@ static int afs_readdir(struct file *file, struct dir_context *ctx) } /* - * search the directory for a name + * Search the directory for a single name * - if afs_dir_iterate_block() spots this function, it'll pass the FID * uniquifier through dtype */ -static int afs_lookup_filldir(struct dir_context *ctx, const char *name, - int nlen, loff_t fpos, u64 ino, unsigned dtype) +static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, + int nlen, loff_t fpos, u64 ino, unsigned dtype) { - struct afs_lookup_cookie *cookie = - container_of(ctx, struct afs_lookup_cookie, ctx); + struct afs_lookup_one_cookie *cookie = + container_of(ctx, struct afs_lookup_one_cookie, ctx); _enter("{%s,%u},%s,%u,,%llu,%u", cookie->name.name, cookie->name.len, name, nlen, (unsigned long long) ino, dtype); /* insanity checks first */ - BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048); - BUILD_BUG_ON(sizeof(union afs_dirent) != 32); + BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048); + BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32); if (cookie->name.len != nlen || memcmp(cookie->name.name, name, nlen) != 0) { @@ -447,15 +518,15 @@ static int afs_lookup_filldir(struct dir_context *ctx, const char *name, } /* - * do a lookup in a directory + * Do a lookup of a single name in a directory * - just returns the FID the dentry name maps to if found */ -static int afs_do_lookup(struct inode *dir, struct dentry *dentry, - struct afs_fid *fid, struct key *key) +static int afs_do_lookup_one(struct inode *dir, struct dentry *dentry, + struct afs_fid *fid, struct key *key) { struct afs_super_info *as = dir->i_sb->s_fs_info; - struct afs_lookup_cookie cookie = { - .ctx.actor = afs_lookup_filldir, + struct afs_lookup_one_cookie cookie = { + .ctx.actor = afs_lookup_one_filldir, .name = dentry->d_name, .fid.vid = as->volume->vid }; @@ -482,70 +553,265 @@ static int afs_do_lookup(struct inode *dir, struct dentry *dentry, } /* - * Probe to see if a cell may exist. This prevents positive dentries from - * being created unnecessarily. + * search the directory for a name + * - if afs_dir_iterate_block() spots this function, it'll pass the FID + * uniquifier through dtype */ -static int afs_probe_cell_name(struct dentry *dentry) +static int afs_lookup_filldir(struct dir_context *ctx, const char *name, + int nlen, loff_t fpos, u64 ino, unsigned dtype) { - struct afs_cell *cell; - const char *name = dentry->d_name.name; - size_t len = dentry->d_name.len; + struct afs_lookup_cookie *cookie = + container_of(ctx, struct afs_lookup_cookie, ctx); int ret; - /* Names prefixed with a dot are R/W mounts. */ - if (name[0] == '.') { - if (len == 1) - return -EINVAL; - name++; - len--; - } + _enter("{%s,%u},%s,%u,,%llu,%u", + cookie->name.name, cookie->name.len, name, nlen, + (unsigned long long) ino, dtype); - cell = afs_lookup_cell_rcu(afs_d2net(dentry), name, len); - if (!IS_ERR(cell)) { - afs_put_cell(afs_d2net(dentry), cell); - return 0; + /* insanity checks first */ + BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048); + BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32); + + if (cookie->found) { + if (cookie->nr_fids < 50) { + cookie->fids[cookie->nr_fids].vnode = ino; + cookie->fids[cookie->nr_fids].unique = dtype; + cookie->nr_fids++; + } + } else if (cookie->name.len == nlen && + memcmp(cookie->name.name, name, nlen) == 0) { + cookie->fids[0].vnode = ino; + cookie->fids[0].unique = dtype; + cookie->found = 1; + if (cookie->one_only) + return -1; } - ret = dns_query("afsdb", name, len, "ipv4", NULL, NULL); - if (ret == -ENODATA) - ret = -EDESTADDRREQ; + ret = cookie->nr_fids >= 50 ? -1 : 0; + _leave(" = %d", ret); return ret; } /* - * Try to auto mount the mountpoint with pseudo directory, if the autocell - * operation is setted. + * Do a lookup in a directory. We make use of bulk lookup to query a slew of + * files in one go and create inodes for them. The inode of the file we were + * asked for is returned. */ -static struct inode *afs_try_auto_mntpt(struct dentry *dentry, - struct inode *dir, struct afs_fid *fid) +static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, + struct key *key) { - struct afs_vnode *vnode = AFS_FS_I(dir); - struct inode *inode; - int ret = -ENOENT; + struct afs_lookup_cookie *cookie; + struct afs_cb_interest *cbi = NULL; + struct afs_super_info *as = dir->i_sb->s_fs_info; + struct afs_iget_data data; + struct afs_fs_cursor fc; + struct afs_vnode *dvnode = AFS_FS_I(dir); + struct inode *inode = NULL; + int ret, i; + + _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry); + + cookie = kzalloc(sizeof(struct afs_lookup_cookie), GFP_KERNEL); + if (!cookie) + return ERR_PTR(-ENOMEM); + + cookie->ctx.actor = afs_lookup_filldir; + cookie->name = dentry->d_name; + cookie->nr_fids = 1; /* slot 0 is saved for the fid we actually want */ - _enter("%p{%pd}, {%x:%u}", - dentry, dentry, vnode->fid.vid, vnode->fid.vnode); + read_seqlock_excl(&dvnode->cb_lock); + if (dvnode->cb_interest && + dvnode->cb_interest->server && + test_bit(AFS_SERVER_FL_NO_IBULK, &dvnode->cb_interest->server->flags)) + cookie->one_only = true; + read_sequnlock_excl(&dvnode->cb_lock); - if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags)) + for (i = 0; i < 50; i++) + cookie->fids[i].vid = as->volume->vid; + + /* search the directory */ + ret = afs_dir_iterate(dir, &cookie->ctx, key); + if (ret < 0) { + inode = ERR_PTR(ret); goto out; + } - ret = afs_probe_cell_name(dentry); - if (ret < 0) + inode = ERR_PTR(-ENOENT); + if (!cookie->found) goto out; - inode = afs_iget_pseudo_dir(dir->i_sb, false); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); + /* Check to see if we already have an inode for the primary fid. */ + data.volume = dvnode->volume; + data.fid = cookie->fids[0]; + inode = ilookup5(dir->i_sb, cookie->fids[0].vnode, afs_iget5_test, &data); + if (inode) + goto out; + + /* Need space for examining all the selected files */ + inode = ERR_PTR(-ENOMEM); + cookie->statuses = kcalloc(cookie->nr_fids, sizeof(struct afs_file_status), + GFP_KERNEL); + if (!cookie->statuses) goto out; + + cookie->callbacks = kcalloc(cookie->nr_fids, sizeof(struct afs_callback), + GFP_KERNEL); + if (!cookie->callbacks) + goto out_s; + + /* Try FS.InlineBulkStatus first. Abort codes for the individual + * lookups contained therein are stored in the reply without aborting + * the whole operation. + */ + if (cookie->one_only) + goto no_inline_bulk_status; + + inode = ERR_PTR(-ERESTARTSYS); + if (afs_begin_vnode_operation(&fc, dvnode, key)) { + while (afs_select_fileserver(&fc)) { + if (test_bit(AFS_SERVER_FL_NO_IBULK, + &fc.cbi->server->flags)) { + fc.ac.abort_code = RX_INVALID_OPERATION; + fc.ac.error = -ECONNABORTED; + break; + } + afs_fs_inline_bulk_status(&fc, + afs_v2net(dvnode), + cookie->fids, + cookie->statuses, + cookie->callbacks, + cookie->nr_fids, NULL); + } + + if (fc.ac.error == 0) + cbi = afs_get_cb_interest(fc.cbi); + if (fc.ac.abort_code == RX_INVALID_OPERATION) + set_bit(AFS_SERVER_FL_NO_IBULK, &fc.cbi->server->flags); + inode = ERR_PTR(afs_end_vnode_operation(&fc)); } - *fid = AFS_FS_I(inode)->fid; - _leave("= %p", inode); - return inode; + if (!IS_ERR(inode)) + goto success; + if (fc.ac.abort_code != RX_INVALID_OPERATION) + goto out_c; +no_inline_bulk_status: + /* We could try FS.BulkStatus next, but this aborts the entire op if + * any of the lookups fails - so, for the moment, revert to + * FS.FetchStatus for just the primary fid. + */ + cookie->nr_fids = 1; + inode = ERR_PTR(-ERESTARTSYS); + if (afs_begin_vnode_operation(&fc, dvnode, key)) { + while (afs_select_fileserver(&fc)) { + afs_fs_fetch_status(&fc, + afs_v2net(dvnode), + cookie->fids, + cookie->statuses, + cookie->callbacks, + NULL); + } + + if (fc.ac.error == 0) + cbi = afs_get_cb_interest(fc.cbi); + inode = ERR_PTR(afs_end_vnode_operation(&fc)); + } + + if (IS_ERR(inode)) + goto out_c; + + for (i = 0; i < cookie->nr_fids; i++) + cookie->statuses[i].abort_code = 0; + +success: + /* Turn all the files into inodes and save the first one - which is the + * one we actually want. + */ + if (cookie->statuses[0].abort_code != 0) + inode = ERR_PTR(afs_abort_to_error(cookie->statuses[0].abort_code)); + + for (i = 0; i < cookie->nr_fids; i++) { + struct inode *ti; + + if (cookie->statuses[i].abort_code != 0) + continue; + + ti = afs_iget(dir->i_sb, key, &cookie->fids[i], + &cookie->statuses[i], + &cookie->callbacks[i], + cbi); + if (i == 0) { + inode = ti; + } else { + if (!IS_ERR(ti)) + iput(ti); + } + } + +out_c: + afs_put_cb_interest(afs_v2net(dvnode), cbi); + kfree(cookie->callbacks); +out_s: + kfree(cookie->statuses); out: - _leave("= %d", ret); - return ERR_PTR(ret); + kfree(cookie); + return inode; +} + +/* + * Look up an entry in a directory with @sys substitution. + */ +static struct dentry *afs_lookup_atsys(struct inode *dir, struct dentry *dentry, + struct key *key) +{ + struct afs_sysnames *subs; + struct afs_net *net = afs_i2net(dir); + struct dentry *ret; + char *buf, *p, *name; + int len, i; + + _enter(""); + + ret = ERR_PTR(-ENOMEM); + p = buf = kmalloc(AFSNAMEMAX, GFP_KERNEL); + if (!buf) + goto out_p; + if (dentry->d_name.len > 4) { + memcpy(p, dentry->d_name.name, dentry->d_name.len - 4); + p += dentry->d_name.len - 4; + } + + /* There is an ordered list of substitutes that we have to try. */ + read_lock(&net->sysnames_lock); + subs = net->sysnames; + refcount_inc(&subs->usage); + read_unlock(&net->sysnames_lock); + + for (i = 0; i < subs->nr; i++) { + name = subs->subs[i]; + len = dentry->d_name.len - 4 + strlen(name); + if (len >= AFSNAMEMAX) { + ret = ERR_PTR(-ENAMETOOLONG); + goto out_s; + } + + strcpy(p, name); + ret = lookup_one_len(buf, dentry->d_parent, len); + if (IS_ERR(ret) || d_is_positive(ret)) + goto out_s; + dput(ret); + } + + /* We don't want to d_add() the @sys dentry here as we don't want to + * the cached dentry to hide changes to the sysnames list. + */ + ret = NULL; +out_s: + afs_put_sysnames(subs); + kfree(buf); +out_p: + key_put(key); + return ret; } /* @@ -554,16 +820,13 @@ out: static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct afs_vnode *vnode; - struct afs_fid fid; + struct afs_vnode *dvnode = AFS_FS_I(dir); struct inode *inode; struct key *key; int ret; - vnode = AFS_FS_I(dir); - _enter("{%x:%u},%p{%pd},", - vnode->fid.vid, vnode->fid.vnode, dentry, dentry); + dvnode->fid.vid, dvnode->fid.vnode, dentry, dentry); ASSERTCMP(d_inode(dentry), ==, NULL); @@ -572,28 +835,37 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENAMETOOLONG); } - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { + if (test_bit(AFS_VNODE_DELETED, &dvnode->flags)) { _leave(" = -ESTALE"); return ERR_PTR(-ESTALE); } - key = afs_request_key(vnode->volume->cell); + key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { _leave(" = %ld [key]", PTR_ERR(key)); return ERR_CAST(key); } - ret = afs_validate(vnode, key); + ret = afs_validate(dvnode, key); if (ret < 0) { key_put(key); _leave(" = %d [val]", ret); return ERR_PTR(ret); } - ret = afs_do_lookup(dir, dentry, &fid, key); - if (ret < 0) { + if (dentry->d_name.len >= 4 && + dentry->d_name.name[dentry->d_name.len - 4] == '@' && + dentry->d_name.name[dentry->d_name.len - 3] == 's' && + dentry->d_name.name[dentry->d_name.len - 2] == 'y' && + dentry->d_name.name[dentry->d_name.len - 1] == 's') + return afs_lookup_atsys(dir, dentry, key); + + afs_stat_v(dvnode, n_lookup); + inode = afs_do_lookup(dir, dentry, key); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); if (ret == -ENOENT) { - inode = afs_try_auto_mntpt(dentry, dir, &fid); + inode = afs_try_auto_mntpt(dentry, dir); if (!IS_ERR(inode)) { key_put(key); goto success; @@ -611,10 +883,9 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, _leave(" = %d [do]", ret); return ERR_PTR(ret); } - dentry->d_fsdata = (void *)(unsigned long) vnode->status.data_version; + dentry->d_fsdata = (void *)(unsigned long)dvnode->status.data_version; /* instantiate the dentry */ - inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL, NULL); key_put(key); if (IS_ERR(inode)) { _leave(" = %ld", PTR_ERR(inode)); @@ -623,9 +894,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, success: d_add(dentry, inode); - _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%u }", - fid.vnode, - fid.unique, + _leave(" = 0 { ino=%lu v=%u }", d_inode(dentry)->i_ino, d_inode(dentry)->i_generation); @@ -633,67 +902,23 @@ success: } /* - * Look up an entry in a dynroot directory. - */ -static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - struct afs_vnode *vnode; - struct afs_fid fid; - struct inode *inode; - int ret; - - vnode = AFS_FS_I(dir); - - _enter("%pd", dentry); - - ASSERTCMP(d_inode(dentry), ==, NULL); - - if (dentry->d_name.len >= AFSNAMEMAX) { - _leave(" = -ENAMETOOLONG"); - return ERR_PTR(-ENAMETOOLONG); - } - - inode = afs_try_auto_mntpt(dentry, dir, &fid); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - if (ret == -ENOENT) { - d_add(dentry, NULL); - _leave(" = NULL [negative]"); - return NULL; - } - _leave(" = %d [do]", ret); - return ERR_PTR(ret); - } - - d_add(dentry, inode); - _leave(" = 0 { ino=%lu v=%u }", - d_inode(dentry)->i_ino, d_inode(dentry)->i_generation); - return NULL; -} - -/* * check that a dentry lookup hit has found a valid entry * - NOTE! the hit can be a negative hit too, so we can't assume we have an * inode */ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) { - struct afs_super_info *as = dentry->d_sb->s_fs_info; struct afs_vnode *vnode, *dir; struct afs_fid uninitialized_var(fid); struct dentry *parent; struct inode *inode; struct key *key; - void *dir_version; + long dir_version, de_version; int ret; if (flags & LOOKUP_RCU) return -ECHILD; - if (as->dyn_root) - return 1; - if (d_really_is_positive(dentry)) { vnode = AFS_FS_I(d_inode(dentry)); _enter("{v={%x:%u} n=%pd fl=%lx},", @@ -729,14 +954,25 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) goto out_bad_parent; } - dir_version = (void *) (unsigned long) dir->status.data_version; - if (dentry->d_fsdata == dir_version) - goto out_valid; /* the dir contents are unchanged */ + /* We only need to invalidate a dentry if the server's copy changed + * behind our back. If we made the change, it's no problem. Note that + * on a 32-bit system, we only have 32 bits in the dentry to store the + * version. + */ + dir_version = (long)dir->status.data_version; + de_version = (long)dentry->d_fsdata; + if (de_version == dir_version) + goto out_valid; + + dir_version = (long)dir->invalid_before; + if (de_version - dir_version >= 0) + goto out_valid; _debug("dir modified"); + afs_stat_v(dir, n_reval); /* search the directory for this vnode */ - ret = afs_do_lookup(&dir->vfs_inode, dentry, &fid, key); + ret = afs_do_lookup_one(&dir->vfs_inode, dentry, &fid, key); switch (ret) { case 0: /* the filename maps to something */ @@ -789,7 +1025,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) } out_valid: - dentry->d_fsdata = dir_version; + dentry->d_fsdata = (void *)dir_version; dput(parent); key_put(key); _leave(" = 1 [valid]"); @@ -840,7 +1076,7 @@ zap: /* * handle dentry release */ -static void afs_d_release(struct dentry *dentry) +void afs_d_release(struct dentry *dentry) { _enter("%pd", dentry); } @@ -854,6 +1090,7 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc, struct afs_file_status *newstatus, struct afs_callback *newcb) { + struct afs_vnode *vnode; struct inode *inode; if (fc->ac.error < 0) @@ -871,6 +1108,8 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc, return; } + vnode = AFS_FS_I(inode); + set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); d_add(new_dentry, inode); } @@ -885,6 +1124,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct afs_vnode *dvnode = AFS_FS_I(dir); struct afs_fid newfid; struct key *key; + u64 data_version = dvnode->status.data_version; int ret; mode |= S_IFDIR; @@ -901,8 +1141,8 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; - afs_fs_create(&fc, dentry->d_name.name, mode, + fc.cb_break = afs_calc_vnode_cb_break(dvnode); + afs_fs_create(&fc, dentry->d_name.name, mode, data_version, &newfid, &newstatus, &newcb); } @@ -916,6 +1156,11 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto error_key; } + if (ret == 0 && + test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_edit_dir_add(dvnode, &dentry->d_name, &newfid, + afs_edit_dir_for_create); + key_put(key); _leave(" = 0"); return 0; @@ -939,6 +1184,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry) clear_nlink(&vnode->vfs_inode); set_bit(AFS_VNODE_DELETED, &vnode->flags); clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); } } @@ -950,6 +1196,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) struct afs_fs_cursor fc; struct afs_vnode *dvnode = AFS_FS_I(dir); struct key *key; + u64 data_version = dvnode->status.data_version; int ret; _enter("{%x:%u},{%pd}", @@ -964,14 +1211,19 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; - afs_fs_remove(&fc, dentry->d_name.name, true); + fc.cb_break = afs_calc_vnode_cb_break(dvnode); + afs_fs_remove(&fc, dentry->d_name.name, true, + data_version); } afs_vnode_commit_status(&fc, dvnode, fc.cb_break); ret = afs_end_vnode_operation(&fc); - if (ret == 0) + if (ret == 0) { afs_dir_remove_subdir(dentry); + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_edit_dir_remove(dvnode, &dentry->d_name, + afs_edit_dir_for_rmdir); + } } key_put(key); @@ -1036,6 +1288,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode; struct key *key; unsigned long d_version = (unsigned long)dentry->d_fsdata; + u64 data_version = dvnode->status.data_version; int ret; _enter("{%x:%u},{%pd}", @@ -1061,8 +1314,9 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; - afs_fs_remove(&fc, dentry->d_name.name, false); + fc.cb_break = afs_calc_vnode_cb_break(dvnode); + afs_fs_remove(&fc, dentry->d_name.name, false, + data_version); } afs_vnode_commit_status(&fc, dvnode, fc.cb_break); @@ -1071,6 +1325,10 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) ret = afs_dir_remove_link( dentry, key, d_version, (unsigned long)dvnode->status.data_version); + if (ret == 0 && + test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_edit_dir_remove(dvnode, &dentry->d_name, + afs_edit_dir_for_unlink); } error_key: @@ -1092,6 +1350,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct afs_vnode *dvnode = AFS_FS_I(dir); struct afs_fid newfid; struct key *key; + u64 data_version = dvnode->status.data_version; int ret; mode |= S_IFREG; @@ -1112,8 +1371,8 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; - afs_fs_create(&fc, dentry->d_name.name, mode, + fc.cb_break = afs_calc_vnode_cb_break(dvnode); + afs_fs_create(&fc, dentry->d_name.name, mode, data_version, &newfid, &newstatus, &newcb); } @@ -1127,6 +1386,10 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, goto error_key; } + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_edit_dir_add(dvnode, &dentry->d_name, &newfid, + afs_edit_dir_for_create); + key_put(key); _leave(" = 0"); return 0; @@ -1148,10 +1411,12 @@ static int afs_link(struct dentry *from, struct inode *dir, struct afs_fs_cursor fc; struct afs_vnode *dvnode, *vnode; struct key *key; + u64 data_version; int ret; vnode = AFS_FS_I(d_inode(from)); dvnode = AFS_FS_I(dir); + data_version = dvnode->status.data_version; _enter("{%x:%u},{%x:%u},{%pd}", vnode->fid.vid, vnode->fid.vnode, @@ -1176,9 +1441,9 @@ static int afs_link(struct dentry *from, struct inode *dir, } while (afs_select_fileserver(&fc)) { - fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; - fc.cb_break_2 = vnode->cb_break + vnode->cb_s_break; - afs_fs_link(&fc, vnode, dentry->d_name.name); + fc.cb_break = afs_calc_vnode_cb_break(dvnode); + fc.cb_break_2 = afs_calc_vnode_cb_break(vnode); + afs_fs_link(&fc, vnode, dentry->d_name.name, data_version); } afs_vnode_commit_status(&fc, dvnode, fc.cb_break); @@ -1194,6 +1459,10 @@ static int afs_link(struct dentry *from, struct inode *dir, goto error_key; } + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_edit_dir_add(dvnode, &dentry->d_name, &vnode->fid, + afs_edit_dir_for_link); + key_put(key); _leave(" = 0"); return 0; @@ -1217,6 +1486,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, struct afs_vnode *dvnode = AFS_FS_I(dir); struct afs_fid newfid; struct key *key; + u64 data_version = dvnode->status.data_version; int ret; _enter("{%x:%u},{%pd},%s", @@ -1240,8 +1510,9 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, dvnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = dvnode->cb_break + dvnode->cb_s_break; - afs_fs_symlink(&fc, dentry->d_name.name, content, + fc.cb_break = afs_calc_vnode_cb_break(dvnode); + afs_fs_symlink(&fc, dentry->d_name.name, + content, data_version, &newfid, &newstatus); } @@ -1255,6 +1526,10 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, goto error_key; } + if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_edit_dir_add(dvnode, &dentry->d_name, &newfid, + afs_edit_dir_for_symlink); + key_put(key); _leave(" = 0"); return 0; @@ -1277,6 +1552,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, struct afs_fs_cursor fc; struct afs_vnode *orig_dvnode, *new_dvnode, *vnode; struct key *key; + u64 orig_data_version, new_data_version; + bool new_negative = d_is_negative(new_dentry); int ret; if (flags) @@ -1285,6 +1562,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, vnode = AFS_FS_I(d_inode(old_dentry)); orig_dvnode = AFS_FS_I(old_dir); new_dvnode = AFS_FS_I(new_dir); + orig_data_version = orig_dvnode->status.data_version; + new_data_version = new_dvnode->status.data_version; _enter("{%x:%u},{%x:%u},{%x:%u},{%pd}", orig_dvnode->fid.vid, orig_dvnode->fid.vnode, @@ -1307,10 +1586,11 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, } } while (afs_select_fileserver(&fc)) { - fc.cb_break = orig_dvnode->cb_break + orig_dvnode->cb_s_break; - fc.cb_break_2 = new_dvnode->cb_break + new_dvnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(orig_dvnode); + fc.cb_break_2 = afs_calc_vnode_cb_break(new_dvnode); afs_fs_rename(&fc, old_dentry->d_name.name, - new_dvnode, new_dentry->d_name.name); + new_dvnode, new_dentry->d_name.name, + orig_data_version, new_data_version); } afs_vnode_commit_status(&fc, orig_dvnode, fc.cb_break); @@ -1322,9 +1602,68 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, goto error_key; } + if (ret == 0) { + if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags)) + afs_edit_dir_remove(orig_dvnode, &old_dentry->d_name, + afs_edit_dir_for_rename); + + if (!new_negative && + test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags)) + afs_edit_dir_remove(new_dvnode, &new_dentry->d_name, + afs_edit_dir_for_rename); + + if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags)) + afs_edit_dir_add(new_dvnode, &new_dentry->d_name, + &vnode->fid, afs_edit_dir_for_rename); + } + error_key: key_put(key); error: _leave(" = %d", ret); return ret; } + +/* + * Release a directory page and clean up its private state if it's not busy + * - return true if the page can now be released, false if not + */ +static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct afs_vnode *dvnode = AFS_FS_I(page->mapping->host); + + _enter("{{%x:%u}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index); + + set_page_private(page, 0); + ClearPagePrivate(page); + + /* The directory will need reloading. */ + if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_stat_v(dvnode, n_relpg); + return 1; +} + +/* + * invalidate part or all of a page + * - release a page and clean up its private data if offset is 0 (indicating + * the entire page) + */ +static void afs_dir_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + struct afs_vnode *dvnode = AFS_FS_I(page->mapping->host); + + _enter("{%lu},%u,%u", page->index, offset, length); + + BUG_ON(!PageLocked(page)); + + /* The directory will need reloading. */ + if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) + afs_stat_v(dvnode, n_inval); + + /* we clean up only if the entire page is being invalidated */ + if (offset == 0 && length == PAGE_SIZE) { + set_page_private(page, 0); + ClearPagePrivate(page); + } +} diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c new file mode 100644 index 0000000..8b400f5 --- /dev/null +++ b/fs/afs/dir_edit.c @@ -0,0 +1,505 @@ +/* AFS filesystem directory editing + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/pagemap.h> +#include <linux/iversion.h> +#include "internal.h" +#include "xdr_fs.h" + +/* + * Find a number of contiguous clear bits in a directory block bitmask. + * + * There are 64 slots, which means we can load the entire bitmap into a + * variable. The first bit doesn't count as it corresponds to the block header + * slot. nr_slots is between 1 and 9. + */ +static int afs_find_contig_bits(union afs_xdr_dir_block *block, unsigned int nr_slots) +{ + u64 bitmap; + u32 mask; + int bit, n; + + bitmap = (u64)block->hdr.bitmap[0] << 0 * 8; + bitmap |= (u64)block->hdr.bitmap[1] << 1 * 8; + bitmap |= (u64)block->hdr.bitmap[2] << 2 * 8; + bitmap |= (u64)block->hdr.bitmap[3] << 3 * 8; + bitmap |= (u64)block->hdr.bitmap[4] << 4 * 8; + bitmap |= (u64)block->hdr.bitmap[5] << 5 * 8; + bitmap |= (u64)block->hdr.bitmap[6] << 6 * 8; + bitmap |= (u64)block->hdr.bitmap[7] << 7 * 8; + bitmap >>= 1; /* The first entry is metadata */ + bit = 1; + mask = (1 << nr_slots) - 1; + + do { + if (sizeof(unsigned long) == 8) + n = ffz(bitmap); + else + n = ((u32)bitmap) != 0 ? + ffz((u32)bitmap) : + ffz((u32)(bitmap >> 32)) + 32; + bitmap >>= n; + bit += n; + + if ((bitmap & mask) == 0) { + if (bit > 64 - nr_slots) + return -1; + return bit; + } + + n = __ffs(bitmap); + bitmap >>= n; + bit += n; + } while (bitmap); + + return -1; +} + +/* + * Set a number of contiguous bits in the directory block bitmap. + */ +static void afs_set_contig_bits(union afs_xdr_dir_block *block, + int bit, unsigned int nr_slots) +{ + u64 mask, before, after; + + mask = (1 << nr_slots) - 1; + mask <<= bit; + + before = *(u64 *)block->hdr.bitmap; + + block->hdr.bitmap[0] |= (u8)(mask >> 0 * 8); + block->hdr.bitmap[1] |= (u8)(mask >> 1 * 8); + block->hdr.bitmap[2] |= (u8)(mask >> 2 * 8); + block->hdr.bitmap[3] |= (u8)(mask >> 3 * 8); + block->hdr.bitmap[4] |= (u8)(mask >> 4 * 8); + block->hdr.bitmap[5] |= (u8)(mask >> 5 * 8); + block->hdr.bitmap[6] |= (u8)(mask >> 6 * 8); + block->hdr.bitmap[7] |= (u8)(mask >> 7 * 8); + + after = *(u64 *)block->hdr.bitmap; +} + +/* + * Clear a number of contiguous bits in the directory block bitmap. + */ +static void afs_clear_contig_bits(union afs_xdr_dir_block *block, + int bit, unsigned int nr_slots) +{ + u64 mask, before, after; + + mask = (1 << nr_slots) - 1; + mask <<= bit; + + before = *(u64 *)block->hdr.bitmap; + + block->hdr.bitmap[0] &= ~(u8)(mask >> 0 * 8); + block->hdr.bitmap[1] &= ~(u8)(mask >> 1 * 8); + block->hdr.bitmap[2] &= ~(u8)(mask >> 2 * 8); + block->hdr.bitmap[3] &= ~(u8)(mask >> 3 * 8); + block->hdr.bitmap[4] &= ~(u8)(mask >> 4 * 8); + block->hdr.bitmap[5] &= ~(u8)(mask >> 5 * 8); + block->hdr.bitmap[6] &= ~(u8)(mask >> 6 * 8); + block->hdr.bitmap[7] &= ~(u8)(mask >> 7 * 8); + + after = *(u64 *)block->hdr.bitmap; +} + +/* + * Scan a directory block looking for a dirent of the right name. + */ +static int afs_dir_scan_block(union afs_xdr_dir_block *block, struct qstr *name, + unsigned int blocknum) +{ + union afs_xdr_dirent *de; + u64 bitmap; + int d, len, n; + + _enter(""); + + bitmap = (u64)block->hdr.bitmap[0] << 0 * 8; + bitmap |= (u64)block->hdr.bitmap[1] << 1 * 8; + bitmap |= (u64)block->hdr.bitmap[2] << 2 * 8; + bitmap |= (u64)block->hdr.bitmap[3] << 3 * 8; + bitmap |= (u64)block->hdr.bitmap[4] << 4 * 8; + bitmap |= (u64)block->hdr.bitmap[5] << 5 * 8; + bitmap |= (u64)block->hdr.bitmap[6] << 6 * 8; + bitmap |= (u64)block->hdr.bitmap[7] << 7 * 8; + + for (d = (blocknum == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS); + d < AFS_DIR_SLOTS_PER_BLOCK; + d++) { + if (!((bitmap >> d) & 1)) + continue; + de = &block->dirents[d]; + if (de->u.valid != 1) + continue; + + /* The block was NUL-terminated by afs_dir_check_page(). */ + len = strlen(de->u.name); + if (len == name->len && + memcmp(de->u.name, name->name, name->len) == 0) + return d; + + n = round_up(12 + len + 1 + 4, AFS_DIR_DIRENT_SIZE); + n /= AFS_DIR_DIRENT_SIZE; + d += n - 1; + } + + return -1; +} + +/* + * Initialise a new directory block. Note that block 0 is special and contains + * some extra metadata. + */ +static void afs_edit_init_block(union afs_xdr_dir_block *meta, + union afs_xdr_dir_block *block, int block_num) +{ + memset(block, 0, sizeof(*block)); + block->hdr.npages = htons(1); + block->hdr.magic = AFS_DIR_MAGIC; + block->hdr.bitmap[0] = 1; + + if (block_num == 0) { + block->hdr.bitmap[0] = 0xff; + block->hdr.bitmap[1] = 0x1f; + memset(block->meta.alloc_ctrs, + AFS_DIR_SLOTS_PER_BLOCK, + sizeof(block->meta.alloc_ctrs)); + meta->meta.alloc_ctrs[0] = + AFS_DIR_SLOTS_PER_BLOCK - AFS_DIR_RESV_BLOCKS0; + } + + if (block_num < AFS_DIR_BLOCKS_WITH_CTR) + meta->meta.alloc_ctrs[block_num] = + AFS_DIR_SLOTS_PER_BLOCK - AFS_DIR_RESV_BLOCKS; +} + +/* + * Edit a directory's file data to add a new directory entry. Doing this after + * create, mkdir, symlink, link or rename if the data version number is + * incremented by exactly one avoids the need to re-download the entire + * directory contents. + * + * The caller must hold the inode locked. + */ +void afs_edit_dir_add(struct afs_vnode *vnode, + struct qstr *name, struct afs_fid *new_fid, + enum afs_edit_dir_reason why) +{ + union afs_xdr_dir_block *meta, *block; + struct afs_xdr_dir_page *meta_page, *dir_page; + union afs_xdr_dirent *de; + struct page *page0, *page; + unsigned int need_slots, nr_blocks, b; + pgoff_t index; + loff_t i_size; + gfp_t gfp; + int slot; + + _enter(",,{%d,%s},", name->len, name->name); + + i_size = i_size_read(&vnode->vfs_inode); + if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || + (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + return; + } + + gfp = vnode->vfs_inode.i_mapping->gfp_mask; + page0 = find_or_create_page(vnode->vfs_inode.i_mapping, 0, gfp); + if (!page0) { + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + _leave(" [fgp]"); + return; + } + + /* Work out how many slots we're going to need. */ + need_slots = round_up(12 + name->len + 1 + 4, AFS_DIR_DIRENT_SIZE); + need_slots /= AFS_DIR_DIRENT_SIZE; + + meta_page = kmap(page0); + meta = &meta_page->blocks[0]; + if (i_size == 0) + goto new_directory; + nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; + + /* Find a block that has sufficient slots available. Each VM page + * contains two or more directory blocks. + */ + for (b = 0; b < nr_blocks + 1; b++) { + /* If the directory extended into a new page, then we need to + * tack a new page on the end. + */ + index = b / AFS_DIR_BLOCKS_PER_PAGE; + if (index == 0) { + page = page0; + dir_page = meta_page; + } else { + if (nr_blocks >= AFS_DIR_MAX_BLOCKS) + goto error; + gfp = vnode->vfs_inode.i_mapping->gfp_mask; + page = find_or_create_page(vnode->vfs_inode.i_mapping, + index, gfp); + if (!page) + goto error; + if (!PagePrivate(page)) { + set_page_private(page, 1); + SetPagePrivate(page); + } + dir_page = kmap(page); + } + + /* Abandon the edit if we got a callback break. */ + if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + goto invalidated; + + block = &dir_page->blocks[b % AFS_DIR_BLOCKS_PER_PAGE]; + + _debug("block %u: %2u %3u %u", + b, + (b < AFS_DIR_BLOCKS_WITH_CTR) ? meta->meta.alloc_ctrs[b] : 99, + ntohs(block->hdr.npages), + ntohs(block->hdr.magic)); + + /* Initialise the block if necessary. */ + if (b == nr_blocks) { + _debug("init %u", b); + afs_edit_init_block(meta, block, b); + i_size_write(&vnode->vfs_inode, (b + 1) * AFS_DIR_BLOCK_SIZE); + } + + /* Only lower dir pages have a counter in the header. */ + if (b >= AFS_DIR_BLOCKS_WITH_CTR || + meta->meta.alloc_ctrs[b] >= need_slots) { + /* We need to try and find one or more consecutive + * slots to hold the entry. + */ + slot = afs_find_contig_bits(block, need_slots); + if (slot >= 0) { + _debug("slot %u", slot); + goto found_space; + } + } + + if (page != page0) { + unlock_page(page); + kunmap(page); + put_page(page); + } + } + + /* There are no spare slots of sufficient size, yet the operation + * succeeded. Download the directory again. + */ + trace_afs_edit_dir(vnode, why, afs_edit_dir_create_nospc, 0, 0, 0, 0, name->name); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + goto out_unmap; + +new_directory: + afs_edit_init_block(meta, meta, 0); + i_size = AFS_DIR_BLOCK_SIZE; + i_size_write(&vnode->vfs_inode, i_size); + slot = AFS_DIR_RESV_BLOCKS0; + page = page0; + block = meta; + nr_blocks = 1; + b = 0; + +found_space: + /* Set the dirent slot. */ + trace_afs_edit_dir(vnode, why, afs_edit_dir_create, b, slot, + new_fid->vnode, new_fid->unique, name->name); + de = &block->dirents[slot]; + de->u.valid = 1; + de->u.unused[0] = 0; + de->u.hash_next = 0; // TODO: Really need to maintain this + de->u.vnode = htonl(new_fid->vnode); + de->u.unique = htonl(new_fid->unique); + memcpy(de->u.name, name->name, name->len + 1); + de->u.name[name->len] = 0; + + /* Adjust the bitmap. */ + afs_set_contig_bits(block, slot, need_slots); + if (page != page0) { + unlock_page(page); + kunmap(page); + put_page(page); + } + + /* Adjust the allocation counter. */ + if (b < AFS_DIR_BLOCKS_WITH_CTR) + meta->meta.alloc_ctrs[b] -= need_slots; + + inode_inc_iversion_raw(&vnode->vfs_inode); + afs_stat_v(vnode, n_dir_cr); + _debug("Insert %s in %u[%u]", name->name, b, slot); + +out_unmap: + unlock_page(page0); + kunmap(page0); + put_page(page0); + _leave(""); + return; + +invalidated: + trace_afs_edit_dir(vnode, why, afs_edit_dir_create_inval, 0, 0, 0, 0, name->name); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + if (page != page0) { + kunmap(page); + put_page(page); + } + goto out_unmap; + +error: + trace_afs_edit_dir(vnode, why, afs_edit_dir_create_error, 0, 0, 0, 0, name->name); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + goto out_unmap; +} + +/* + * Edit a directory's file data to remove a new directory entry. Doing this + * after unlink, rmdir or rename if the data version number is incremented by + * exactly one avoids the need to re-download the entire directory contents. + * + * The caller must hold the inode locked. + */ +void afs_edit_dir_remove(struct afs_vnode *vnode, + struct qstr *name, enum afs_edit_dir_reason why) +{ + struct afs_xdr_dir_page *meta_page, *dir_page; + union afs_xdr_dir_block *meta, *block; + union afs_xdr_dirent *de; + struct page *page0, *page; + unsigned int need_slots, nr_blocks, b; + pgoff_t index; + loff_t i_size; + int slot; + + _enter(",,{%d,%s},", name->len, name->name); + + i_size = i_size_read(&vnode->vfs_inode); + if (i_size < AFS_DIR_BLOCK_SIZE || + i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || + (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + return; + } + nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; + + page0 = find_lock_page(vnode->vfs_inode.i_mapping, 0); + if (!page0) { + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + _leave(" [fgp]"); + return; + } + + /* Work out how many slots we're going to discard. */ + need_slots = round_up(12 + name->len + 1 + 4, AFS_DIR_DIRENT_SIZE); + need_slots /= AFS_DIR_DIRENT_SIZE; + + meta_page = kmap(page0); + meta = &meta_page->blocks[0]; + + /* Find a page that has sufficient slots available. Each VM page + * contains two or more directory blocks. + */ + for (b = 0; b < nr_blocks; b++) { + index = b / AFS_DIR_BLOCKS_PER_PAGE; + if (index != 0) { + page = find_lock_page(vnode->vfs_inode.i_mapping, index); + if (!page) + goto error; + dir_page = kmap(page); + } else { + page = page0; + dir_page = meta_page; + } + + /* Abandon the edit if we got a callback break. */ + if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + goto invalidated; + + block = &dir_page->blocks[b % AFS_DIR_BLOCKS_PER_PAGE]; + + if (b > AFS_DIR_BLOCKS_WITH_CTR || + meta->meta.alloc_ctrs[b] <= AFS_DIR_SLOTS_PER_BLOCK - 1 - need_slots) { + slot = afs_dir_scan_block(block, name, b); + if (slot >= 0) + goto found_dirent; + } + + if (page != page0) { + unlock_page(page); + kunmap(page); + put_page(page); + } + } + + /* Didn't find the dirent to clobber. Download the directory again. */ + trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_noent, + 0, 0, 0, 0, name->name); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + goto out_unmap; + +found_dirent: + de = &block->dirents[slot]; + + trace_afs_edit_dir(vnode, why, afs_edit_dir_delete, b, slot, + ntohl(de->u.vnode), ntohl(de->u.unique), + name->name); + + memset(de, 0, sizeof(*de) * need_slots); + + /* Adjust the bitmap. */ + afs_clear_contig_bits(block, slot, need_slots); + if (page != page0) { + unlock_page(page); + kunmap(page); + put_page(page); + } + + /* Adjust the allocation counter. */ + if (b < AFS_DIR_BLOCKS_WITH_CTR) + meta->meta.alloc_ctrs[b] += need_slots; + + inode_set_iversion_raw(&vnode->vfs_inode, vnode->status.data_version); + afs_stat_v(vnode, n_dir_rm); + _debug("Remove %s from %u[%u]", name->name, b, slot); + +out_unmap: + unlock_page(page0); + kunmap(page0); + put_page(page0); + _leave(""); + return; + +invalidated: + trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_inval, + 0, 0, 0, 0, name->name); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + if (page != page0) { + unlock_page(page); + kunmap(page); + put_page(page); + } + goto out_unmap; + +error: + trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_error, + 0, 0, 0, 0, name->name); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + goto out_unmap; +} diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c new file mode 100644 index 0000000..983f394 --- /dev/null +++ b/fs/afs/dynroot.c @@ -0,0 +1,209 @@ +/* dir.c: AFS dynamic root handling + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/dns_resolver.h> +#include "internal.h" + +const struct file_operations afs_dynroot_file_operations = { + .open = dcache_dir_open, + .release = dcache_dir_close, + .iterate_shared = dcache_readdir, + .llseek = dcache_dir_lseek, +}; + +/* + * Probe to see if a cell may exist. This prevents positive dentries from + * being created unnecessarily. + */ +static int afs_probe_cell_name(struct dentry *dentry) +{ + struct afs_cell *cell; + const char *name = dentry->d_name.name; + size_t len = dentry->d_name.len; + int ret; + + /* Names prefixed with a dot are R/W mounts. */ + if (name[0] == '.') { + if (len == 1) + return -EINVAL; + name++; + len--; + } + + cell = afs_lookup_cell_rcu(afs_d2net(dentry), name, len); + if (!IS_ERR(cell)) { + afs_put_cell(afs_d2net(dentry), cell); + return 0; + } + + ret = dns_query("afsdb", name, len, "ipv4", NULL, NULL); + if (ret == -ENODATA) + ret = -EDESTADDRREQ; + return ret; +} + +/* + * Try to auto mount the mountpoint with pseudo directory, if the autocell + * operation is setted. + */ +struct inode *afs_try_auto_mntpt(struct dentry *dentry, struct inode *dir) +{ + struct afs_vnode *vnode = AFS_FS_I(dir); + struct inode *inode; + int ret = -ENOENT; + + _enter("%p{%pd}, {%x:%u}", + dentry, dentry, vnode->fid.vid, vnode->fid.vnode); + + if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags)) + goto out; + + ret = afs_probe_cell_name(dentry); + if (ret < 0) + goto out; + + inode = afs_iget_pseudo_dir(dir->i_sb, false); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto out; + } + + _leave("= %p", inode); + return inode; + +out: + _leave("= %d", ret); + return ERR_PTR(ret); +} + +/* + * Look up @cell in a dynroot directory. This is a substitution for the + * local cell name for the net namespace. + */ +static struct dentry *afs_lookup_atcell(struct dentry *dentry) +{ + struct afs_cell *cell; + struct afs_net *net = afs_d2net(dentry); + struct dentry *ret; + unsigned int seq = 0; + char *name; + int len; + + if (!net->ws_cell) + return ERR_PTR(-ENOENT); + + ret = ERR_PTR(-ENOMEM); + name = kmalloc(AFS_MAXCELLNAME + 1, GFP_KERNEL); + if (!name) + goto out_p; + + rcu_read_lock(); + do { + read_seqbegin_or_lock(&net->cells_lock, &seq); + cell = rcu_dereference_raw(net->ws_cell); + if (cell) { + len = cell->name_len; + memcpy(name, cell->name, len + 1); + } + } while (need_seqretry(&net->cells_lock, seq)); + done_seqretry(&net->cells_lock, seq); + rcu_read_unlock(); + + ret = ERR_PTR(-ENOENT); + if (!cell) + goto out_n; + + ret = lookup_one_len(name, dentry->d_parent, len); + + /* We don't want to d_add() the @cell dentry here as we don't want to + * the cached dentry to hide changes to the local cell name. + */ + +out_n: + kfree(name); +out_p: + return ret; +} + +/* + * Look up an entry in a dynroot directory. + */ +static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct afs_vnode *vnode; + struct inode *inode; + int ret; + + vnode = AFS_FS_I(dir); + + _enter("%pd", dentry); + + ASSERTCMP(d_inode(dentry), ==, NULL); + + if (dentry->d_name.len >= AFSNAMEMAX) { + _leave(" = -ENAMETOOLONG"); + return ERR_PTR(-ENAMETOOLONG); + } + + if (dentry->d_name.len == 5 && + memcmp(dentry->d_name.name, "@cell", 5) == 0) + return afs_lookup_atcell(dentry); + + inode = afs_try_auto_mntpt(dentry, dir); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + if (ret == -ENOENT) { + d_add(dentry, NULL); + _leave(" = NULL [negative]"); + return NULL; + } + _leave(" = %d [do]", ret); + return ERR_PTR(ret); + } + + d_add(dentry, inode); + _leave(" = 0 { ino=%lu v=%u }", + d_inode(dentry)->i_ino, d_inode(dentry)->i_generation); + return NULL; +} + +const struct inode_operations afs_dynroot_inode_operations = { + .lookup = afs_dynroot_lookup, +}; + +/* + * Dirs in the dynamic root don't need revalidation. + */ +static int afs_dynroot_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + return 1; +} + +/* + * Allow the VFS to enquire as to whether a dentry should be unhashed (mustn't + * sleep) + * - called from dput() when d_count is going to 0. + * - return 1 to request dentry be unhashed, 0 otherwise + */ +static int afs_dynroot_d_delete(const struct dentry *dentry) +{ + return d_really_is_positive(dentry); +} + +const struct dentry_operations afs_dynroot_dentry_operations = { + .d_revalidate = afs_dynroot_d_revalidate, + .d_delete = afs_dynroot_d_delete, + .d_release = afs_d_release, + .d_automount = afs_d_automount, +}; diff --git a/fs/afs/file.c b/fs/afs/file.c index 79e665a..7d4f261 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -30,7 +30,6 @@ static int afs_readpages(struct file *filp, struct address_space *mapping, const struct file_operations afs_file_operations = { .open = afs_open, - .flush = afs_flush, .release = afs_release, .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, @@ -146,6 +145,9 @@ int afs_open(struct inode *inode, struct file *file) if (ret < 0) goto error_af; } + + if (file->f_flags & O_TRUNC) + set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); file->private_data = af; _leave(" = 0"); @@ -170,6 +172,9 @@ int afs_release(struct inode *inode, struct file *file) _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); + if ((file->f_mode & FMODE_WRITE)) + return vfs_fsync(file, 0); + file->private_data = NULL; if (af->wb) afs_put_wb_key(af->wb); @@ -187,10 +192,12 @@ void afs_put_read(struct afs_read *req) { int i; - if (atomic_dec_and_test(&req->usage)) { + if (refcount_dec_and_test(&req->usage)) { for (i = 0; i < req->nr_pages; i++) if (req->pages[i]) put_page(req->pages[i]); + if (req->pages != req->array) + kfree(req->pages); kfree(req); } } @@ -231,7 +238,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_fetch_data(&fc, desc); } @@ -240,6 +247,12 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de ret = afs_end_vnode_operation(&fc); } + if (ret == 0) { + afs_stat_v(vnode, n_fetches); + atomic_long_add(desc->actual_len, + &afs_v2net(vnode)->n_fetch_bytes); + } + _leave(" = %d", ret); return ret; } @@ -297,10 +310,11 @@ int afs_page_filler(void *data, struct page *page) * end of the file, the server will return a short read and the * unmarshalling code will clear the unfilled space. */ - atomic_set(&req->usage, 1); + refcount_set(&req->usage, 1); req->pos = (loff_t)page->index << PAGE_SHIFT; req->len = PAGE_SIZE; req->nr_pages = 1; + req->pages = req->array; req->pages[0] = page; get_page(page); @@ -309,10 +323,6 @@ int afs_page_filler(void *data, struct page *page) ret = afs_fetch_data(vnode, key, req); afs_put_read(req); - if (ret >= 0 && S_ISDIR(inode->i_mode) && - !afs_dir_check_page(inode, page)) - ret = -EIO; - if (ret < 0) { if (ret == -ENOENT) { _debug("got NOENT from server" @@ -447,10 +457,11 @@ static int afs_readpages_one(struct file *file, struct address_space *mapping, if (!req) return -ENOMEM; - atomic_set(&req->usage, 1); + refcount_set(&req->usage, 1); req->page_done = afs_readpages_page_done; req->pos = first->index; req->pos <<= PAGE_SHIFT; + req->pages = req->array; /* Transfer the pages to the request. We add them in until one fails * to add to the LRU and then we stop (as that'll make a hole in the diff --git a/fs/afs/flock.c b/fs/afs/flock.c index c40ba2f..dc62d15 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -86,7 +86,7 @@ static int afs_set_lock(struct afs_vnode *vnode, struct key *key, ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_set_lock(&fc, type); } @@ -117,7 +117,7 @@ static int afs_extend_lock(struct afs_vnode *vnode, struct key *key) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, key)) { while (afs_select_current_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_extend_lock(&fc); } @@ -148,7 +148,7 @@ static int afs_release_lock(struct afs_vnode *vnode, struct key *key) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, key)) { while (afs_select_current_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_release_lock(&fc); } @@ -613,7 +613,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) posix_test_lock(file, fl); if (fl->fl_type == F_UNLCK) { /* no local locks; consult the server */ - ret = afs_fetch_status(vnode, key); + ret = afs_fetch_status(vnode, key, false); if (ret < 0) goto error; diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 88ec38c..b273e1d 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -16,6 +16,7 @@ #include <linux/iversion.h> #include "internal.h" #include "afs_fs.h" +#include "xdr_fs.h" static const struct afs_fid afs_zero_fid; @@ -44,109 +45,207 @@ static void xdr_decode_AFSFid(const __be32 **_bp, struct afs_fid *fid) } /* - * decode an AFSFetchStatus block + * Dump a bad file status record. */ -static void xdr_decode_AFSFetchStatus(const __be32 **_bp, - struct afs_file_status *status, - struct afs_vnode *vnode, - afs_dataversion_t *store_version) +static void xdr_dump_bad(const __be32 *bp) { - afs_dataversion_t expected_version; - const __be32 *bp = *_bp; + __be32 x[4]; + int i; + + pr_notice("AFS XDR: Bad status record\n"); + for (i = 0; i < 5 * 4 * 4; i += 16) { + memcpy(x, bp, 16); + bp += 4; + pr_notice("%03x: %08x %08x %08x %08x\n", + i, ntohl(x[0]), ntohl(x[1]), ntohl(x[2]), ntohl(x[3])); + } + + memcpy(x, bp, 4); + pr_notice("0x50: %08x\n", ntohl(x[0])); +} + +/* + * Update the core inode struct from a returned status record. + */ +void afs_update_inode_from_status(struct afs_vnode *vnode, + struct afs_file_status *status, + const afs_dataversion_t *expected_version, + u8 flags) +{ + struct timespec t; umode_t mode; + + t.tv_sec = status->mtime_client; + t.tv_nsec = 0; + vnode->vfs_inode.i_ctime = t; + vnode->vfs_inode.i_mtime = t; + vnode->vfs_inode.i_atime = t; + + if (flags & (AFS_VNODE_META_CHANGED | AFS_VNODE_NOT_YET_SET)) { + vnode->vfs_inode.i_uid = make_kuid(&init_user_ns, status->owner); + vnode->vfs_inode.i_gid = make_kgid(&init_user_ns, status->group); + set_nlink(&vnode->vfs_inode, status->nlink); + + mode = vnode->vfs_inode.i_mode; + mode &= ~S_IALLUGO; + mode |= status->mode; + barrier(); + vnode->vfs_inode.i_mode = mode; + } + + if (!(flags & AFS_VNODE_NOT_YET_SET)) { + if (expected_version && + *expected_version != status->data_version) { + _debug("vnode modified %llx on {%x:%u} [exp %llx]", + (unsigned long long) status->data_version, + vnode->fid.vid, vnode->fid.vnode, + (unsigned long long) *expected_version); + vnode->invalid_before = status->data_version; + if (vnode->status.type == AFS_FTYPE_DIR) { + if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + afs_stat_v(vnode, n_inval); + } else { + set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); + } + } else if (vnode->status.type == AFS_FTYPE_DIR) { + /* Expected directory change is handled elsewhere so + * that we can locally edit the directory and save on a + * download. + */ + if (test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + flags &= ~AFS_VNODE_DATA_CHANGED; + } + } + + if (flags & (AFS_VNODE_DATA_CHANGED | AFS_VNODE_NOT_YET_SET)) { + inode_set_iversion_raw(&vnode->vfs_inode, status->data_version); + i_size_write(&vnode->vfs_inode, status->size); + } +} + +/* + * decode an AFSFetchStatus block + */ +static int xdr_decode_AFSFetchStatus(struct afs_call *call, + const __be32 **_bp, + struct afs_file_status *status, + struct afs_vnode *vnode, + const afs_dataversion_t *expected_version, + struct afs_read *read_req) +{ + const struct afs_xdr_AFSFetchStatus *xdr = (const void *)*_bp; + bool inline_error = (call->operation_ID == afs_FS_InlineBulkStatus); u64 data_version, size; - bool changed = false; - kuid_t owner; - kgid_t group; + u32 type, abort_code; + u8 flags = 0; + int ret; if (vnode) write_seqlock(&vnode->cb_lock); -#define EXTRACT(DST) \ - do { \ - u32 x = ntohl(*bp++); \ - if (DST != x) \ - changed |= true; \ - DST = x; \ - } while (0) - - status->if_version = ntohl(*bp++); - EXTRACT(status->type); - EXTRACT(status->nlink); - size = ntohl(*bp++); - data_version = ntohl(*bp++); - EXTRACT(status->author); - owner = make_kuid(&init_user_ns, ntohl(*bp++)); - changed |= !uid_eq(owner, status->owner); - status->owner = owner; - EXTRACT(status->caller_access); /* call ticket dependent */ - EXTRACT(status->anon_access); - EXTRACT(status->mode); - bp++; /* parent.vnode */ - bp++; /* parent.unique */ - bp++; /* seg size */ - status->mtime_client = ntohl(*bp++); - status->mtime_server = ntohl(*bp++); - group = make_kgid(&init_user_ns, ntohl(*bp++)); - changed |= !gid_eq(group, status->group); - status->group = group; - bp++; /* sync counter */ - data_version |= (u64) ntohl(*bp++) << 32; - EXTRACT(status->lock_count); - size |= (u64) ntohl(*bp++) << 32; - bp++; /* spare 4 */ - *_bp = bp; + abort_code = ntohl(xdr->abort_code); + + if (xdr->if_version != htonl(AFS_FSTATUS_VERSION)) { + if (xdr->if_version == htonl(0) && + abort_code != 0 && + inline_error) { + /* The OpenAFS fileserver has a bug in FS.InlineBulkStatus + * whereby it doesn't set the interface version in the error + * case. + */ + status->abort_code = abort_code; + ret = 0; + goto out; + } - if (size != status->size) { - status->size = size; - changed |= true; + pr_warn("Unknown AFSFetchStatus version %u\n", ntohl(xdr->if_version)); + goto bad; } - status->mode &= S_IALLUGO; - _debug("vnode time %lx, %lx", - status->mtime_client, status->mtime_server); + if (abort_code != 0 && inline_error) { + status->abort_code = abort_code; + ret = 0; + goto out; + } - if (vnode) { - if (changed && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) { - _debug("vnode changed"); - i_size_write(&vnode->vfs_inode, size); - vnode->vfs_inode.i_uid = status->owner; - vnode->vfs_inode.i_gid = status->group; - vnode->vfs_inode.i_generation = vnode->fid.unique; - set_nlink(&vnode->vfs_inode, status->nlink); - - mode = vnode->vfs_inode.i_mode; - mode &= ~S_IALLUGO; - mode |= status->mode; - barrier(); - vnode->vfs_inode.i_mode = mode; + type = ntohl(xdr->type); + switch (type) { + case AFS_FTYPE_FILE: + case AFS_FTYPE_DIR: + case AFS_FTYPE_SYMLINK: + if (type != status->type && + vnode && + !test_bit(AFS_VNODE_UNSET, &vnode->flags)) { + pr_warning("Vnode %x:%x:%x changed type %u to %u\n", + vnode->fid.vid, + vnode->fid.vnode, + vnode->fid.unique, + status->type, type); + goto bad; } - - vnode->vfs_inode.i_ctime.tv_sec = status->mtime_client; - vnode->vfs_inode.i_mtime = vnode->vfs_inode.i_ctime; - vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime; - inode_set_iversion_raw(&vnode->vfs_inode, data_version); + status->type = type; + break; + default: + goto bad; } - expected_version = status->data_version; - if (store_version) - expected_version = *store_version; +#define EXTRACT_M(FIELD) \ + do { \ + u32 x = ntohl(xdr->FIELD); \ + if (status->FIELD != x) { \ + flags |= AFS_VNODE_META_CHANGED; \ + status->FIELD = x; \ + } \ + } while (0) - if (expected_version != data_version) { - status->data_version = data_version; - if (vnode && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) { - _debug("vnode modified %llx on {%x:%u}", - (unsigned long long) data_version, - vnode->fid.vid, vnode->fid.vnode); - set_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags); - set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); - } - } else if (store_version) { + EXTRACT_M(nlink); + EXTRACT_M(author); + EXTRACT_M(owner); + EXTRACT_M(caller_access); /* call ticket dependent */ + EXTRACT_M(anon_access); + EXTRACT_M(mode); + EXTRACT_M(group); + + status->mtime_client = ntohl(xdr->mtime_client); + status->mtime_server = ntohl(xdr->mtime_server); + status->lock_count = ntohl(xdr->lock_count); + + size = (u64)ntohl(xdr->size_lo); + size |= (u64)ntohl(xdr->size_hi) << 32; + status->size = size; + + data_version = (u64)ntohl(xdr->data_version_lo); + data_version |= (u64)ntohl(xdr->data_version_hi) << 32; + if (data_version != status->data_version) { status->data_version = data_version; + flags |= AFS_VNODE_DATA_CHANGED; + } + + if (read_req) { + read_req->data_version = data_version; + read_req->file_size = size; + } + + *_bp = (const void *)*_bp + sizeof(*xdr); + + if (vnode) { + if (test_bit(AFS_VNODE_UNSET, &vnode->flags)) + flags |= AFS_VNODE_NOT_YET_SET; + afs_update_inode_from_status(vnode, status, expected_version, + flags); } + ret = 0; + +out: if (vnode) write_sequnlock(&vnode->cb_lock); + return ret; + +bad: + xdr_dump_bad(*_bp); + ret = afs_protocol_error(call, -EBADMSG); + goto out; } /* @@ -162,7 +261,7 @@ static void xdr_decode_AFSCallBack(struct afs_call *call, write_seqlock(&vnode->cb_lock); - if (call->cb_break == (vnode->cb_break + cbi->server->cb_s_break)) { + if (call->cb_break == afs_cb_break_sum(vnode, cbi)) { vnode->cb_version = ntohl(*bp++); cb_expiry = ntohl(*bp++); vnode->cb_type = ntohl(*bp++); @@ -274,7 +373,7 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp, /* * deliver reply data to an FS.FetchStatus */ -static int afs_deliver_fs_fetch_status(struct afs_call *call) +static int afs_deliver_fs_fetch_status_vnode(struct afs_call *call) { struct afs_vnode *vnode = call->reply[0]; const __be32 *bp; @@ -288,7 +387,9 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); xdr_decode_AFSCallBack(call, vnode, &bp); if (call->reply[1]) xdr_decode_AFSVolSync(&bp, call->reply[1]); @@ -300,17 +401,18 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call) /* * FS.FetchStatus operation type */ -static const struct afs_call_type afs_RXFSFetchStatus = { - .name = "FS.FetchStatus", +static const struct afs_call_type afs_RXFSFetchStatus_vnode = { + .name = "FS.FetchStatus(vnode)", .op = afs_FS_FetchStatus, - .deliver = afs_deliver_fs_fetch_status, + .deliver = afs_deliver_fs_fetch_status_vnode, .destructor = afs_flat_call_destructor, }; /* * fetch the status information for a file */ -int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsync) +int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsync, + bool new_inode) { struct afs_vnode *vnode = fc->vnode; struct afs_call *call; @@ -320,7 +422,8 @@ int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy _enter(",%x,{%x:%u},,", key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode); - call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4); + call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus_vnode, + 16, (21 + 3 + 6) * 4); if (!call) { fc->ac.error = -ENOMEM; return -ENOMEM; @@ -329,6 +432,7 @@ int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy call->key = fc->key; call->reply[0] = vnode; call->reply[1] = volsync; + call->expected_version = new_inode ? 1 : vnode->status.data_version; /* marshall the parameters */ bp = call->request; @@ -464,7 +568,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) return ret; bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &vnode->status.data_version, req) < 0) + return afs_protocol_error(call, -EBADMSG); xdr_decode_AFSCallBack(call, vnode, &bp); if (call->reply[1]) xdr_decode_AFSVolSync(&bp, call->reply[1]); @@ -534,6 +640,7 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, struct afs_read *req) call->reply[0] = vnode; call->reply[1] = NULL; /* volsync */ call->reply[2] = req; + call->expected_version = vnode->status.data_version; /* marshall the parameters */ bp = call->request; @@ -546,7 +653,7 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, struct afs_read *req) bp[6] = 0; bp[7] = htonl(lower_32_bits(req->len)); - atomic_inc(&req->usage); + refcount_inc(&req->usage); call->cb_break = fc->cb_break; afs_use_fs_server(call, fc->cbi); trace_afs_make_fs_call(call, &vnode->fid); @@ -578,6 +685,7 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req) call->reply[0] = vnode; call->reply[1] = NULL; /* volsync */ call->reply[2] = req; + call->expected_version = vnode->status.data_version; /* marshall the parameters */ bp = call->request; @@ -588,7 +696,7 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req) bp[4] = htonl(lower_32_bits(req->pos)); bp[5] = htonl(lower_32_bits(req->len)); - atomic_inc(&req->usage); + refcount_inc(&req->usage); call->cb_break = fc->cb_break; afs_use_fs_server(call, fc->cbi); trace_afs_make_fs_call(call, &vnode->fid); @@ -613,8 +721,10 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; xdr_decode_AFSFid(&bp, call->reply[1]); - xdr_decode_AFSFetchStatus(&bp, call->reply[2], NULL, NULL); - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, call->reply[2], NULL, NULL, NULL) < 0 || + xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); xdr_decode_AFSCallBack_raw(&bp, call->reply[3]); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ @@ -645,6 +755,7 @@ static const struct afs_call_type afs_RXFSMakeDir = { int afs_fs_create(struct afs_fs_cursor *fc, const char *name, umode_t mode, + u64 current_data_version, struct afs_fid *newfid, struct afs_file_status *newstatus, struct afs_callback *newcb) @@ -672,6 +783,7 @@ int afs_fs_create(struct afs_fs_cursor *fc, call->reply[1] = newfid; call->reply[2] = newstatus; call->reply[3] = newcb; + call->expected_version = current_data_version + 1; /* marshall the parameters */ bp = call->request; @@ -715,7 +827,9 @@ static int afs_deliver_fs_remove(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -742,7 +856,8 @@ static const struct afs_call_type afs_RXFSRemoveDir = { /* * remove a file or directory */ -int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir) +int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir, + u64 current_data_version) { struct afs_vnode *vnode = fc->vnode; struct afs_call *call; @@ -764,6 +879,7 @@ int afs_fs_remove(struct afs_fs_cursor *fc, const char *name, bool isdir) call->key = fc->key; call->reply[0] = vnode; + call->expected_version = current_data_version + 1; /* marshall the parameters */ bp = call->request; @@ -801,8 +917,10 @@ static int afs_deliver_fs_link(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); - xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode, NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, NULL, NULL) < 0 || + xdr_decode_AFSFetchStatus(call, &bp, &dvnode->status, dvnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -823,7 +941,7 @@ static const struct afs_call_type afs_RXFSLink = { * make a hard link */ int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode, - const char *name) + const char *name, u64 current_data_version) { struct afs_vnode *dvnode = fc->vnode; struct afs_call *call; @@ -844,6 +962,7 @@ int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode, call->key = fc->key; call->reply[0] = dvnode; call->reply[1] = vnode; + call->expected_version = current_data_version + 1; /* marshall the parameters */ bp = call->request; @@ -885,8 +1004,10 @@ static int afs_deliver_fs_symlink(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; xdr_decode_AFSFid(&bp, call->reply[1]); - xdr_decode_AFSFetchStatus(&bp, call->reply[2], NULL, NULL); - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, call->reply[2], NULL, NULL, NULL) || + xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -909,6 +1030,7 @@ static const struct afs_call_type afs_RXFSSymlink = { int afs_fs_symlink(struct afs_fs_cursor *fc, const char *name, const char *contents, + u64 current_data_version, struct afs_fid *newfid, struct afs_file_status *newstatus) { @@ -937,6 +1059,7 @@ int afs_fs_symlink(struct afs_fs_cursor *fc, call->reply[0] = vnode; call->reply[1] = newfid; call->reply[2] = newstatus; + call->expected_version = current_data_version + 1; /* marshall the parameters */ bp = call->request; @@ -987,10 +1110,13 @@ static int afs_deliver_fs_rename(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &orig_dvnode->status, orig_dvnode, NULL); - if (new_dvnode != orig_dvnode) - xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode, - NULL); + if (xdr_decode_AFSFetchStatus(call, &bp, &orig_dvnode->status, orig_dvnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); + if (new_dvnode != orig_dvnode && + xdr_decode_AFSFetchStatus(call, &bp, &new_dvnode->status, new_dvnode, + &call->expected_version_2, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -1013,7 +1139,9 @@ static const struct afs_call_type afs_RXFSRename = { int afs_fs_rename(struct afs_fs_cursor *fc, const char *orig_name, struct afs_vnode *new_dvnode, - const char *new_name) + const char *new_name, + u64 current_orig_data_version, + u64 current_new_data_version) { struct afs_vnode *orig_dvnode = fc->vnode; struct afs_call *call; @@ -1041,6 +1169,8 @@ int afs_fs_rename(struct afs_fs_cursor *fc, call->key = fc->key; call->reply[0] = orig_dvnode; call->reply[1] = new_dvnode; + call->expected_version = current_orig_data_version + 1; + call->expected_version_2 = current_new_data_version + 1; /* marshall the parameters */ bp = call->request; @@ -1089,8 +1219,9 @@ static int afs_deliver_fs_store_data(struct afs_call *call) /* unmarshall the reply once we've received all of it */ bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, - &call->store_version); + if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ afs_pages_written_back(vnode, call); @@ -1147,7 +1278,7 @@ static int afs_fs_store_data64(struct afs_fs_cursor *fc, call->first_offset = offset; call->last_to = to; call->send_pages = true; - call->store_version = vnode->status.data_version + 1; + call->expected_version = vnode->status.data_version + 1; /* marshall the parameters */ bp = call->request; @@ -1222,7 +1353,7 @@ int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping, call->first_offset = offset; call->last_to = to; call->send_pages = true; - call->store_version = vnode->status.data_version + 1; + call->expected_version = vnode->status.data_version + 1; /* marshall the parameters */ bp = call->request; @@ -1252,7 +1383,6 @@ int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping, */ static int afs_deliver_fs_store_status(struct afs_call *call) { - afs_dataversion_t *store_version; struct afs_vnode *vnode = call->reply[0]; const __be32 *bp; int ret; @@ -1264,12 +1394,10 @@ static int afs_deliver_fs_store_status(struct afs_call *call) return ret; /* unmarshall the reply once we've received all of it */ - store_version = NULL; - if (call->operation_ID == FSSTOREDATA) - store_version = &call->store_version; - bp = call->buffer; - xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, store_version); + if (xdr_decode_AFSFetchStatus(call, &bp, &vnode->status, vnode, + &call->expected_version, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */ _leave(" = 0 [done]"); @@ -1324,7 +1452,7 @@ static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr) call->key = fc->key; call->reply[0] = vnode; - call->store_version = vnode->status.data_version + 1; + call->expected_version = vnode->status.data_version + 1; /* marshall the parameters */ bp = call->request; @@ -1373,7 +1501,7 @@ static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr) call->key = fc->key; call->reply[0] = vnode; - call->store_version = vnode->status.data_version + 1; + call->expected_version = vnode->status.data_version + 1; /* marshall the parameters */ bp = call->request; @@ -1418,6 +1546,7 @@ int afs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr) call->key = fc->key; call->reply[0] = vnode; + call->expected_version = vnode->status.data_version; /* marshall the parameters */ bp = call->request; @@ -1471,7 +1600,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) call->count = ntohl(call->tmp); _debug("volname length: %u", call->count); if (call->count >= AFSNAMEMAX) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); call->offset = 0; call->unmarshall++; @@ -1518,7 +1647,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) call->count = ntohl(call->tmp); _debug("offline msg length: %u", call->count); if (call->count >= AFSNAMEMAX) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); call->offset = 0; call->unmarshall++; @@ -1565,7 +1694,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) call->count = ntohl(call->tmp); _debug("motd length: %u", call->count); if (call->count >= AFSNAMEMAX) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); call->offset = 0; call->unmarshall++; @@ -1947,3 +2076,265 @@ int afs_fs_get_capabilities(struct afs_net *net, trace_afs_make_fs_call(call, NULL); return afs_make_call(ac, call, GFP_NOFS, false); } + +/* + * Deliver reply data to an FS.FetchStatus with no vnode. + */ +static int afs_deliver_fs_fetch_status(struct afs_call *call) +{ + struct afs_file_status *status = call->reply[1]; + struct afs_callback *callback = call->reply[2]; + struct afs_volsync *volsync = call->reply[3]; + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + int ret; + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + _enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode); + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_AFSFetchStatus(call, &bp, status, vnode, + &call->expected_version, NULL); + callback[call->count].version = ntohl(bp[0]); + callback[call->count].expiry = ntohl(bp[1]); + callback[call->count].type = ntohl(bp[2]); + if (vnode) + xdr_decode_AFSCallBack(call, vnode, &bp); + else + bp += 3; + if (volsync) + xdr_decode_AFSVolSync(&bp, volsync); + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.FetchStatus operation type + */ +static const struct afs_call_type afs_RXFSFetchStatus = { + .name = "FS.FetchStatus", + .op = afs_FS_FetchStatus, + .deliver = afs_deliver_fs_fetch_status, + .destructor = afs_flat_call_destructor, +}; + +/* + * Fetch the status information for a fid without needing a vnode handle. + */ +int afs_fs_fetch_status(struct afs_fs_cursor *fc, + struct afs_net *net, + struct afs_fid *fid, + struct afs_file_status *status, + struct afs_callback *callback, + struct afs_volsync *volsync) +{ + struct afs_call *call; + __be32 *bp; + + _enter(",%x,{%x:%u},,", + key_serial(fc->key), fid->vid, fid->vnode); + + call = afs_alloc_flat_call(net, &afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4); + if (!call) { + fc->ac.error = -ENOMEM; + return -ENOMEM; + } + + call->key = fc->key; + call->reply[0] = NULL; /* vnode for fid[0] */ + call->reply[1] = status; + call->reply[2] = callback; + call->reply[3] = volsync; + call->expected_version = 1; /* vnode->status.data_version */ + + /* marshall the parameters */ + bp = call->request; + bp[0] = htonl(FSFETCHSTATUS); + bp[1] = htonl(fid->vid); + bp[2] = htonl(fid->vnode); + bp[3] = htonl(fid->unique); + + call->cb_break = fc->cb_break; + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, fid); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} + +/* + * Deliver reply data to an FS.InlineBulkStatus call + */ +static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) +{ + struct afs_file_status *statuses; + struct afs_callback *callbacks; + struct afs_vnode *vnode = call->reply[0]; + const __be32 *bp; + u32 tmp; + int ret; + + _enter("{%u}", call->unmarshall); + + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->unmarshall++; + + /* Extract the file status count and array in two steps */ + case 1: + _debug("extract status count"); + ret = afs_extract_data(call, &call->tmp, 4, true); + if (ret < 0) + return ret; + + tmp = ntohl(call->tmp); + _debug("status count: %u/%u", tmp, call->count2); + if (tmp != call->count2) + return afs_protocol_error(call, -EBADMSG); + + call->count = 0; + call->unmarshall++; + more_counts: + call->offset = 0; + + case 2: + _debug("extract status array %u", call->count); + ret = afs_extract_data(call, call->buffer, 21 * 4, true); + if (ret < 0) + return ret; + + bp = call->buffer; + statuses = call->reply[1]; + if (xdr_decode_AFSFetchStatus(call, &bp, &statuses[call->count], + call->count == 0 ? vnode : NULL, + NULL, NULL) < 0) + return afs_protocol_error(call, -EBADMSG); + + call->count++; + if (call->count < call->count2) + goto more_counts; + + call->count = 0; + call->unmarshall++; + call->offset = 0; + + /* Extract the callback count and array in two steps */ + case 3: + _debug("extract CB count"); + ret = afs_extract_data(call, &call->tmp, 4, true); + if (ret < 0) + return ret; + + tmp = ntohl(call->tmp); + _debug("CB count: %u", tmp); + if (tmp != call->count2) + return afs_protocol_error(call, -EBADMSG); + call->count = 0; + call->unmarshall++; + more_cbs: + call->offset = 0; + + case 4: + _debug("extract CB array"); + ret = afs_extract_data(call, call->buffer, 3 * 4, true); + if (ret < 0) + return ret; + + _debug("unmarshall CB array"); + bp = call->buffer; + callbacks = call->reply[2]; + callbacks[call->count].version = ntohl(bp[0]); + callbacks[call->count].expiry = ntohl(bp[1]); + callbacks[call->count].type = ntohl(bp[2]); + statuses = call->reply[1]; + if (call->count == 0 && vnode && statuses[0].abort_code == 0) + xdr_decode_AFSCallBack(call, vnode, &bp); + call->count++; + if (call->count < call->count2) + goto more_cbs; + + call->offset = 0; + call->unmarshall++; + + case 5: + ret = afs_extract_data(call, call->buffer, 6 * 4, false); + if (ret < 0) + return ret; + + bp = call->buffer; + if (call->reply[3]) + xdr_decode_AFSVolSync(&bp, call->reply[3]); + + call->offset = 0; + call->unmarshall++; + + case 6: + break; + } + + _leave(" = 0 [done]"); + return 0; +} + +/* + * FS.InlineBulkStatus operation type + */ +static const struct afs_call_type afs_RXFSInlineBulkStatus = { + .name = "FS.InlineBulkStatus", + .op = afs_FS_InlineBulkStatus, + .deliver = afs_deliver_fs_inline_bulk_status, + .destructor = afs_flat_call_destructor, +}; + +/* + * Fetch the status information for up to 50 files + */ +int afs_fs_inline_bulk_status(struct afs_fs_cursor *fc, + struct afs_net *net, + struct afs_fid *fids, + struct afs_file_status *statuses, + struct afs_callback *callbacks, + unsigned int nr_fids, + struct afs_volsync *volsync) +{ + struct afs_call *call; + __be32 *bp; + int i; + + _enter(",%x,{%x:%u},%u", + key_serial(fc->key), fids[0].vid, fids[1].vnode, nr_fids); + + call = afs_alloc_flat_call(net, &afs_RXFSInlineBulkStatus, + (2 + nr_fids * 3) * 4, + 21 * 4); + if (!call) { + fc->ac.error = -ENOMEM; + return -ENOMEM; + } + + call->key = fc->key; + call->reply[0] = NULL; /* vnode for fid[0] */ + call->reply[1] = statuses; + call->reply[2] = callbacks; + call->reply[3] = volsync; + call->count2 = nr_fids; + + /* marshall the parameters */ + bp = call->request; + *bp++ = htonl(FSINLINEBULKSTATUS); + *bp++ = htonl(nr_fids); + for (i = 0; i < nr_fids; i++) { + *bp++ = htonl(fids[i].vid); + *bp++ = htonl(fids[i].vnode); + *bp++ = htonl(fids[i].unique); + } + + call->cb_break = fc->cb_break; + afs_use_fs_server(call, fc->cbi); + trace_afs_make_fs_call(call, &fids[0]); + return afs_make_call(&fc->ac, call, GFP_NOFS, false); +} diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 65c5b1e..479b7fd 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -30,12 +30,11 @@ static const struct inode_operations afs_symlink_inode_operations = { }; /* - * map the AFS file status to the inode member variables + * Initialise an inode from the vnode status. */ -static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) +static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key) { struct inode *inode = AFS_VNODE_TO_I(vnode); - bool changed; _debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu", vnode->status.type, @@ -46,16 +45,21 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) read_seqlock_excl(&vnode->cb_lock); + afs_update_inode_from_status(vnode, &vnode->status, NULL, + AFS_VNODE_NOT_YET_SET); + switch (vnode->status.type) { case AFS_FTYPE_FILE: inode->i_mode = S_IFREG | vnode->status.mode; inode->i_op = &afs_file_inode_operations; inode->i_fop = &afs_file_operations; + inode->i_mapping->a_ops = &afs_fs_aops; break; case AFS_FTYPE_DIR: inode->i_mode = S_IFDIR | vnode->status.mode; inode->i_op = &afs_dir_inode_operations; inode->i_fop = &afs_dir_file_operations; + inode->i_mapping->a_ops = &afs_dir_aops; break; case AFS_FTYPE_SYMLINK: /* Symlinks with a mode of 0644 are actually mountpoints. */ @@ -67,45 +71,31 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) inode->i_mode = S_IFDIR | 0555; inode->i_op = &afs_mntpt_inode_operations; inode->i_fop = &afs_mntpt_file_operations; + inode->i_mapping->a_ops = &afs_fs_aops; } else { inode->i_mode = S_IFLNK | vnode->status.mode; inode->i_op = &afs_symlink_inode_operations; + inode->i_mapping->a_ops = &afs_fs_aops; } inode_nohighmem(inode); break; default: printk("kAFS: AFS vnode with undefined type\n"); read_sequnlock_excl(&vnode->cb_lock); - return -EBADMSG; + return afs_protocol_error(NULL, -EBADMSG); } - changed = (vnode->status.size != inode->i_size); - - set_nlink(inode, vnode->status.nlink); - inode->i_uid = vnode->status.owner; - inode->i_gid = vnode->status.group; - inode->i_size = vnode->status.size; - inode->i_ctime.tv_sec = vnode->status.mtime_client; - inode->i_ctime.tv_nsec = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime; inode->i_blocks = 0; - inode->i_generation = vnode->fid.unique; - inode_set_iversion_raw(inode, vnode->status.data_version); - inode->i_mapping->a_ops = &afs_fs_aops; + vnode->invalid_before = vnode->status.data_version; read_sequnlock_excl(&vnode->cb_lock); - -#ifdef CONFIG_AFS_FSCACHE - if (changed) - fscache_attr_changed(vnode->cache); -#endif return 0; } /* * Fetch file status from the volume. */ -int afs_fetch_status(struct afs_vnode *vnode, struct key *key) +int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool new_inode) { struct afs_fs_cursor fc; int ret; @@ -118,8 +108,8 @@ int afs_fetch_status(struct afs_vnode *vnode, struct key *key) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; - afs_fs_fetch_file_status(&fc, NULL); + fc.cb_break = afs_calc_vnode_cb_break(vnode); + afs_fs_fetch_file_status(&fc, NULL, new_inode); } afs_check_for_remote_deletion(&fc, fc.vnode); @@ -255,6 +245,11 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) } __packed key; struct afs_vnode_cache_aux aux; + if (vnode->status.type == AFS_FTYPE_DIR) { + vnode->cache = NULL; + return; + } + key.vnode_id = vnode->fid.vnode; key.unique = vnode->fid.unique; key.vnode_id_ext[0] = 0; @@ -307,7 +302,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, if (!status) { /* it's a remotely extant inode */ - ret = afs_fetch_status(vnode, key); + ret = afs_fetch_status(vnode, key, true); if (ret < 0) goto bad_inode; } else { @@ -331,15 +326,12 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, vnode->cb_expires_at += ktime_get_real_seconds(); } - /* set up caching before mapping the status, as map-status reads the - * first page of symlinks to see if they're really mountpoints */ - inode->i_size = vnode->status.size; - afs_get_inode_cache(vnode); - - ret = afs_inode_map_status(vnode, key); + ret = afs_inode_init_from_status(vnode, key); if (ret < 0) goto bad_inode; + afs_get_inode_cache(vnode); + /* success */ clear_bit(AFS_VNODE_UNSET, &vnode->flags); inode->i_flags |= S_NOATIME; @@ -349,10 +341,6 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, /* failure */ bad_inode: -#ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(vnode->cache, NULL, ret == -ENOENT); - vnode->cache = NULL; -#endif iget_failed(inode); _leave(" = %d [bad]", ret); return ERR_PTR(ret); @@ -405,12 +393,18 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) read_seqlock_excl(&vnode->cb_lock); if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { - if (vnode->cb_s_break != vnode->cb_interest->server->cb_s_break) { + if (vnode->cb_s_break != vnode->cb_interest->server->cb_s_break || + vnode->cb_v_break != vnode->volume->cb_v_break) { vnode->cb_s_break = vnode->cb_interest->server->cb_s_break; - } else if (!test_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags) && - !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) && + vnode->cb_v_break = vnode->volume->cb_v_break; + valid = false; + } else if (vnode->status.type == AFS_FTYPE_DIR && + test_bit(AFS_VNODE_DIR_VALID, &vnode->flags) && vnode->cb_expires_at - 10 > now) { - valid = true; + valid = true; + } else if (!test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) && + vnode->cb_expires_at - 10 > now) { + valid = true; } } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { valid = true; @@ -424,7 +418,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) if (valid) goto valid; - mutex_lock(&vnode->validate_lock); + down_write(&vnode->validate_lock); /* if the promise has expired, we need to check the server again to get * a new promise - note that if the (parent) directory's metadata was @@ -432,7 +426,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) * access */ if (!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { _debug("not promised"); - ret = afs_fetch_status(vnode, key); + ret = afs_fetch_status(vnode, key, false); if (ret < 0) { if (ret == -ENOENT) { set_bit(AFS_VNODE_DELETED, &vnode->flags); @@ -453,15 +447,13 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) * different */ if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) afs_zap_data(vnode); - - clear_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags); - mutex_unlock(&vnode->validate_lock); + up_write(&vnode->validate_lock); valid: _leave(" = 0"); return 0; error_unlock: - mutex_unlock(&vnode->validate_lock); + up_write(&vnode->validate_lock); _leave(" = %d", ret); return ret; } @@ -544,7 +536,7 @@ void afs_evict_inode(struct inode *inode) } #endif - afs_put_permits(vnode->permit_cache); + afs_put_permits(rcu_access_pointer(vnode->permit_cache)); _leave(""); } @@ -585,7 +577,7 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_setattr(&fc, attr); } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a6a1d75..e3f8a46 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -122,7 +122,8 @@ struct afs_call { u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ __be32 tmp; /* place to extract temporary data */ - afs_dataversion_t store_version; /* updated version expected from store */ + afs_dataversion_t expected_version; /* Updated version expected from store */ + afs_dataversion_t expected_version_2; /* 2nd updated version expected from store */ }; struct afs_call_type { @@ -173,11 +174,14 @@ struct afs_read { loff_t len; /* How much we're asking for */ loff_t actual_len; /* How much we're actually getting */ loff_t remain; /* Amount remaining */ - atomic_t usage; + loff_t file_size; /* File size returned by server */ + afs_dataversion_t data_version; /* Version number returned by server */ + refcount_t usage; unsigned int index; /* Which page we're reading into */ unsigned int nr_pages; void (*page_done)(struct afs_call *, struct afs_read *); - struct page *pages[]; + struct page **pages; + struct page *array[]; }; /* @@ -199,6 +203,18 @@ static inline struct afs_super_info *AFS_FS_S(struct super_block *sb) extern struct file_system_type afs_fs_type; /* + * Set of substitutes for @sys. + */ +struct afs_sysnames { +#define AFS_NR_SYSNAME 16 + char *subs[AFS_NR_SYSNAME]; + refcount_t usage; + unsigned short nr; + short error; + char blank[1]; +}; + +/* * AFS network namespace record. */ struct afs_net { @@ -245,9 +261,25 @@ struct afs_net { struct mutex lock_manager_mutex; /* Misc */ - struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */ + struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */ + struct afs_sysnames *sysnames; + rwlock_t sysnames_lock; + + /* Statistics counters */ + atomic_t n_lookup; /* Number of lookups done */ + atomic_t n_reval; /* Number of dentries needing revalidation */ + atomic_t n_inval; /* Number of invalidations by the server */ + atomic_t n_relpg; /* Number of invalidations by releasepage */ + atomic_t n_read_dir; /* Number of directory pages read */ + atomic_t n_dir_cr; /* Number of directory entry creation edits */ + atomic_t n_dir_rm; /* Number of directory entry removal edits */ + atomic_t n_stores; /* Number of store ops */ + atomic_long_t n_store_bytes; /* Number of bytes stored */ + atomic_long_t n_fetch_bytes; /* Number of bytes fetched */ + atomic_t n_fetches; /* Number of data fetch ops */ }; +extern const char afs_init_sysname[]; extern struct afs_net __afs_net;// Dummy AFS network namespace; TODO: replace with real netns enum afs_cell_state { @@ -363,6 +395,8 @@ struct afs_server { #define AFS_SERVER_FL_UPDATING 4 #define AFS_SERVER_FL_PROBED 5 /* The fileserver has been probed */ #define AFS_SERVER_FL_PROBING 6 /* Fileserver is being probed */ +#define AFS_SERVER_FL_NO_IBULK 7 /* Fileserver doesn't support FS.InlineBulkStatus */ +#define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */ atomic_t usage; u32 addr_version; /* Address list version */ @@ -400,6 +434,7 @@ struct afs_server_list { unsigned short index; /* Server currently in use */ unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */ unsigned int seq; /* Set to ->servers_seq when installed */ + rwlock_t lock; struct afs_server_entry servers[]; }; @@ -426,6 +461,9 @@ struct afs_volume { rwlock_t servers_lock; /* Lock for ->servers */ unsigned int servers_seq; /* Incremented each time ->servers changes */ + unsigned cb_v_break; /* Break-everything counter. */ + rwlock_t cb_break_lock; + afs_voltype_t type; /* type of volume */ short error; char type_force; /* force volume type (suppress R/O -> R/W) */ @@ -455,23 +493,25 @@ struct afs_vnode { struct afs_volume *volume; /* volume on which vnode resides */ struct afs_fid fid; /* the file identifier for this inode */ struct afs_file_status status; /* AFS status info for this file */ + afs_dataversion_t invalid_before; /* Child dentries are invalid before this */ #ifdef CONFIG_AFS_FSCACHE struct fscache_cookie *cache; /* caching cookie */ #endif - struct afs_permits *permit_cache; /* cache of permits so far obtained */ + struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */ struct mutex io_lock; /* Lock for serialising I/O on this mutex */ - struct mutex validate_lock; /* lock for validating this vnode */ + struct rw_semaphore validate_lock; /* lock for validating this vnode */ spinlock_t wb_lock; /* lock for wb_keys */ spinlock_t lock; /* waitqueue/flags lock */ unsigned long flags; #define AFS_VNODE_CB_PROMISED 0 /* Set if vnode has a callback promise */ #define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */ -#define AFS_VNODE_DIR_MODIFIED 2 /* set if dir vnode's data modified */ +#define AFS_VNODE_DIR_VALID 2 /* Set if dir contents are valid */ #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ #define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */ #define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */ #define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */ #define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ +#define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */ struct list_head wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ @@ -484,6 +524,7 @@ struct afs_vnode { /* outstanding callback notification on this file */ struct afs_cb_interest *cb_interest; /* Server on which this resides */ unsigned int cb_s_break; /* Mass break counter on ->server */ + unsigned int cb_v_break; /* Mass break counter on ->volume */ unsigned int cb_break; /* Break counter on vnode */ seqlock_t cb_lock; /* Lock for ->cb_interest, ->status, ->cb_*break */ @@ -611,18 +652,31 @@ extern struct fscache_cookie_def afs_vnode_cache_index_def; */ extern void afs_init_callback_state(struct afs_server *); extern void afs_break_callback(struct afs_vnode *); -extern void afs_break_callbacks(struct afs_server *, size_t,struct afs_callback[]); +extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback_break*); -extern int afs_register_server_cb_interest(struct afs_vnode *, struct afs_server_entry *); +extern int afs_register_server_cb_interest(struct afs_vnode *, + struct afs_server_list *, unsigned int); extern void afs_put_cb_interest(struct afs_net *, struct afs_cb_interest *); extern void afs_clear_callback_interests(struct afs_net *, struct afs_server_list *); static inline struct afs_cb_interest *afs_get_cb_interest(struct afs_cb_interest *cbi) { - refcount_inc(&cbi->usage); + if (cbi) + refcount_inc(&cbi->usage); return cbi; } +static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode) +{ + return vnode->cb_break + vnode->cb_s_break + vnode->cb_v_break; +} + +static inline unsigned int afs_cb_break_sum(struct afs_vnode *vnode, + struct afs_cb_interest *cbi) +{ + return vnode->cb_break + cbi->server->cb_s_break + vnode->volume->cb_v_break; +} + /* * cell.c */ @@ -646,11 +700,26 @@ extern bool afs_cm_incoming_call(struct afs_call *); */ extern const struct file_operations afs_dir_file_operations; extern const struct inode_operations afs_dir_inode_operations; +extern const struct address_space_operations afs_dir_aops; +extern const struct dentry_operations afs_fs_dentry_operations; + +extern void afs_d_release(struct dentry *); + +/* + * dir_edit.c + */ +extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *, + enum afs_edit_dir_reason); +extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason); + +/* + * dynroot.c + */ extern const struct file_operations afs_dynroot_file_operations; extern const struct inode_operations afs_dynroot_inode_operations; -extern const struct dentry_operations afs_fs_dentry_operations; +extern const struct dentry_operations afs_dynroot_dentry_operations; -extern bool afs_dir_check_page(struct inode *, struct page *); +extern struct inode *afs_try_auto_mntpt(struct dentry *, struct inode *); /* * file.c @@ -680,17 +749,23 @@ extern int afs_flock(struct file *, int, struct file_lock *); /* * fsclient.c */ -extern int afs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_volsync *); +#define AFS_VNODE_NOT_YET_SET 0x01 +#define AFS_VNODE_META_CHANGED 0x02 +#define AFS_VNODE_DATA_CHANGED 0x04 +extern void afs_update_inode_from_status(struct afs_vnode *, struct afs_file_status *, + const afs_dataversion_t *, u8); + +extern int afs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_volsync *, bool); extern int afs_fs_give_up_callbacks(struct afs_net *, struct afs_server *); extern int afs_fs_fetch_data(struct afs_fs_cursor *, struct afs_read *); -extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t, +extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t, u64, struct afs_fid *, struct afs_file_status *, struct afs_callback *); -extern int afs_fs_remove(struct afs_fs_cursor *, const char *, bool); -extern int afs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *); -extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, +extern int afs_fs_remove(struct afs_fs_cursor *, const char *, bool, u64); +extern int afs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *, u64); +extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, u64, struct afs_fid *, struct afs_file_status *); extern int afs_fs_rename(struct afs_fs_cursor *, const char *, - struct afs_vnode *, const char *); + struct afs_vnode *, const char *, u64, u64); extern int afs_fs_store_data(struct afs_fs_cursor *, struct address_space *, pgoff_t, pgoff_t, unsigned, unsigned); extern int afs_fs_setattr(struct afs_fs_cursor *, struct iattr *); @@ -702,11 +777,18 @@ extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *, struct afs_addr_cursor *, struct key *); extern int afs_fs_get_capabilities(struct afs_net *, struct afs_server *, struct afs_addr_cursor *, struct key *); +extern int afs_fs_inline_bulk_status(struct afs_fs_cursor *, struct afs_net *, + struct afs_fid *, struct afs_file_status *, + struct afs_callback *, unsigned int, + struct afs_volsync *); +extern int afs_fs_fetch_status(struct afs_fs_cursor *, struct afs_net *, + struct afs_fid *, struct afs_file_status *, + struct afs_callback *, struct afs_volsync *); /* * inode.c */ -extern int afs_fetch_status(struct afs_vnode *, struct key *); +extern int afs_fetch_status(struct afs_vnode *, struct key *, bool); extern int afs_iget5_test(struct inode *, void *); extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool); extern struct inode *afs_iget(struct super_block *, struct key *, @@ -754,6 +836,13 @@ static inline void afs_put_net(struct afs_net *net) { } +static inline void __afs_stat(atomic_t *s) +{ + atomic_inc(s); +} + +#define afs_stat_v(vnode, n) __afs_stat(&afs_v2net(vnode)->n) + /* * misc.c */ @@ -781,6 +870,7 @@ extern int __net_init afs_proc_init(struct afs_net *); extern void __net_exit afs_proc_cleanup(struct afs_net *); extern int afs_proc_cell_setup(struct afs_net *, struct afs_cell *); extern void afs_proc_cell_remove(struct afs_net *, struct afs_cell *); +extern void afs_put_sysnames(struct afs_sysnames *); /* * rotate.c @@ -809,6 +899,7 @@ extern void afs_flat_call_destructor(struct afs_call *); extern void afs_send_empty_reply(struct afs_call *); extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); extern int afs_extract_data(struct afs_call *, void *, size_t, bool); +extern int afs_protocol_error(struct afs_call *, int); static inline int afs_transfer_reply(struct afs_call *call) { @@ -955,7 +1046,6 @@ extern int afs_writepage(struct page *, struct writeback_control *); extern int afs_writepages(struct address_space *, struct writeback_control *); extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *); -extern int afs_flush(struct file *, fl_owner_t); extern int afs_fsync(struct file *, loff_t, loff_t, int); extern int afs_page_mkwrite(struct vm_fault *); extern void afs_prune_wb_keys(struct afs_vnode *); diff --git a/fs/afs/main.c b/fs/afs/main.c index 15a02a0..d756016 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -34,11 +34,42 @@ MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); struct workqueue_struct *afs_wq; struct afs_net __afs_net; +#if defined(CONFIG_ALPHA) +const char afs_init_sysname[] = "alpha_linux26"; +#elif defined(CONFIG_X86_64) +const char afs_init_sysname[] = "amd64_linux26"; +#elif defined(CONFIG_ARM) +const char afs_init_sysname[] = "arm_linux26"; +#elif defined(CONFIG_ARM64) +const char afs_init_sysname[] = "aarch64_linux26"; +#elif defined(CONFIG_X86_32) +const char afs_init_sysname[] = "i386_linux26"; +#elif defined(CONFIG_IA64) +const char afs_init_sysname[] = "ia64_linux26"; +#elif defined(CONFIG_PPC64) +const char afs_init_sysname[] = "ppc64_linux26"; +#elif defined(CONFIG_PPC32) +const char afs_init_sysname[] = "ppc_linux26"; +#elif defined(CONFIG_S390) +#ifdef CONFIG_64BIT +const char afs_init_sysname[] = "s390x_linux26"; +#else +const char afs_init_sysname[] = "s390_linux26"; +#endif +#elif defined(CONFIG_SPARC64) +const char afs_init_sysname[] = "sparc64_linux26"; +#elif defined(CONFIG_SPARC32) +const char afs_init_sysname[] = "sparc_linux26"; +#else +const char afs_init_sysname[] = "unknown_linux26"; +#endif + /* * Initialise an AFS network namespace record. */ static int __net_init afs_net_init(struct afs_net *net) { + struct afs_sysnames *sysnames; int ret; net->live = true; @@ -67,6 +98,16 @@ static int __net_init afs_net_init(struct afs_net *net) INIT_WORK(&net->fs_manager, afs_manage_servers); timer_setup(&net->fs_timer, afs_servers_timer, 0); + ret = -ENOMEM; + sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL); + if (!sysnames) + goto error_sysnames; + sysnames->subs[0] = (char *)&afs_init_sysname; + sysnames->nr = 1; + refcount_set(&sysnames->usage, 1); + net->sysnames = sysnames; + rwlock_init(&net->sysnames_lock); + /* Register the /proc stuff */ ret = afs_proc_init(net); if (ret < 0) @@ -92,6 +133,8 @@ error_cell_init: net->live = false; afs_proc_cleanup(net); error_proc: + afs_put_sysnames(net->sysnames); +error_sysnames: net->live = false; return ret; } @@ -106,6 +149,7 @@ static void __net_exit afs_net_exit(struct afs_net *net) afs_purge_servers(net); afs_close_socket(net); afs_proc_cleanup(net); + afs_put_sysnames(net->sysnames); } /* diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 4508dd5..839a222 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -126,6 +126,34 @@ static const struct file_operations afs_proc_servers_fops = { .release = seq_release, }; +static int afs_proc_sysname_open(struct inode *inode, struct file *file); +static int afs_proc_sysname_release(struct inode *inode, struct file *file); +static void *afs_proc_sysname_start(struct seq_file *p, loff_t *pos); +static void *afs_proc_sysname_next(struct seq_file *p, void *v, + loff_t *pos); +static void afs_proc_sysname_stop(struct seq_file *p, void *v); +static int afs_proc_sysname_show(struct seq_file *m, void *v); +static ssize_t afs_proc_sysname_write(struct file *file, + const char __user *buf, + size_t size, loff_t *_pos); + +static const struct seq_operations afs_proc_sysname_ops = { + .start = afs_proc_sysname_start, + .next = afs_proc_sysname_next, + .stop = afs_proc_sysname_stop, + .show = afs_proc_sysname_show, +}; + +static const struct file_operations afs_proc_sysname_fops = { + .open = afs_proc_sysname_open, + .read = seq_read, + .llseek = seq_lseek, + .release = afs_proc_sysname_release, + .write = afs_proc_sysname_write, +}; + +static const struct file_operations afs_proc_stats_fops; + /* * initialise the /proc/fs/afs/ directory */ @@ -139,7 +167,9 @@ int afs_proc_init(struct afs_net *net) if (!proc_create("cells", 0644, net->proc_afs, &afs_proc_cells_fops) || !proc_create("rootcell", 0644, net->proc_afs, &afs_proc_rootcell_fops) || - !proc_create("servers", 0644, net->proc_afs, &afs_proc_servers_fops)) + !proc_create("servers", 0644, net->proc_afs, &afs_proc_servers_fops) || + !proc_create("stats", 0644, net->proc_afs, &afs_proc_stats_fops) || + !proc_create("sysname", 0644, net->proc_afs, &afs_proc_sysname_fops)) goto error_tree; _leave(" = 0"); @@ -183,6 +213,7 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file) * first item */ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos) + __acquires(rcu) { struct afs_net *net = afs_seq2net(m); @@ -204,6 +235,7 @@ static void *afs_proc_cells_next(struct seq_file *m, void *v, loff_t *pos) * clean up after reading from the cells list */ static void afs_proc_cells_stop(struct seq_file *m, void *v) + __releases(rcu) { rcu_read_unlock(); } @@ -282,7 +314,8 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, goto done; } - set_bit(AFS_CELL_FL_NO_GC, &cell->flags); + if (test_and_set_bit(AFS_CELL_FL_NO_GC, &cell->flags)) + afs_put_cell(net, cell); printk("kAFS: Added new cell '%s'\n", name); } else { goto inval; @@ -304,7 +337,40 @@ inval: static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, size_t size, loff_t *_pos) { - return 0; + struct afs_cell *cell; + struct afs_net *net = afs_proc2net(file); + unsigned int seq = 0; + char name[AFS_MAXCELLNAME + 1]; + int len; + + if (*_pos > 0) + return 0; + if (!net->ws_cell) + return 0; + + rcu_read_lock(); + do { + read_seqbegin_or_lock(&net->cells_lock, &seq); + len = 0; + cell = rcu_dereference_raw(net->ws_cell); + if (cell) { + len = cell->name_len; + memcpy(name, cell->name, len); + } + } while (need_seqretry(&net->cells_lock, seq)); + done_seqretry(&net->cells_lock, seq); + rcu_read_unlock(); + + if (!len) + return 0; + + name[len++] = '\n'; + if (len > size) + len = size; + if (copy_to_user(buf, name, len) != 0) + return -EFAULT; + *_pos = 1; + return len; } /* @@ -327,6 +393,12 @@ static ssize_t afs_proc_rootcell_write(struct file *file, if (IS_ERR(kbuf)) return PTR_ERR(kbuf); + ret = -EINVAL; + if (kbuf[0] == '.') + goto out; + if (memchr(kbuf, '/', size)) + goto out; + /* trim to first NL */ s = memchr(kbuf, '\n', size); if (s) @@ -339,6 +411,7 @@ static ssize_t afs_proc_rootcell_write(struct file *file, if (ret >= 0) ret = size; /* consume everything, always */ +out: kfree(kbuf); _leave(" = %d", ret); return ret; @@ -413,6 +486,7 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file) * first item */ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) + __acquires(cell->proc_lock) { struct afs_cell *cell = m->private; @@ -438,6 +512,7 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, * clean up after reading from the cells list */ static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v) + __releases(cell->proc_lock) { struct afs_cell *cell = p->private; @@ -500,6 +575,7 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file) * first item */ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) + __acquires(rcu) { struct afs_addr_list *alist; struct afs_cell *cell = m->private; @@ -544,6 +620,7 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, * clean up after reading from the cells list */ static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v) + __releases(rcu) { rcu_read_unlock(); } @@ -580,6 +657,7 @@ static int afs_proc_servers_open(struct inode *inode, struct file *file) * first item. */ static void *afs_proc_servers_start(struct seq_file *m, loff_t *_pos) + __acquires(rcu) { struct afs_net *net = afs_seq2net(m); @@ -601,6 +679,7 @@ static void *afs_proc_servers_next(struct seq_file *m, void *v, loff_t *_pos) * clean up after reading from the cells list */ static void afs_proc_servers_stop(struct seq_file *p, void *v) + __releases(rcu) { rcu_read_unlock(); } @@ -626,3 +705,244 @@ static int afs_proc_servers_show(struct seq_file *m, void *v) &alist->addrs[alist->index].transport); return 0; } + +void afs_put_sysnames(struct afs_sysnames *sysnames) +{ + int i; + + if (sysnames && refcount_dec_and_test(&sysnames->usage)) { + for (i = 0; i < sysnames->nr; i++) + if (sysnames->subs[i] != afs_init_sysname && + sysnames->subs[i] != sysnames->blank) + kfree(sysnames->subs[i]); + } +} + +/* + * Handle opening of /proc/fs/afs/sysname. If it is opened for writing, we + * assume the caller wants to change the substitution list and we allocate a + * buffer to hold the list. + */ +static int afs_proc_sysname_open(struct inode *inode, struct file *file) +{ + struct afs_sysnames *sysnames; + struct seq_file *m; + int ret; + + ret = seq_open(file, &afs_proc_sysname_ops); + if (ret < 0) + return ret; + + if (file->f_mode & FMODE_WRITE) { + sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL); + if (!sysnames) { + seq_release(inode, file); + return -ENOMEM; + } + + refcount_set(&sysnames->usage, 1); + m = file->private_data; + m->private = sysnames; + } + + return 0; +} + +/* + * Handle writes to /proc/fs/afs/sysname to set the @sys substitution. + */ +static ssize_t afs_proc_sysname_write(struct file *file, + const char __user *buf, + size_t size, loff_t *_pos) +{ + struct afs_sysnames *sysnames; + struct seq_file *m = file->private_data; + char *kbuf = NULL, *s, *p, *sub; + int ret, len; + + sysnames = m->private; + if (!sysnames) + return -EINVAL; + if (sysnames->error) + return sysnames->error; + + if (size >= PAGE_SIZE - 1) { + sysnames->error = -EINVAL; + return -EINVAL; + } + if (size == 0) + return 0; + + kbuf = memdup_user_nul(buf, size); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + inode_lock(file_inode(file)); + + p = kbuf; + while ((s = strsep(&p, " \t\n"))) { + len = strlen(s); + if (len == 0) + continue; + ret = -ENAMETOOLONG; + if (len >= AFSNAMEMAX) + goto error; + + if (len >= 4 && + s[len - 4] == '@' && + s[len - 3] == 's' && + s[len - 2] == 'y' && + s[len - 1] == 's') + /* Protect against recursion */ + goto invalid; + + if (s[0] == '.' && + (len < 2 || (len == 2 && s[1] == '.'))) + goto invalid; + + if (memchr(s, '/', len)) + goto invalid; + + ret = -EFBIG; + if (sysnames->nr >= AFS_NR_SYSNAME) + goto out; + + if (strcmp(s, afs_init_sysname) == 0) { + sub = (char *)afs_init_sysname; + } else { + ret = -ENOMEM; + sub = kmemdup(s, len + 1, GFP_KERNEL); + if (!sub) + goto out; + } + + sysnames->subs[sysnames->nr] = sub; + sysnames->nr++; + } + + ret = size; /* consume everything, always */ +out: + inode_unlock(file_inode(file)); + kfree(kbuf); + return ret; + +invalid: + ret = -EINVAL; +error: + sysnames->error = ret; + goto out; +} + +static int afs_proc_sysname_release(struct inode *inode, struct file *file) +{ + struct afs_sysnames *sysnames, *kill = NULL; + struct seq_file *m = file->private_data; + struct afs_net *net = afs_seq2net(m); + + sysnames = m->private; + if (sysnames) { + if (!sysnames->error) { + kill = sysnames; + if (sysnames->nr == 0) { + sysnames->subs[0] = sysnames->blank; + sysnames->nr++; + } + write_lock(&net->sysnames_lock); + kill = net->sysnames; + net->sysnames = sysnames; + write_unlock(&net->sysnames_lock); + } + afs_put_sysnames(kill); + } + + return seq_release(inode, file); +} + +static void *afs_proc_sysname_start(struct seq_file *m, loff_t *pos) + __acquires(&net->sysnames_lock) +{ + struct afs_net *net = afs_seq2net(m); + struct afs_sysnames *names = net->sysnames; + + read_lock(&net->sysnames_lock); + + if (*pos >= names->nr) + return NULL; + return (void *)(unsigned long)(*pos + 1); +} + +static void *afs_proc_sysname_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct afs_net *net = afs_seq2net(m); + struct afs_sysnames *names = net->sysnames; + + *pos += 1; + if (*pos >= names->nr) + return NULL; + return (void *)(unsigned long)(*pos + 1); +} + +static void afs_proc_sysname_stop(struct seq_file *m, void *v) + __releases(&net->sysnames_lock) +{ + struct afs_net *net = afs_seq2net(m); + + read_unlock(&net->sysnames_lock); +} + +static int afs_proc_sysname_show(struct seq_file *m, void *v) +{ + struct afs_net *net = afs_seq2net(m); + struct afs_sysnames *sysnames = net->sysnames; + unsigned int i = (unsigned long)v - 1; + + if (i < sysnames->nr) + seq_printf(m, "%s\n", sysnames->subs[i]); + return 0; +} + +/* + * Display general per-net namespace statistics + */ +static int afs_proc_stats_show(struct seq_file *m, void *v) +{ + struct afs_net *net = afs_seq2net(m); + + seq_puts(m, "kAFS statistics\n"); + + seq_printf(m, "dir-mgmt: look=%u reval=%u inval=%u relpg=%u\n", + atomic_read(&net->n_lookup), + atomic_read(&net->n_reval), + atomic_read(&net->n_inval), + atomic_read(&net->n_relpg)); + + seq_printf(m, "dir-data: rdpg=%u\n", + atomic_read(&net->n_read_dir)); + + seq_printf(m, "dir-edit: cr=%u rm=%u\n", + atomic_read(&net->n_dir_cr), + atomic_read(&net->n_dir_rm)); + + seq_printf(m, "file-rd : n=%u nb=%lu\n", + atomic_read(&net->n_fetches), + atomic_long_read(&net->n_fetch_bytes)); + seq_printf(m, "file-wr : n=%u nb=%lu\n", + atomic_read(&net->n_stores), + atomic_long_read(&net->n_store_bytes)); + return 0; +} + +/* + * Open "/proc/fs/afs/stats" to allow reading of the stat counters. + */ +static int afs_proc_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, afs_proc_stats_show, NULL); +} + +static const struct file_operations afs_proc_stats_fops = { + .open = afs_proc_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index ad1328d..e065bc0 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -21,7 +21,7 @@ /* * Initialise a filesystem server cursor for iterating over FS servers. */ -void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode) +static void afs_init_fs_cursor(struct afs_fs_cursor *fc, struct afs_vnode *vnode) { memset(fc, 0, sizeof(*fc)); } @@ -179,7 +179,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) */ if (fc->flags & AFS_FS_CURSOR_VNOVOL) { fc->ac.error = -EREMOTEIO; - goto failed; + goto next_server; } write_lock(&vnode->volume->servers_lock); @@ -201,7 +201,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) */ if (vnode->volume->servers == fc->server_list) { fc->ac.error = -EREMOTEIO; - goto failed; + goto next_server; } /* Try again */ @@ -350,8 +350,8 @@ use_server: * break request before we've finished decoding the reply and * installing the vnode. */ - fc->ac.error = afs_register_server_cb_interest( - vnode, &fc->server_list->servers[fc->index]); + fc->ac.error = afs_register_server_cb_interest(vnode, fc->server_list, + fc->index); if (fc->ac.error < 0) goto failed; @@ -369,8 +369,16 @@ use_server: if (!test_bit(AFS_SERVER_FL_PROBED, &server->flags)) { fc->ac.alist = afs_get_addrlist(alist); - if (!afs_probe_fileserver(fc)) - goto failed; + if (!afs_probe_fileserver(fc)) { + switch (fc->ac.error) { + case -ENOMEM: + case -ERESTARTSYS: + case -EINTR: + goto failed; + default: + goto next_server; + } + } } if (!fc->ac.alist) diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index f7ae54b..0873594 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -41,6 +41,7 @@ int afs_open_socket(struct afs_net *net) { struct sockaddr_rxrpc srx; struct socket *socket; + unsigned int min_level; int ret; _enter(""); @@ -60,6 +61,12 @@ int afs_open_socket(struct afs_net *net) srx.transport.sin6.sin6_family = AF_INET6; srx.transport.sin6.sin6_port = htons(AFS_CM_PORT); + min_level = RXRPC_SECURITY_ENCRYPT; + ret = kernel_setsockopt(socket, SOL_RXRPC, RXRPC_MIN_SECURITY_LEVEL, + (void *)&min_level, sizeof(min_level)); + if (ret < 0) + goto error_2; + ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); if (ret == -EADDRINUSE) { srx.transport.sin6.sin6_port = 0; @@ -482,8 +489,12 @@ static void afs_deliver_to_call(struct afs_call *call) state = READ_ONCE(call->state); switch (ret) { case 0: - if (state == AFS_CALL_CL_PROC_REPLY) + if (state == AFS_CALL_CL_PROC_REPLY) { + if (call->cbi) + set_bit(AFS_SERVER_FL_MAY_HAVE_CB, + &call->cbi->server->flags); goto call_complete; + } ASSERTCMP(state, >, AFS_CALL_CL_PROC_REPLY); goto done; case -EINPROGRESS: @@ -493,11 +504,6 @@ static void afs_deliver_to_call(struct afs_call *call) case -ECONNABORTED: ASSERTCMP(state, ==, AFS_CALL_COMPLETE); goto done; - case -ENOTCONN: - abort_code = RX_CALL_DEAD; - rxrpc_kernel_abort_call(call->net->socket, call->rxcall, - abort_code, ret, "KNC"); - goto local_abort; case -ENOTSUPP: abort_code = RXGEN_OPCODE; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, @@ -926,3 +932,12 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count, afs_set_call_complete(call, ret, remote_abort); return ret; } + +/* + * Log protocol error production. + */ +noinline int afs_protocol_error(struct afs_call *call, int error) +{ + trace_afs_protocol_error(call, error, __builtin_return_address(0)); + return error; +} diff --git a/fs/afs/security.c b/fs/afs/security.c index b88b7d4..1992b0f 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -147,8 +147,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, break; } - if (cb_break != (vnode->cb_break + - vnode->cb_interest->server->cb_s_break)) { + if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest)) { changed = true; break; } @@ -178,18 +177,14 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, } } - if (cb_break != (vnode->cb_break + vnode->cb_interest->server->cb_s_break)) { - rcu_read_unlock(); + if (cb_break != afs_cb_break_sum(vnode, vnode->cb_interest)) goto someone_else_changed_it; - } /* We need a ref on any permits list we want to copy as we'll have to * drop the lock to do memory allocation. */ - if (permits && !refcount_inc_not_zero(&permits->usage)) { - rcu_read_unlock(); + if (permits && !refcount_inc_not_zero(&permits->usage)) goto someone_else_changed_it; - } rcu_read_unlock(); @@ -261,7 +256,7 @@ found: spin_lock(&vnode->lock); zap = rcu_access_pointer(vnode->permit_cache); - if (cb_break == (vnode->cb_break + vnode->cb_interest->server->cb_s_break) && + if (cb_break == afs_cb_break_sum(vnode, vnode->cb_interest) && zap == permits) rcu_assign_pointer(vnode->permit_cache, replacement); else @@ -278,6 +273,7 @@ someone_else_changed_it: /* Someone else changed the cache under us - don't recheck at this * time. */ + rcu_read_unlock(); return; } @@ -296,8 +292,6 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key, _enter("{%x:%u},%x", vnode->fid.vid, vnode->fid.vnode, key_serial(key)); - permits = vnode->permit_cache; - /* check the permits to see if we've got one yet */ if (key == vnode->volume->cell->anonymous_key) { _debug("anon"); @@ -327,7 +321,7 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key, */ _debug("no valid permit"); - ret = afs_fetch_status(vnode, key); + ret = afs_fetch_status(vnode, key, false); if (ret < 0) { *_access = 0; _leave(" = %d", ret); diff --git a/fs/afs/server.c b/fs/afs/server.c index a43ef77..3af4625 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -59,19 +59,14 @@ struct afs_server *afs_find_server(struct afs_net *net, alist = rcu_dereference(server->addresses); for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { b = &alist->addrs[i].transport.sin6; - diff = (u16)a->sin6_port - (u16)b->sin6_port; + diff = ((u16 __force)a->sin6_port - + (u16 __force)b->sin6_port); if (diff == 0) diff = memcmp(&a->sin6_addr, &b->sin6_addr, sizeof(struct in6_addr)); if (diff == 0) goto found; - if (diff < 0) { - // TODO: Sort the list - //if (i == alist->nr_ipv4) - // goto not_found; - break; - } } } } else { @@ -79,23 +74,17 @@ struct afs_server *afs_find_server(struct afs_net *net, alist = rcu_dereference(server->addresses); for (i = 0; i < alist->nr_ipv4; i++) { b = &alist->addrs[i].transport.sin6; - diff = (u16)a->sin6_port - (u16)b->sin6_port; + diff = ((u16 __force)a->sin6_port - + (u16 __force)b->sin6_port); if (diff == 0) - diff = ((u32)a->sin6_addr.s6_addr32[3] - - (u32)b->sin6_addr.s6_addr32[3]); + diff = ((u32 __force)a->sin6_addr.s6_addr32[3] - + (u32 __force)b->sin6_addr.s6_addr32[3]); if (diff == 0) goto found; - if (diff < 0) { - // TODO: Sort the list - //if (i == 0) - // goto not_found; - break; - } } } } - //not_found: server = NULL; found: if (server && !atomic_inc_not_zero(&server->usage)) @@ -381,7 +370,7 @@ static void afs_server_rcu(struct rcu_head *rcu) { struct afs_server *server = container_of(rcu, struct afs_server, rcu); - afs_put_addrlist(server->addresses); + afs_put_addrlist(rcu_access_pointer(server->addresses)); kfree(server); } @@ -390,17 +379,19 @@ static void afs_server_rcu(struct rcu_head *rcu) */ static void afs_destroy_server(struct afs_net *net, struct afs_server *server) { - struct afs_addr_list *alist = server->addresses; + struct afs_addr_list *alist = rcu_access_pointer(server->addresses); struct afs_addr_cursor ac = { .alist = alist, - .addr = &alist->addrs[0], .start = alist->index, - .index = alist->index, + .index = 0, + .addr = &alist->addrs[alist->index], .error = 0, }; _enter("%p", server); - afs_fs_give_up_all_callbacks(net, server, &ac, NULL); + if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags)) + afs_fs_give_up_all_callbacks(net, server, &ac, NULL); + call_rcu(&server->rcu, afs_server_rcu); afs_dec_servers_outstanding(net); } @@ -426,8 +417,15 @@ static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list) } write_sequnlock(&net->fs_lock); - if (deleted) + if (deleted) { + write_seqlock(&net->fs_addr_lock); + if (!hlist_unhashed(&server->addr4_link)) + hlist_del_rcu(&server->addr4_link); + if (!hlist_unhashed(&server->addr6_link)) + hlist_del_rcu(&server->addr6_link); + write_sequnlock(&net->fs_addr_lock); afs_destroy_server(net, server); + } } } diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index 0f8dc4c..8a5760a 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -49,6 +49,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, goto error; refcount_set(&slist->usage, 1); + rwlock_init(&slist->lock); /* Make sure a records exists for each server in the list. */ for (i = 0; i < vldb->nr_servers; i++) { @@ -64,9 +65,11 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell, goto error_2; } - /* Insertion-sort by server pointer */ + /* Insertion-sort by UUID */ for (j = 0; j < slist->nr_servers; j++) - if (slist->servers[j].server >= server) + if (memcmp(&slist->servers[j].server->uuid, + &server->uuid, + sizeof(server->uuid)) >= 0) break; if (j < slist->nr_servers) { if (slist->servers[j].server == server) { diff --git a/fs/afs/super.c b/fs/afs/super.c index 3623c95..9e5d796 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -154,7 +154,7 @@ static int afs_show_devname(struct seq_file *m, struct dentry *root) seq_puts(m, "none"); return 0; } - + switch (volume->type) { case AFSVL_RWVOL: break; @@ -269,7 +269,7 @@ static int afs_parse_device_name(struct afs_mount_params *params, int cellnamesz; _enter(",%s", name); - + if (!name) { printk(KERN_ERR "kAFS: no volume name specified\n"); return -EINVAL; @@ -418,7 +418,10 @@ static int afs_fill_super(struct super_block *sb, if (!sb->s_root) goto error; - sb->s_d_op = &afs_fs_dentry_operations; + if (params->dyn_root) + sb->s_d_op = &afs_dynroot_dentry_operations; + else + sb->s_d_op = &afs_fs_dentry_operations; _leave(" = 0"); return 0; @@ -587,7 +590,7 @@ static void afs_i_init_once(void *_vnode) memset(vnode, 0, sizeof(*vnode)); inode_init_once(&vnode->vfs_inode); mutex_init(&vnode->io_lock); - mutex_init(&vnode->validate_lock); + init_rwsem(&vnode->validate_lock); spin_lock_init(&vnode->wb_lock); spin_lock_init(&vnode->lock); INIT_LIST_HEAD(&vnode->wb_keys); @@ -676,7 +679,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bfree = 0; return 0; } - + key = afs_request_key(vnode->volume->cell); if (IS_ERR(key)) return PTR_ERR(key); @@ -685,7 +688,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf) if (afs_begin_vnode_operation(&fc, vnode, key)) { fc.flags |= AFS_FS_CURSOR_NO_VSLEEP; while (afs_select_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_get_volume_status(&fc, &vs); } diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index 5d8562f..1ed7e2f 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -303,7 +303,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_net *net, r->uuid.clock_seq_hi_and_reserved = htonl(u->clock_seq_hi_and_reserved); r->uuid.clock_seq_low = htonl(u->clock_seq_low); for (i = 0; i < 6; i++) - r->uuid.node[i] = ntohl(u->node[i]); + r->uuid.node[i] = htonl(u->node[i]); trace_afs_make_vl_call(call); return (struct afs_addr_list *)afs_make_call(ac, call, GFP_KERNEL, false); @@ -450,7 +450,7 @@ again: call->count2 = ntohl(*bp); /* Type or next count */ if (call->count > YFS_MAXENDPOINTS) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT); if (!alist) @@ -474,7 +474,7 @@ again: size = sizeof(__be32) * (1 + 4 + 1); break; default: - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); } size += sizeof(__be32); @@ -487,24 +487,24 @@ again: switch (call->count2) { case YFS_ENDPOINT_IPV4: if (ntohl(bp[0]) != sizeof(__be32) * 2) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2])); bp += 3; break; case YFS_ENDPOINT_IPV6: if (ntohl(bp[0]) != sizeof(__be32) * 5) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5])); bp += 6; break; default: - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); } /* Got either the type of the next entry or the count of * volEndpoints if no more fsEndpoints. */ - call->count2 = htonl(*bp++); + call->count2 = ntohl(*bp++); call->offset = 0; call->count--; @@ -517,7 +517,7 @@ again: if (!call->count) goto end; if (call->count > YFS_MAXENDPOINTS) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); call->unmarshall = 3; @@ -531,7 +531,7 @@ again: return ret; bp = call->buffer; - call->count2 = htonl(*bp++); + call->count2 = ntohl(*bp++); call->offset = 0; call->unmarshall = 4; @@ -545,7 +545,7 @@ again: size = sizeof(__be32) * (1 + 4 + 1); break; default: - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); } if (call->count > 1) @@ -558,16 +558,16 @@ again: switch (call->count2) { case YFS_ENDPOINT_IPV4: if (ntohl(bp[0]) != sizeof(__be32) * 2) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); bp += 3; break; case YFS_ENDPOINT_IPV6: if (ntohl(bp[0]) != sizeof(__be32) * 5) - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); bp += 6; break; default: - return -EBADMSG; + return afs_protocol_error(call, -EBADMSG); } /* Got either the type of the next entry or the count of @@ -576,7 +576,7 @@ again: call->offset = 0; call->count--; if (call->count > 0) { - call->count2 = htonl(*bp++); + call->count2 = ntohl(*bp++); goto again; } diff --git a/fs/afs/write.c b/fs/afs/write.c index dbc3c0b..8b39e6e 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -42,10 +42,11 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, if (!req) return -ENOMEM; - atomic_set(&req->usage, 1); + refcount_set(&req->usage, 1); req->pos = pos; req->len = len; req->nr_pages = 1; + req->pages = req->array; req->pages[0] = page; get_page(page); @@ -124,7 +125,12 @@ try_again: page->index, priv); goto flush_conflicting_write; } - if (to < f || from > t) + /* If the file is being filled locally, allow inter-write + * spaces to be merged into writes. If it's not, only write + * back what the user gives us. + */ + if (!test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags) && + (to < f || from > t)) goto flush_conflicting_write; if (from < f) f = from; @@ -345,7 +351,7 @@ found_key: ret = -ERESTARTSYS; if (afs_begin_vnode_operation(&fc, vnode, wbk->key)) { while (afs_select_fileserver(&fc)) { - fc.cb_break = vnode->cb_break + vnode->cb_s_break; + fc.cb_break = afs_calc_vnode_cb_break(vnode); afs_fs_store_data(&fc, mapping, first, last, offset, to); } @@ -355,6 +361,12 @@ found_key: } switch (ret) { + case 0: + afs_stat_v(vnode, n_stores); + atomic_long_add((last * PAGE_SIZE + to) - + (first * PAGE_SIZE + offset), + &afs_v2net(vnode)->n_store_bytes); + break; case -EACCES: case -EPERM: case -ENOKEY: @@ -412,7 +424,8 @@ static int afs_write_back_from_locked_page(struct address_space *mapping, trace_afs_page_dirty(vnode, tracepoint_string("WARN"), primary_page->index, priv); - if (start >= final_page || to < PAGE_SIZE) + if (start >= final_page || + (to < PAGE_SIZE && !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags))) goto no_more; start++; @@ -433,9 +446,10 @@ static int afs_write_back_from_locked_page(struct address_space *mapping, } for (loop = 0; loop < n; loop++) { - if (to != PAGE_SIZE) - break; page = pages[loop]; + if (to != PAGE_SIZE && + !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) + break; if (page->index > final_page) break; if (!trylock_page(page)) @@ -448,7 +462,8 @@ static int afs_write_back_from_locked_page(struct address_space *mapping, priv = page_private(page); f = priv & AFS_PRIV_MAX; t = priv >> AFS_PRIV_SHIFT; - if (f != 0) { + if (f != 0 && + !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) { unlock_page(page); break; } @@ -735,20 +750,6 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) } /* - * Flush out all outstanding writes on a file opened for writing when it is - * closed. - */ -int afs_flush(struct file *file, fl_owner_t id) -{ - _enter(""); - - if ((file->f_mode & FMODE_WRITE) == 0) - return 0; - - return vfs_fsync(file, 0); -} - -/* * notification that a previously read-only page is about to become writable * - if it returns an error, the caller will deliver a bus error signal */ diff --git a/fs/afs/xdr_fs.h b/fs/afs/xdr_fs.h new file mode 100644 index 0000000..aa21f30 --- /dev/null +++ b/fs/afs/xdr_fs.h @@ -0,0 +1,103 @@ +/* AFS fileserver XDR types + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef XDR_FS_H +#define XDR_FS_H + +struct afs_xdr_AFSFetchStatus { + __be32 if_version; +#define AFS_FSTATUS_VERSION 1 + __be32 type; + __be32 nlink; + __be32 size_lo; + __be32 data_version_lo; + __be32 author; + __be32 owner; + __be32 caller_access; + __be32 anon_access; + __be32 mode; + __be32 parent_vnode; + __be32 parent_unique; + __be32 seg_size; + __be32 mtime_client; + __be32 mtime_server; + __be32 group; + __be32 sync_counter; + __be32 data_version_hi; + __be32 lock_count; + __be32 size_hi; + __be32 abort_code; +} __packed; + +#define AFS_DIR_HASHTBL_SIZE 128 +#define AFS_DIR_DIRENT_SIZE 32 +#define AFS_DIR_SLOTS_PER_BLOCK 64 +#define AFS_DIR_BLOCK_SIZE 2048 +#define AFS_DIR_BLOCKS_PER_PAGE (PAGE_SIZE / AFS_DIR_BLOCK_SIZE) +#define AFS_DIR_MAX_SLOTS 65536 +#define AFS_DIR_BLOCKS_WITH_CTR 128 +#define AFS_DIR_MAX_BLOCKS 1023 +#define AFS_DIR_RESV_BLOCKS 1 +#define AFS_DIR_RESV_BLOCKS0 13 + +/* + * Directory entry structure. + */ +union afs_xdr_dirent { + struct { + u8 valid; + u8 unused[1]; + __be16 hash_next; + __be32 vnode; + __be32 unique; + u8 name[16]; + u8 overflow[4]; /* if any char of the name (inc + * NUL) reaches here, consume + * the next dirent too */ + } u; + u8 extended_name[32]; +} __packed; + +/* + * Directory block header (one at the beginning of every 2048-byte block). + */ +struct afs_xdr_dir_hdr { + __be16 npages; + __be16 magic; +#define AFS_DIR_MAGIC htons(1234) + u8 reserved; + u8 bitmap[8]; + u8 pad[19]; +} __packed; + +/* + * Directory block layout + */ +union afs_xdr_dir_block { + struct afs_xdr_dir_hdr hdr; + + struct { + struct afs_xdr_dir_hdr hdr; + u8 alloc_ctrs[AFS_DIR_MAX_BLOCKS]; + __be16 hashtable[AFS_DIR_HASHTBL_SIZE]; + } meta; + + union afs_xdr_dirent dirents[AFS_DIR_SLOTS_PER_BLOCK]; +} __packed; + +/* + * Directory layout on a linux VM page. + */ +struct afs_xdr_dir_page { + union afs_xdr_dir_block blocks[AFS_DIR_BLOCKS_PER_PAGE]; +}; + +#endif /* XDR_FS_H */ diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 82e8f6e..b12e37f 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -749,7 +749,7 @@ static int autofs4_dir_mkdir(struct inode *dir, autofs4_del_active(dentry); - inode = autofs4_get_inode(dir->i_sb, S_IFDIR | 0555); + inode = autofs4_get_inode(dir->i_sb, S_IFDIR | mode); if (!inode) return -ENOMEM; d_add(dentry, inode); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 41e0418..4ad6f66 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -377,10 +377,10 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, } else map_addr = vm_mmap(filep, addr, size, prot, type, off); - if ((type & MAP_FIXED_NOREPLACE) && BAD_ADDR(map_addr)) - pr_info("%d (%s): Uhuuh, elf segment at %p requested but the memory is mapped already\n", - task_pid_nr(current), current->comm, - (void *)addr); + if ((type & MAP_FIXED_NOREPLACE) && + PTR_ERR((void *)map_addr) == -EEXIST) + pr_info("%d (%s): Uhuuh, elf segment at %px requested but the memory is mapped already\n", + task_pid_nr(current), current->comm, (void *)addr); return(map_addr); } diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 167e5dc..23537bc 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + config BTRFS_FS tristate "Btrfs filesystem support" select LIBCRC32C diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 0066d95..15e1dfe 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Red Hat. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index d554074..d522494 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. * Copyright (C) 2014 Fujitsu. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/kthread.h> diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index fc957e0..7861c9f 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -1,24 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. * Copyright (C) 2014 Fujitsu. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_ASYNC_THREAD_ -#define __BTRFS_ASYNC_THREAD_ +#ifndef BTRFS_ASYNC_THREAD_H +#define BTRFS_ASYNC_THREAD_H + #include <linux/workqueue.h> struct btrfs_fs_info; @@ -85,4 +73,5 @@ void btrfs_set_work_high_priority(struct btrfs_work *work); struct btrfs_fs_info *btrfs_work_owner(const struct btrfs_work *work); struct btrfs_fs_info *btrfs_workqueue_owner(const struct __btrfs_workqueue *wq); bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq); + #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 571024b..0a8e2e2 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2011 STRATO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/mm.h> diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 0a30028..54d5898 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2011 STRATO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_BACKREF__ -#define __BTRFS_BACKREF__ +#ifndef BTRFS_BACKREF_H +#define BTRFS_BACKREF_H #include <linux/btrfs.h> #include "ulist.h" diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ca15be5..234bae5 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_I__ -#define __BTRFS_I__ +#ifndef BTRFS_INODE_H +#define BTRFS_INODE_H #include <linux/hash.h> #include "extent_map.h" diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 3baebbc..dc062b1 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) STRATO AG 2011. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ /* diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h index 2de58a9..9bf4359 100644 --- a/fs/btrfs/check-integrity.h +++ b/fs/btrfs/check-integrity.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) STRATO AG 2011. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#if !defined(__BTRFS_CHECK_INTEGRITY__) -#define __BTRFS_CHECK_INTEGRITY__ +#ifndef BTRFS_CHECK_INTEGRITY_H +#define BTRFS_CHECK_INTEGRITY_H #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 578181c..1061575 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/kernel.h> diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index ce79655..cc605f7 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_COMPRESSION_ -#define __BTRFS_COMPRESSION_ +#ifndef BTRFS_COMPRESSION_H +#define BTRFS_COMPRESSION_H /* * We want to make sure that amount of RAM required to uncompress an extent is diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a2c9d21..8c68961 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007,2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> @@ -2449,10 +2436,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (p->reada != READA_NONE) reada_for_search(fs_info, p, level, slot, key->objectid); - btrfs_release_path(p); - ret = -EAGAIN; - tmp = read_tree_block(fs_info, blocknr, 0, parent_level - 1, + tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1, &first_key); if (!IS_ERR(tmp)) { /* @@ -2467,6 +2452,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, } else { ret = PTR_ERR(tmp); } + + btrfs_release_path(p); return ret; } @@ -5427,12 +5414,24 @@ int btrfs_compare_trees(struct btrfs_root *left_root, down_read(&fs_info->commit_root_sem); left_level = btrfs_header_level(left_root->commit_root); left_root_level = left_level; - left_path->nodes[left_level] = left_root->commit_root; + left_path->nodes[left_level] = + btrfs_clone_extent_buffer(left_root->commit_root); + if (!left_path->nodes[left_level]) { + up_read(&fs_info->commit_root_sem); + ret = -ENOMEM; + goto out; + } extent_buffer_get(left_path->nodes[left_level]); right_level = btrfs_header_level(right_root->commit_root); right_root_level = right_level; - right_path->nodes[right_level] = right_root->commit_root; + right_path->nodes[right_level] = + btrfs_clone_extent_buffer(right_root->commit_root); + if (!right_path->nodes[right_level]) { + up_read(&fs_info->commit_root_sem); + ret = -ENOMEM; + goto out; + } extent_buffer_get(right_path->nodes[right_level]); up_read(&fs_info->commit_root_sem); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0eb5582..0d422c9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_CTREE__ -#define __BTRFS_CTREE__ +#ifndef BTRFS_CTREE_H +#define BTRFS_CTREE_H #include <linux/mm.h> #include <linux/sched/signal.h> @@ -472,6 +459,25 @@ struct btrfs_block_rsv { unsigned short full; unsigned short type; unsigned short failfast; + + /* + * Qgroup equivalent for @size @reserved + * + * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care + * about things like csum size nor how many tree blocks it will need to + * reserve. + * + * Qgroup cares more about net change of the extent usage. + * + * So for one newly inserted file extent, in worst case it will cause + * leaf split and level increase, nodesize for each file extent is + * already too much. + * + * In short, qgroup_size/reserved is the upper limit of possible needed + * qgroup metadata reservation. + */ + u64 qgroup_rsv_size; + u64 qgroup_rsv_reserved; }; /* @@ -727,6 +733,12 @@ struct btrfs_delayed_root; */ #define BTRFS_FS_EXCL_OP 16 +/* + * To info transaction_kthread we need an immediate commit so it doesn't + * need to wait for commit_interval + */ +#define BTRFS_FS_NEED_ASYNC_COMMIT 17 + struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; @@ -3170,6 +3182,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes); +void __btrfs_del_delalloc_inode(struct btrfs_root *root, + struct btrfs_inode *inode); struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, @@ -3752,4 +3766,5 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) #endif return 0; } + #endif diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h index 83ebfe2..90281a7 100644 --- a/fs/btrfs/dedupe.h +++ b/fs/btrfs/dedupe.h @@ -1,24 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2016 Fujitsu. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_DEDUPE__ -#define __BTRFS_DEDUPE__ +#ifndef BTRFS_DEDUPE_H +#define BTRFS_DEDUPE_H /* later in-band dedupe will expand this struct */ struct btrfs_dedupe_hash; + #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 86ec2ed..a8d492db 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2011 Fujitsu. All rights reserved. * Written by Miao Xie <miaox@cn.fujitsu.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/slab.h> @@ -569,6 +556,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, dst_rsv = &fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); + + /* + * Here we migrate space rsv from transaction rsv, since have already + * reserved space when starting a transaction. So no need to reserve + * qgroup space here. + */ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); if (!ret) { trace_btrfs_space_reservation(fs_info, "delayed_item", @@ -590,7 +583,10 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, return; rsv = &fs_info->delayed_block_rsv; - btrfs_qgroup_convert_reserved_meta(root, item->bytes_reserved); + /* + * Check btrfs_delayed_item_reserve_metadata() to see why we don't need + * to release/reserve qgroup space. + */ trace_btrfs_space_reservation(fs_info, "delayed_item", item->key.objectid, item->bytes_reserved, 0); @@ -615,9 +611,6 @@ static int btrfs_delayed_inode_reserve_metadata( num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); - if (ret < 0) - return ret; /* * btrfs_dirty_inode will update the inode under btrfs_join_transaction * which doesn't reserve space for speed. This is a problem since we @@ -629,6 +622,10 @@ static int btrfs_delayed_inode_reserve_metadata( */ if (!src_rsv || (!trans->bytes_reserved && src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { + ret = btrfs_qgroup_reserve_meta_prealloc(root, + fs_info->nodesize, true); + if (ret < 0) + return ret; ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, BTRFS_RESERVE_NO_FLUSH); /* @@ -647,6 +644,8 @@ static int btrfs_delayed_inode_reserve_metadata( "delayed_inode", btrfs_ino(inode), num_bytes, 1); + } else { + btrfs_qgroup_free_meta_prealloc(root, fs_info->nodesize); } return ret; } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 100a91e..ca7a97f 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -1,24 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2011 Fujitsu. All rights reserved. * Written by Miao Xie <miaox@cn.fujitsu.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __DELAYED_TREE_OPERATION_H -#define __DELAYED_TREE_OPERATION_H +#ifndef BTRFS_DELAYED_INODE_H +#define BTRFS_DELAYED_INODE_H #include <linux/rbtree.h> #include <linux/spinlock.h> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 2677257..e1b0651 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2009 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> @@ -553,8 +540,10 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, - int action, int is_data, int *qrecord_inserted_ret, + int action, int is_data, int is_system, + int *qrecord_inserted_ret, int *old_ref_mod, int *new_ref_mod) + { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; @@ -598,6 +587,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, head_ref->ref_mod = count_mod; head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; + head_ref->is_system = is_system; head_ref->ref_tree = RB_ROOT; INIT_LIST_HEAD(&head_ref->ref_add_list); RB_CLEAR_NODE(&head_ref->href_node); @@ -785,6 +775,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; int qrecord_inserted; + int is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID); BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); @@ -813,8 +804,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, */ head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, bytenr, num_bytes, 0, 0, action, 0, - &qrecord_inserted, old_ref_mod, - new_ref_mod); + is_system, &qrecord_inserted, + old_ref_mod, new_ref_mod); add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, level, action); @@ -881,7 +872,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, */ head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, bytenr, num_bytes, ref_root, reserved, - action, 1, &qrecord_inserted, + action, 1, 0, &qrecord_inserted, old_ref_mod, new_ref_mod); add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, @@ -911,9 +902,14 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); + /* + * extent_ops just modify the flags of an extent and they don't result + * in ref count changes, hence it's safe to pass false/0 for is_system + * argument + */ add_delayed_ref_head(fs_info, trans, head_ref, NULL, bytenr, num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, - extent_op->is_data, NULL, NULL, NULL); + extent_op->is_data, 0, NULL, NULL, NULL); spin_unlock(&delayed_refs->lock); return 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 9e3e5af..7f00db5 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -1,22 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __DELAYED_REF__ -#define __DELAYED_REF__ + +#ifndef BTRFS_DELAYED_REF_H +#define BTRFS_DELAYED_REF_H #include <linux/refcount.h> @@ -139,6 +127,7 @@ struct btrfs_delayed_ref_head { */ unsigned int must_insert_reserved:1; unsigned int is_data:1; + unsigned int is_system:1; unsigned int processing:1; }; @@ -298,4 +287,5 @@ btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) { return container_of(node, struct btrfs_delayed_data_ref, node); } + #endif diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 0d20363..f82be26 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) STRATO AG 2012. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ + #include <linux/sched.h> #include <linux/bio.h> #include <linux/slab.h> diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 8566a02..b6d4206 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) STRATO AG 2012. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#if !defined(__BTRFS_DEV_REPLACE__) -#define __BTRFS_DEV_REPLACE__ +#ifndef BTRFS_DEV_REPLACE_H +#define BTRFS_DEV_REPLACE_H struct btrfs_ioctl_dev_replace_args; @@ -48,4 +35,5 @@ static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value) { atomic64_inc(stat_value); } + #endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 29e967b..39e9766 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include "ctree.h" diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07b5e6f..c3504b4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> @@ -449,6 +436,14 @@ static int verify_level_key(struct btrfs_fs_info *fs_info, if (!first_key) return 0; + /* + * For live tree block (new tree blocks in current transaction), + * we need proper lock context to avoid race, which is impossible here. + * So we only checks tree blocks which is read from disk, whose + * generation <= fs_info->last_trans_committed. + */ + if (btrfs_header_generation(eb) > fs_info->last_trans_committed) + return 0; if (found_level) btrfs_node_key_to_cpu(eb, &found_key, 0); else @@ -1829,6 +1824,7 @@ static int transaction_kthread(void *arg) now = get_seconds(); if (cur->state < TRANS_STATE_BLOCKED && + !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) && (now < cur->start_time || now - cur->start_time < fs_info->commit_interval)) { spin_unlock(&fs_info->trans_lock); @@ -3812,7 +3808,8 @@ void close_ctree(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "commit super ret %d", ret); } - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) || + test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) btrfs_error_commit_super(fs_info); kthread_stop(fs_info->transaction_kthread); @@ -3821,6 +3818,7 @@ void close_ctree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags); btrfs_free_qgroup_config(fs_info); + ASSERT(list_empty(&fs_info->delalloc_roots)); if (percpu_counter_sum(&fs_info->delalloc_bytes)) { btrfs_info(fs_info, "at unmount delalloc count %lld", @@ -4128,15 +4126,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info) static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) { + /* cleanup FS via transaction */ + btrfs_cleanup_transaction(fs_info); + mutex_lock(&fs_info->cleaner_mutex); btrfs_run_delayed_iputs(fs_info); mutex_unlock(&fs_info->cleaner_mutex); down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); - - /* cleanup FS via transaction */ - btrfs_cleanup_transaction(fs_info); } static void btrfs_destroy_ordered_extents(struct btrfs_root *root) @@ -4261,19 +4259,23 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { + struct inode *inode = NULL; btrfs_inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes); - - list_del_init(&btrfs_inode->delalloc_inodes); - clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &btrfs_inode->runtime_flags); + __btrfs_del_delalloc_inode(root, btrfs_inode); spin_unlock(&root->delalloc_lock); - btrfs_invalidate_inodes(btrfs_inode->root); - + /* + * Make sure we get a live inode and that it'll not disappear + * meanwhile. + */ + inode = igrab(&btrfs_inode->vfs_inode); + if (inode) { + invalidate_inode_pages2(inode->i_mapping); + iput(inode); + } spin_lock(&root->delalloc_lock); } - spin_unlock(&root->delalloc_lock); } @@ -4289,7 +4291,6 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) while (!list_empty(&splice)) { root = list_first_entry(&splice, struct btrfs_root, delalloc_root); - list_del_init(&root->delalloc_root); root = btrfs_grab_fs_root(root); BUG_ON(!root); spin_unlock(&fs_info->delalloc_root_lock); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 453ea9f..1a3d277 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __DISKIO__ -#define __DISKIO__ +#ifndef BTRFS_DISK_IO_H +#define BTRFS_DISK_IO_H #define BTRFS_SUPER_INFO_OFFSET SZ_64K #define BTRFS_SUPER_INFO_SIZE 4096 @@ -169,4 +156,5 @@ static inline void btrfs_set_buffer_lockdep_class(u64 objectid, { } #endif + #endif diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index ddaccad..1f3755b 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 + #include <linux/fs.h> #include <linux/types.h> #include "ctree.h" diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index 91b3908..57488ec 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ + #ifndef BTRFS_EXPORT_H #define BTRFS_EXPORT_H diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e08d0d4..51b5e2da 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ + #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/pagemap.h> @@ -2613,13 +2601,19 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, trace_run_delayed_ref_head(fs_info, head, 0); if (head->total_ref_mod < 0) { - struct btrfs_block_group_cache *cache; + struct btrfs_space_info *space_info; + u64 flags; - cache = btrfs_lookup_block_group(fs_info, head->bytenr); - ASSERT(cache); - percpu_counter_add(&cache->space_info->total_bytes_pinned, + if (head->is_data) + flags = BTRFS_BLOCK_GROUP_DATA; + else if (head->is_system) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + space_info = __find_space_info(fs_info, flags); + ASSERT(space_info); + percpu_counter_add(&space_info->total_bytes_pinned, -head->num_bytes); - btrfs_put_block_group(cache); if (head->is_data) { spin_lock(&delayed_refs->lock); @@ -3148,7 +3142,11 @@ static noinline int check_delayed_ref(struct btrfs_root *root, struct rb_node *node; int ret = 0; + spin_lock(&root->fs_info->trans_lock); cur_trans = root->fs_info->running_transaction; + if (cur_trans) + refcount_inc(&cur_trans->use_count); + spin_unlock(&root->fs_info->trans_lock); if (!cur_trans) return 0; @@ -3157,6 +3155,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (!head) { spin_unlock(&delayed_refs->lock); + btrfs_put_transaction(cur_trans); return 0; } @@ -3173,6 +3172,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, mutex_lock(&head->mutex); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); + btrfs_put_transaction(cur_trans); return -EAGAIN; } spin_unlock(&delayed_refs->lock); @@ -3205,6 +3205,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, } spin_unlock(&head->lock); mutex_unlock(&head->mutex); + btrfs_put_transaction(cur_trans); return ret; } @@ -4642,6 +4643,7 @@ again: if (wait_for_alloc) { mutex_unlock(&fs_info->chunk_mutex); wait_for_alloc = 0; + cond_resched(); goto again; } @@ -5570,14 +5572,18 @@ again: static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, - struct btrfs_block_rsv *dest, u64 num_bytes) + struct btrfs_block_rsv *dest, u64 num_bytes, + u64 *qgroup_to_release_ret) { struct btrfs_space_info *space_info = block_rsv->space_info; + u64 qgroup_to_release = 0; u64 ret; spin_lock(&block_rsv->lock); - if (num_bytes == (u64)-1) + if (num_bytes == (u64)-1) { num_bytes = block_rsv->size; + qgroup_to_release = block_rsv->qgroup_rsv_size; + } block_rsv->size -= num_bytes; if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; @@ -5586,6 +5592,13 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, } else { num_bytes = 0; } + if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { + qgroup_to_release = block_rsv->qgroup_rsv_reserved - + block_rsv->qgroup_rsv_size; + block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size; + } else { + qgroup_to_release = 0; + } spin_unlock(&block_rsv->lock); ret = num_bytes; @@ -5608,6 +5621,8 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, space_info_add_old_bytes(fs_info, space_info, num_bytes); } + if (qgroup_to_release_ret) + *qgroup_to_release_ret = qgroup_to_release; return ret; } @@ -5749,17 +5764,21 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, struct btrfs_root *root = inode->root; struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 num_bytes = 0; + u64 qgroup_num_bytes = 0; int ret = -ENOSPC; spin_lock(&block_rsv->lock); if (block_rsv->reserved < block_rsv->size) num_bytes = block_rsv->size - block_rsv->reserved; + if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size) + qgroup_num_bytes = block_rsv->qgroup_rsv_size - + block_rsv->qgroup_rsv_reserved; spin_unlock(&block_rsv->lock); if (num_bytes == 0) return 0; - ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true); if (ret) return ret; ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); @@ -5767,7 +5786,13 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, block_rsv_add_bytes(block_rsv, num_bytes, 0); trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), num_bytes, 1); - } + + /* Don't forget to increase qgroup_rsv_reserved */ + spin_lock(&block_rsv->lock); + block_rsv->qgroup_rsv_reserved += qgroup_num_bytes; + spin_unlock(&block_rsv->lock); + } else + btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); return ret; } @@ -5788,20 +5813,23 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 released = 0; + u64 qgroup_to_release = 0; /* * Since we statically set the block_rsv->size we just want to say we * are releasing 0 bytes, and then we'll just get the reservation over * the size free'd. */ - released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0); + released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0, + &qgroup_to_release); if (released > 0) trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), released, 0); if (qgroup_free) - btrfs_qgroup_free_meta_prealloc(inode->root, released); + btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release); else - btrfs_qgroup_convert_reserved_meta(inode->root, released); + btrfs_qgroup_convert_reserved_meta(inode->root, + qgroup_to_release); } void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, @@ -5813,7 +5841,7 @@ void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, if (global_rsv == block_rsv || block_rsv->space_info != global_rsv->space_info) global_rsv = NULL; - block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes); + block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL); } static void update_global_block_rsv(struct btrfs_fs_info *fs_info) @@ -5893,7 +5921,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) static void release_global_block_rsv(struct btrfs_fs_info *fs_info) { block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, - (u64)-1); + (u64)-1, NULL); WARN_ON(fs_info->trans_block_rsv.size > 0); WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); @@ -5917,7 +5945,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) WARN_ON_ONCE(!list_empty(&trans->new_bgs)); block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, - trans->chunk_bytes_reserved); + trans->chunk_bytes_reserved, NULL); trans->chunk_bytes_reserved = 0; } @@ -6022,6 +6050,7 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, { struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 reserve_size = 0; + u64 qgroup_rsv_size = 0; u64 csum_leaves; unsigned outstanding_extents; @@ -6034,9 +6063,17 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, inode->csum_bytes); reserve_size += btrfs_calc_trans_metadata_size(fs_info, csum_leaves); + /* + * For qgroup rsv, the calculation is very simple: + * account one nodesize for each outstanding extent + * + * This is overestimating in most cases. + */ + qgroup_rsv_size = outstanding_extents * fs_info->nodesize; spin_lock(&block_rsv->lock); block_rsv->size = reserve_size; + block_rsv->qgroup_rsv_size = qgroup_rsv_size; spin_unlock(&block_rsv->lock); } @@ -8414,7 +8451,7 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u32 blocksize) { block_rsv_add_bytes(block_rsv, blocksize, 0); - block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); + block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL); } /* diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cf87976..e99b329 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 + #include <linux/bitops.h> #include <linux/slab.h> #include <linux/bio.h> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b77d849..a530096 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __EXTENTIO__ -#define __EXTENTIO__ + +#ifndef BTRFS_EXTENT_IO_H +#define BTRFS_EXTENT_IO_H #include <linux/rbtree.h> #include <linux/refcount.h> @@ -572,4 +573,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode, #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); + #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 53a0633..1b8a078 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 + #include <linux/err.h> #include <linux/slab.h> #include <linux/spinlock.h> diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index f6f8ba1..5fcb80a 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __EXTENTMAP__ -#define __EXTENTMAP__ + +#ifndef BTRFS_EXTENT_MAP_H +#define BTRFS_EXTENT_MAP_H #include <linux/rbtree.h> #include <linux/refcount.h> @@ -93,4 +94,5 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); int btrfs_add_extent_mapping(struct extent_map_tree *em_tree, struct extent_map **em_in, u64 start, u64 len); + #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index fdcb410..f9dd6d1 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/bio.h> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f247300..f660ba1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> @@ -1761,7 +1748,7 @@ again: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes, - (ret != 0)); + true); if (ret) { btrfs_drop_pages(pages, num_pages); break; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d0dde9e..e5b569b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Red Hat. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/pagemap.h> diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 79eca4c..15e30b9 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2009 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_FREE_SPACE_CACHE -#define __BTRFS_FREE_SPACE_CACHE +#ifndef BTRFS_FREE_SPACE_CACHE_H +#define BTRFS_FREE_SPACE_CACHE_H struct btrfs_free_space { struct rb_node offset_index; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index af36a6a..32a0f6c 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015 Facebook. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/kernel.h> diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index ba3787d..874b4fe 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2015 Facebook. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_FREE_SPACE_TREE -#define __BTRFS_FREE_SPACE_TREE +#ifndef BTRFS_FREE_SPACE_TREE_H +#define BTRFS_FREE_SPACE_TREE_H /* * The default size for new free space bitmap items. The last bitmap in a block diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 1d5631e..a8956a3 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include "ctree.h" diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 9409dcc..12fcd88 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/delay.h> diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h index 6734ec9..7a96281 100644 --- a/fs/btrfs/inode-map.h +++ b/fs/btrfs/inode-map.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __BTRFS_INODE_MAP -#define __BTRFS_INODE_MAP + +#ifndef BTRFS_INODE_MAP_H +#define BTRFS_INODE_MAP_H void btrfs_init_free_ino_ctl(struct btrfs_root *root); void btrfs_unpin_free_ino(struct btrfs_root *root); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1f091c2..8e604e7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/kernel.h> @@ -44,6 +31,7 @@ #include <linux/uio.h> #include <linux/magic.h> #include <linux/iversion.h> +#include <asm/unaligned.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -1754,12 +1742,12 @@ static void btrfs_add_delalloc_inodes(struct btrfs_root *root, spin_unlock(&root->delalloc_lock); } -static void btrfs_del_delalloc_inode(struct btrfs_root *root, - struct btrfs_inode *inode) + +void __btrfs_del_delalloc_inode(struct btrfs_root *root, + struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - spin_lock(&root->delalloc_lock); if (!list_empty(&inode->delalloc_inodes)) { list_del_init(&inode->delalloc_inodes); clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, @@ -1772,6 +1760,13 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, spin_unlock(&fs_info->delalloc_root_lock); } } +} + +static void btrfs_del_delalloc_inode(struct btrfs_root *root, + struct btrfs_inode *inode) +{ + spin_lock(&root->delalloc_lock); + __btrfs_del_delalloc_inode(root, inode); spin_unlock(&root->delalloc_lock); } @@ -5918,11 +5913,13 @@ static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx) struct dir_entry *entry = addr; char *name = (char *)(entry + 1); - ctx->pos = entry->offset; - if (!dir_emit(ctx, name, entry->name_len, entry->ino, - entry->type)) + ctx->pos = get_unaligned(&entry->offset); + if (!dir_emit(ctx, name, get_unaligned(&entry->name_len), + get_unaligned(&entry->ino), + get_unaligned(&entry->type))) return 1; - addr += sizeof(struct dir_entry) + entry->name_len; + addr += sizeof(struct dir_entry) + + get_unaligned(&entry->name_len); ctx->pos++; } return 0; @@ -6012,14 +6009,15 @@ again: } entry = addr; - entry->name_len = name_len; + put_unaligned(name_len, &entry->name_len); name_ptr = (char *)(entry + 1); read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), name_len); - entry->type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; + put_unaligned(btrfs_filetype_table[btrfs_dir_type(leaf, di)], + &entry->type); btrfs_dir_item_key_to_cpu(leaf, di, &location); - entry->ino = location.objectid; - entry->offset = found_key.offset; + put_unaligned(location.objectid, &entry->ino); + put_unaligned(found_key.offset, &entry->offset); entries++; addr += sizeof(struct dir_entry) + name_len; total_len += sizeof(struct dir_entry) + name_len; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b2db398..632e26d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/kernel.h> diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 621083f..e4faefa 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ + #include <linux/sched.h> #include <linux/pagemap.h> #include <linux/spinlock.h> diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index c44a9d5..29135de 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_LOCKING_ -#define __BTRFS_LOCKING_ +#ifndef BTRFS_LOCKING_H +#define BTRFS_LOCKING_H #define BTRFS_WRITE_LOCK 1 #define BTRFS_READ_LOCK 2 diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 1c7f7f7..0667ea0 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/kernel.h> diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h index 1b10a3c..75246f2 100644 --- a/fs/btrfs/math.h +++ b/fs/btrfs/math.h @@ -1,25 +1,11 @@ - +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2012 Fujitsu. All rights reserved. * Written by Miao Xie <miaox@cn.fujitsu.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_MATH_H -#define __BTRFS_MATH_H +#ifndef BTRFS_MATH_H +#define BTRFS_MATH_H #include <asm/div64.h> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 661cc3d..6db8bb2 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/slab.h> diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4a1672a..3be443f 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_ORDERED_DATA__ -#define __BTRFS_ORDERED_DATA__ +#ifndef BTRFS_ORDERED_DATA_H +#define BTRFS_ORDERED_DATA_H /* one of these per inode */ struct btrfs_ordered_inode_tree { @@ -218,4 +205,5 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); int __init ordered_data_init(void); void __cold ordered_data_exit(void); + #endif diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 47767d5..aa53410 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Red Hat. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include "ctree.h" diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 4a87704..21a831d 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include "ctree.h" @@ -202,9 +189,10 @@ void btrfs_print_leaf(struct extent_buffer *l) fs_info = l->fs_info; nr = btrfs_header_nritems(l); - btrfs_info(fs_info, "leaf %llu total ptrs %d free space %d", - btrfs_header_bytenr(l), nr, - btrfs_leaf_free_space(fs_info, l)); + btrfs_info(fs_info, + "leaf %llu gen %llu total ptrs %d free space %d owner %llu", + btrfs_header_bytenr(l), btrfs_header_generation(l), nr, + btrfs_leaf_free_space(fs_info, l), btrfs_header_owner(l)); for (i = 0 ; i < nr ; i++) { item = btrfs_item_nr(i); btrfs_item_key_to_cpu(l, &key, i); @@ -338,7 +326,7 @@ void btrfs_print_leaf(struct extent_buffer *l) } } -void btrfs_print_tree(struct extent_buffer *c) +void btrfs_print_tree(struct extent_buffer *c, bool follow) { struct btrfs_fs_info *fs_info; int i; u32 nr; @@ -355,15 +343,19 @@ void btrfs_print_tree(struct extent_buffer *c) return; } btrfs_info(fs_info, - "node %llu level %d total ptrs %d free spc %u", - btrfs_header_bytenr(c), level, nr, - (u32)BTRFS_NODEPTRS_PER_BLOCK(fs_info) - nr); + "node %llu level %d gen %llu total ptrs %d free spc %u owner %llu", + btrfs_header_bytenr(c), level, btrfs_header_generation(c), + nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(fs_info) - nr, + btrfs_header_owner(c)); for (i = 0; i < nr; i++) { btrfs_node_key_to_cpu(c, &key, i); - pr_info("\tkey %d (%llu %u %llu) block %llu\n", + pr_info("\tkey %d (%llu %u %llu) block %llu gen %llu\n", i, key.objectid, key.type, key.offset, - btrfs_node_blockptr(c, i)); + btrfs_node_blockptr(c, i), + btrfs_node_ptr_generation(c, i)); } + if (!follow) + return; for (i = 0; i < nr; i++) { struct btrfs_key first_key; struct extent_buffer *next; @@ -385,7 +377,7 @@ void btrfs_print_tree(struct extent_buffer *c) if (btrfs_header_level(next) != level - 1) BUG(); - btrfs_print_tree(next); + btrfs_print_tree(next, follow); free_extent_buffer(next); } } diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index 3afd508..e6bb38f 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -1,23 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __PRINT_TREE_ -#define __PRINT_TREE_ +#ifndef BTRFS_PRINT_TREE_H +#define BTRFS_PRINT_TREE_H + void btrfs_print_leaf(struct extent_buffer *l); -void btrfs_print_tree(struct extent_buffer *c); +void btrfs_print_tree(struct extent_buffer *c, bool follow); + #endif diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 5859f7d..dc61400 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/hashtable.h> @@ -393,6 +380,7 @@ static int prop_compression_apply(struct inode *inode, const char *value, size_t len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int type; if (len == 0) { @@ -403,14 +391,17 @@ static int prop_compression_apply(struct inode *inode, return 0; } - if (!strncmp("lzo", value, 3)) + if (!strncmp("lzo", value, 3)) { type = BTRFS_COMPRESS_LZO; - else if (!strncmp("zlib", value, 4)) + btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); + } else if (!strncmp("zlib", value, 4)) { type = BTRFS_COMPRESS_ZLIB; - else if (!strncmp("zstd", value, len)) + } else if (!strncmp("zstd", value, len)) { type = BTRFS_COMPRESS_ZSTD; - else + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); + } else { return -EINVAL; + } BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 100f188..618815b 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_PROPS_H -#define __BTRFS_PROPS_H +#ifndef BTRFS_PROPS_H +#define BTRFS_PROPS_H #include "ctree.h" diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f583f13..9fb758d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2011 STRATO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> @@ -24,6 +11,7 @@ #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/btrfs.h> +#include <linux/sizes.h> #include "ctree.h" #include "transaction.h" @@ -2388,8 +2376,21 @@ out: return ret; } -static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) +/* + * Two limits to commit transaction in advance. + * + * For RATIO, it will be 1/RATIO of the remaining limit + * (excluding data and prealloc meta) as threshold. + * For SIZE, it will be in byte unit as threshold. + */ +#define QGROUP_PERTRANS_RATIO 32 +#define QGROUP_PERTRANS_SIZE SZ_32M +static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, + const struct btrfs_qgroup *qg, u64 num_bytes) { + u64 limit; + u64 threshold; + if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) return false; @@ -2398,6 +2399,31 @@ static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) return false; + /* + * Even if we passed the check, it's better to check if reservation + * for meta_pertrans is pushing us near limit. + * If there is too much pertrans reservation or it's near the limit, + * let's try commit transaction to free some, using transaction_kthread + */ + if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER | + BTRFS_QGROUP_LIMIT_MAX_EXCL))) { + if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) + limit = qg->max_excl; + else + limit = qg->max_rfer; + threshold = (limit - qg->rsv.values[BTRFS_QGROUP_RSV_DATA] - + qg->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC]) / + QGROUP_PERTRANS_RATIO; + threshold = min_t(u64, threshold, QGROUP_PERTRANS_SIZE); + + /* + * Use transaction_kthread to commit transaction, so we no + * longer need to bother nested transaction nor lock context. + */ + if (qg->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > threshold) + btrfs_commit_transaction_locksafe(fs_info); + } + return true; } @@ -2447,7 +2473,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, qg = unode_aux_to_qgroup(unode); - if (enforce && !qgroup_check_limits(qg, num_bytes)) { + if (enforce && !qgroup_check_limits(fs_info, qg, num_bytes)) { ret = -EDQUOT; goto out; } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index e63e2d4..d60dd06 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2014 Facebook. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_QGROUP__ -#define __BTRFS_QGROUP__ +#ifndef BTRFS_QGROUP_H +#define BTRFS_QGROUP_H #include "ulist.h" #include "delayed-ref.h" @@ -341,4 +328,5 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root); void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes); void btrfs_qgroup_check_reserved_leak(struct inode *inode); -#endif /* __BTRFS_QGROUP__ */ + +#endif diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index c3a2bc8..9abd950 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1,21 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2012 Fusion-io All rights reserved. * Copyright (C) 2012 Intel Corp. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ + #include <linux/sched.h> #include <linux/wait.h> #include <linux/bio.h> diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 4ee4fe3..f5d4c13 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -1,24 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2012 Fusion-io All rights reserved. * Copyright (C) 2012 Intel Corp. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_RAID56__ -#define __BTRFS_RAID56__ +#ifndef BTRFS_RAID56_H +#define BTRFS_RAID56_H + static inline int nr_parity_stripes(struct map_lookup *map) { if (map->type & BTRFS_BLOCK_GROUP_RAID5) @@ -65,4 +53,5 @@ void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); + #endif diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h index 9e111e4..a97dc74 100644 --- a/fs/btrfs/rcu-string.h +++ b/fs/btrfs/rcu-string.h @@ -1,21 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2012 Red Hat. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ +#ifndef BTRFS_RCU_STRING_H +#define BTRFS_RCU_STRING_H + struct rcu_string { struct rcu_head rcu; char str[0]; @@ -54,3 +44,5 @@ static inline void rcu_string_free(struct rcu_string *str) struct rcu_string *__str = rcu_dereference(rcu_str); \ __str->str; \ }) + +#endif diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index a52dd12..40f1bce 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2011 STRATO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 35fab67..e5b9e59 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2014 Facebook. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h index 3bf02ce..b7d2a4e 100644 --- a/fs/btrfs/ref-verify.h +++ b/fs/btrfs/ref-verify.h @@ -1,22 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2014 Facebook. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __REF_VERIFY__ -#define __REF_VERIFY__ + +#ifndef BTRFS_REF_VERIFY_H +#define BTRFS_REF_VERIFY_H #ifdef CONFIG_BTRFS_FS_REF_VERIFY int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info); @@ -59,4 +47,5 @@ static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info) } #endif /* CONFIG_BTRFS_FS_REF_VERIFY */ -#endif /* _REF_VERIFY__ */ + +#endif diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4874c09..b041b94 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2009 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> @@ -1854,7 +1841,7 @@ again: old_bytenr = btrfs_node_blockptr(parent, slot); blocksize = fs_info->nodesize; old_ptr_gen = btrfs_node_ptr_generation(parent, slot); - btrfs_node_key_to_cpu(parent, &key, slot); + btrfs_node_key_to_cpu(parent, &first_key, slot); if (level <= max_level) { eb = path->nodes[level]; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index aab0194..6db3bda 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/err.h> diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 1a2066a..52b39a0 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2011, 2012 STRATO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/blkdev.h> diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 1f5748c..c0074d2 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2012 Alexander Block. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/bsearch.h> @@ -5249,6 +5236,10 @@ static int send_write_or_clone(struct send_ctx *sctx, len = btrfs_file_extent_num_bytes(path->nodes[0], ei); } + if (offset >= sctx->cur_inode_size) { + ret = 0; + goto out; + } if (offset + len > sctx->cur_inode_size) len = sctx->cur_inode_size - offset; if (len == 0) { diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 3aa4bc5..ead397f 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -1,22 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2012 Alexander Block. All rights reserved. * Copyright (C) 2012 STRATO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ +#ifndef BTRFS_SEND_H +#define BTRFS_SEND_H + #include "ctree.h" #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" @@ -132,3 +122,5 @@ enum { #ifdef __KERNEL__ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg); #endif + +#endif diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index 5e2b92d..b7b4acb 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/highmem.h> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 170baef..0628092 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/blkdev.h> diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index ca06747..4848a43 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 80457f3..b567560 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BTRFS_SYSFS_H_ -#define _BTRFS_SYSFS_H_ + +#ifndef BTRFS_SYSFS_H +#define BTRFS_SYSFS_H /* * Data exported through sysfs @@ -90,4 +91,4 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, u64 bit, enum btrfs_feature_set set); -#endif /* _BTRFS_SYSFS_H_ */ +#endif diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index e742781..30ed438 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013 Fusion IO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index bc0615b..a5a0b95 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2013 Fusion IO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_TESTS -#define __BTRFS_TESTS +#ifndef BTRFS_TESTS_H +#define BTRFS_TESTS_H #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_run_sanity_tests(void); diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index b9142c6..31e8a9e 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013 Fusion IO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/slab.h> diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 2e7f64a..76aa5a6 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013 Fusion IO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/pagemap.h> diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index c23bd00..79e0a5f 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2017 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/types.h> diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index eca6412..d3c9f8a 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013 Fusion IO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/slab.h> diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 8444a01..e1f9666 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015 Facebook. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/types.h> diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 13420cd..e0ba799 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013 Fusion IO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/types.h> diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 160eb2f..39b9578 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013 Facebook. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/types.h> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5c4cf0f..c944b47 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/fs.h> @@ -2280,6 +2267,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ cur_trans->state = TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); + clear_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags); spin_lock(&fs_info->trans_lock); list_del_init(&cur_trans->list); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index b6c94ce..d8c0826 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_TRANSACTION__ -#define __BTRFS_TRANSACTION__ +#ifndef BTRFS_TRANSACTION_H +#define BTRFS_TRANSACTION_H #include <linux/refcount.h> #include "btrfs_inode.h" @@ -212,6 +199,20 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, int wait_for_unblock); + +/* + * Try to commit transaction asynchronously, so this is safe to call + * even holding a spinlock. + * + * It's done by informing transaction_kthread to commit transaction without + * waiting for commit interval. + */ +static inline void btrfs_commit_transaction_locksafe( + struct btrfs_fs_info *fs_info) +{ + set_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags); + wake_up_process(fs_info->transaction_kthread); +} int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); int btrfs_should_end_transaction(struct btrfs_trans_handle *trans); void btrfs_throttle(struct btrfs_fs_info *fs_info); @@ -228,4 +229,5 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); + #endif diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 8871286..8d40e7d 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1,17 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) Qu Wenruo 2017. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program. */ /* diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index aba5427..ff04327 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -1,21 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) Qu Wenruo 2017. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program. */ -#ifndef __BTRFS_TREE_CHECKER__ -#define __BTRFS_TREE_CHECKER__ +#ifndef BTRFS_TREE_CHECKER_H +#define BTRFS_TREE_CHECKER_H #include "ctree.h" #include "extent_io.h" diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index c09dbe4..3c0987a 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c91babc..8f23a94 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/sched.h> @@ -2352,8 +2339,10 @@ again: nritems = btrfs_header_nritems(path->nodes[0]); if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(root, path); - if (ret) + if (ret == 1) break; + else if (ret < 0) + goto out; } btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); @@ -2456,13 +2445,41 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, if (ret) break; - /* for regular files, make sure corresponding - * orphan item exist. extents past the new EOF - * will be truncated later by orphan cleanup. + /* + * Before replaying extents, truncate the inode to its + * size. We need to do it now and not after log replay + * because before an fsync we can have prealloc extents + * added beyond the inode's i_size. If we did it after, + * through orphan cleanup for example, we would drop + * those prealloc extents just after replaying them. */ if (S_ISREG(mode)) { - ret = insert_orphan_item(wc->trans, root, - key.objectid); + struct inode *inode; + u64 from; + + inode = read_one_inode(root, key.objectid); + if (!inode) { + ret = -EIO; + break; + } + from = ALIGN(i_size_read(inode), + root->fs_info->sectorsize); + ret = btrfs_drop_extents(wc->trans, root, inode, + from, (u64)-1, 1); + /* + * If the nlink count is zero here, the iput + * will free the inode. We bump it to make + * sure it doesn't get freed until the link + * count fixup is done. + */ + if (!ret) { + if (inode->i_nlink == 0) + inc_nlink(inode); + /* Update link count and nbytes. */ + ret = btrfs_update_inode(wc->trans, + root, inode); + } + iput(inode); if (ret) break; } @@ -3520,8 +3537,11 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * from this directory and from this transaction */ ret = btrfs_next_leaf(root, path); - if (ret == 1) { - last_offset = (u64)-1; + if (ret) { + if (ret == 1) + last_offset = (u64)-1; + else + err = ret; goto done; } btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); @@ -4300,6 +4320,110 @@ static int log_one_extent(struct btrfs_trans_handle *trans, return ret; } +/* + * Log all prealloc extents beyond the inode's i_size to make sure we do not + * lose them after doing a fast fsync and replaying the log. We scan the + * subvolume's root instead of iterating the inode's extent map tree because + * otherwise we can log incorrect extent items based on extent map conversion. + * That can happen due to the fact that extent maps are merged when they + * are not in the extent map tree's list of modified extents. + */ +static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path) +{ + struct btrfs_root *root = inode->root; + struct btrfs_key key; + const u64 i_size = i_size_read(&inode->vfs_inode); + const u64 ino = btrfs_ino(inode); + struct btrfs_path *dst_path = NULL; + u64 last_extent = (u64)-1; + int ins_nr = 0; + int start_slot; + int ret; + + if (!(inode->flags & BTRFS_INODE_PREALLOC)) + return 0; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = i_size; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(leaf)) { + if (ins_nr > 0) { + ret = copy_items(trans, inode, dst_path, path, + &last_extent, start_slot, + ins_nr, 1, 0); + if (ret < 0) + goto out; + ins_nr = 0; + } + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + break; + } + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid > ino) + break; + if (WARN_ON_ONCE(key.objectid < ino) || + key.type < BTRFS_EXTENT_DATA_KEY || + key.offset < i_size) { + path->slots[0]++; + continue; + } + if (last_extent == (u64)-1) { + last_extent = key.offset; + /* + * Avoid logging extent items logged in past fsync calls + * and leading to duplicate keys in the log tree. + */ + do { + ret = btrfs_truncate_inode_items(trans, + root->log_root, + &inode->vfs_inode, + i_size, + BTRFS_EXTENT_DATA_KEY); + } while (ret == -EAGAIN); + if (ret) + goto out; + } + if (ins_nr == 0) + start_slot = slot; + ins_nr++; + path->slots[0]++; + if (!dst_path) { + dst_path = btrfs_alloc_path(); + if (!dst_path) { + ret = -ENOMEM; + goto out; + } + } + } + if (ins_nr > 0) { + ret = copy_items(trans, inode, dst_path, path, &last_extent, + start_slot, ins_nr, 1, 0); + if (ret > 0) + ret = 0; + } +out: + btrfs_release_path(path); + btrfs_free_path(dst_path); + return ret; +} + static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, @@ -4342,6 +4466,11 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, if (em->generation <= test_gen) continue; + /* We log prealloc extents beyond eof later. */ + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && + em->start >= i_size_read(&inode->vfs_inode)) + continue; + if (em->start < logged_start) logged_start = em->start; if ((em->start + em->len - 1) > logged_end) @@ -4398,6 +4527,9 @@ process: up_write(&inode->dio_sem); btrfs_release_path(path); + if (!ret) + ret = btrfs_log_prealloc_extents(trans, inode, path); + return ret; } @@ -4782,6 +4914,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct extent_map_tree *em_tree = &inode->extent_tree; u64 logged_isize = 0; bool need_log_inode_item = true; + bool xattrs_logged = false; path = btrfs_alloc_path(); if (!path) @@ -5083,6 +5216,7 @@ next_key: err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); if (err) goto out_unlock; + xattrs_logged = true; if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); @@ -5095,6 +5229,11 @@ log_extents: btrfs_release_path(dst_path); if (need_log_inode_item) { err = log_inode_item(trans, log, dst_path, inode); + if (!err && !xattrs_logged) { + err = btrfs_log_all_xattrs(trans, root, inode, path, + dst_path); + btrfs_release_path(path); + } if (err) goto out_unlock; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 88abc43..122e68b 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __TREE_LOG_ -#define __TREE_LOG_ +#ifndef BTRFS_TREE_LOG_H +#define BTRFS_TREE_LOG_H #include "ctree.h" #include "transaction.h" @@ -87,4 +74,5 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, int btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, struct dentry *parent); + #endif diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index d8edf16..3374c9e 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -1,7 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2011 STRATO AG * written by Arne Jansen <sensille@gmx.net> - * Distributed under the GNU GPL license version 2. */ #include <linux/slab.h> diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 53c9136..02fda0a 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -1,12 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2011 STRATO AG * written by Arne Jansen <sensille@gmx.net> - * Distributed under the GNU GPL license version 2. - * */ -#ifndef __ULIST__ -#define __ULIST__ +#ifndef BTRFS_ULIST_H +#define BTRFS_ULIST_H #include <linux/list.h> #include <linux/rbtree.h> diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 9916f03..1ba7ca2 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) STRATO AG 2013. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ + #include <linux/uuid.h> #include <asm/unaligned.h> #include "ctree.h" diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 93f8f17..be3fc70 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ + #include <linux/sched.h> #include <linux/bio.h> #include <linux/slab.h> @@ -4064,6 +4052,15 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) return 0; } + /* + * A ro->rw remount sequence should continue with the paused balance + * regardless of who pauses it, system or the user as of now, so set + * the resume flag. + */ + spin_lock(&fs_info->balance_lock); + fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; + spin_unlock(&fs_info->balance_lock); + tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); return PTR_ERR_OR_ZERO(tsk); } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d1fcaea..7909688 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __BTRFS_VOLUMES_ -#define __BTRFS_VOLUMES_ +#ifndef BTRFS_VOLUMES_H +#define BTRFS_VOLUMES_H #include <linux/bio.h> #include <linux/sort.h> diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index e1e8177..ea78c3d 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Red Hat. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/init.h> @@ -32,7 +19,6 @@ #include "props.h" #include "locking.h" - int btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) { diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index e215a32..471fcac 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Red Hat. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ -#ifndef __XATTR__ -#define __XATTR__ +#ifndef BTRFS_XATTR_H +#define BTRFS_XATTR_H #include <linux/xattr.h> @@ -34,4 +21,4 @@ int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, const struct qstr *qstr); -#endif /* __XATTR__ */ +#endif diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 2b52950..970ff3e 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Oracle. All rights reserved. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * * Based on jffs2 zlib code: * Copyright © 2001-2007 Red Hat, Inc. * Created by David Woodhouse <dwmw2@infradead.org> diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 01a4eab6..af6ec59 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -1,16 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2016-present, Facebook, Inc. * All rights reserved. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ + #include <linux/bio.h> #include <linux/err.h> #include <linux/init.h> diff --git a/fs/buffer.c b/fs/buffer.c index f349107..249b83f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -494,35 +494,12 @@ repeat: return err; } -static void do_thaw_one(struct super_block *sb, void *unused) +void emergency_thaw_bdev(struct super_block *sb) { while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev); } -static void do_thaw_all(struct work_struct *work) -{ - iterate_supers(do_thaw_one, NULL); - kfree(work); - printk(KERN_WARNING "Emergency Thaw complete\n"); -} - -/** - * emergency_thaw_all -- forcibly thaw every frozen filesystem - * - * Used for emergency unfreeze of all filesystems via SysRq - */ -void emergency_thaw_all(void) -{ - struct work_struct *work; - - work = kmalloc(sizeof(*work), GFP_ATOMIC); - if (work) { - INIT_WORK(work, do_thaw_all); - schedule_work(work); - } -} - /** * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written diff --git a/fs/ceph/file.c b/fs/ceph/file.c index f85040d..cf0e45b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -70,69 +70,104 @@ static __le32 ceph_flags_sys2wire(u32 flags) */ /* - * Calculate the length sum of direct io vectors that can - * be combined into one page vector. + * How many pages to get in one call to iov_iter_get_pages(). This + * determines the size of the on-stack array used as a buffer. */ -static size_t dio_get_pagev_size(const struct iov_iter *it) +#define ITER_GET_BVECS_PAGES 64 + +static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize, + struct bio_vec *bvecs) { - const struct iovec *iov = it->iov; - const struct iovec *iovend = iov + it->nr_segs; - size_t size; - - size = iov->iov_len - it->iov_offset; - /* - * An iov can be page vectored when both the current tail - * and the next base are page aligned. - */ - while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) && - (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) { - size += iov->iov_len; - } - dout("dio_get_pagevlen len = %zu\n", size); - return size; + size_t size = 0; + int bvec_idx = 0; + + if (maxsize > iov_iter_count(iter)) + maxsize = iov_iter_count(iter); + + while (size < maxsize) { + struct page *pages[ITER_GET_BVECS_PAGES]; + ssize_t bytes; + size_t start; + int idx = 0; + + bytes = iov_iter_get_pages(iter, pages, maxsize - size, + ITER_GET_BVECS_PAGES, &start); + if (bytes < 0) + return size ?: bytes; + + iov_iter_advance(iter, bytes); + size += bytes; + + for ( ; bytes; idx++, bvec_idx++) { + struct bio_vec bv = { + .bv_page = pages[idx], + .bv_len = min_t(int, bytes, PAGE_SIZE - start), + .bv_offset = start, + }; + + bvecs[bvec_idx] = bv; + bytes -= bv.bv_len; + start = 0; + } + } + + return size; } /* - * Allocate a page vector based on (@it, @nbytes). - * The return value is the tuple describing a page vector, - * that is (@pages, @page_align, @num_pages). + * iov_iter_get_pages() only considers one iov_iter segment, no matter + * what maxsize or maxpages are given. For ITER_BVEC that is a single + * page. + * + * Attempt to get up to @maxsize bytes worth of pages from @iter. + * Return the number of bytes in the created bio_vec array, or an error. */ -static struct page ** -dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes, - size_t *page_align, int *num_pages) +static ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize, + struct bio_vec **bvecs, int *num_bvecs) { - struct iov_iter tmp_it = *it; - size_t align; - struct page **pages; - int ret = 0, idx, npages; + struct bio_vec *bv; + size_t orig_count = iov_iter_count(iter); + ssize_t bytes; + int npages; - align = (unsigned long)(it->iov->iov_base + it->iov_offset) & - (PAGE_SIZE - 1); - npages = calc_pages_for(align, nbytes); - pages = kvmalloc(sizeof(*pages) * npages, GFP_KERNEL); - if (!pages) - return ERR_PTR(-ENOMEM); + iov_iter_truncate(iter, maxsize); + npages = iov_iter_npages(iter, INT_MAX); + iov_iter_reexpand(iter, orig_count); - for (idx = 0; idx < npages; ) { - size_t start; - ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes, - npages - idx, &start); - if (ret < 0) - goto fail; + /* + * __iter_get_bvecs() may populate only part of the array -- zero it + * out. + */ + bv = kvmalloc_array(npages, sizeof(*bv), GFP_KERNEL | __GFP_ZERO); + if (!bv) + return -ENOMEM; - iov_iter_advance(&tmp_it, ret); - nbytes -= ret; - idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE; + bytes = __iter_get_bvecs(iter, maxsize, bv); + if (bytes < 0) { + /* + * No pages were pinned -- just free the array. + */ + kvfree(bv); + return bytes; } - BUG_ON(nbytes != 0); - *num_pages = npages; - *page_align = align; - dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align); - return pages; -fail: - ceph_put_page_vector(pages, idx, false); - return ERR_PTR(ret); + *bvecs = bv; + *num_bvecs = npages; + return bytes; +} + +static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty) +{ + int i; + + for (i = 0; i < num_bvecs; i++) { + if (bvecs[i].bv_page) { + if (should_dirty) + set_page_dirty_lock(bvecs[i].bv_page); + put_page(bvecs[i].bv_page); + } + } + kvfree(bvecs); } /* @@ -746,11 +781,12 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) struct inode *inode = req->r_inode; struct ceph_aio_request *aio_req = req->r_priv; struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); - int num_pages = calc_pages_for((u64)osd_data->alignment, - osd_data->length); - dout("ceph_aio_complete_req %p rc %d bytes %llu\n", - inode, rc, osd_data->length); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS); + BUG_ON(!osd_data->num_bvecs); + + dout("ceph_aio_complete_req %p rc %d bytes %u\n", + inode, rc, osd_data->bvec_pos.iter.bi_size); if (rc == -EOLDSNAPC) { struct ceph_aio_work *aio_work; @@ -768,9 +804,10 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) } else if (!aio_req->write) { if (rc == -ENOENT) rc = 0; - if (rc >= 0 && osd_data->length > rc) { - int zoff = osd_data->alignment + rc; - int zlen = osd_data->length - rc; + if (rc >= 0 && osd_data->bvec_pos.iter.bi_size > rc) { + struct iov_iter i; + int zlen = osd_data->bvec_pos.iter.bi_size - rc; + /* * If read is satisfied by single OSD request, * it can pass EOF. Otherwise read is within @@ -785,13 +822,16 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) aio_req->total_len = rc + zlen; } - if (zlen > 0) - ceph_zero_page_vector_range(zoff, zlen, - osd_data->pages); + iov_iter_bvec(&i, ITER_BVEC, osd_data->bvec_pos.bvecs, + osd_data->num_bvecs, + osd_data->bvec_pos.iter.bi_size); + iov_iter_advance(&i, rc); + iov_iter_zero(zlen, &i); } } - ceph_put_page_vector(osd_data->pages, num_pages, aio_req->should_dirty); + put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs, + aio_req->should_dirty); ceph_osdc_put_request(req); if (rc < 0) @@ -879,7 +919,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_vino vino; struct ceph_osd_request *req; - struct page **pages; + struct bio_vec *bvecs; struct ceph_aio_request *aio_req = NULL; int num_pages = 0; int flags; @@ -914,10 +954,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, } while (iov_iter_count(iter) > 0) { - u64 size = dio_get_pagev_size(iter); - size_t start = 0; + u64 size = iov_iter_count(iter); ssize_t len; + if (write) + size = min_t(u64, size, fsc->mount_options->wsize); + else + size = min_t(u64, size, fsc->mount_options->rsize); + vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, pos, &size, 0, @@ -933,18 +977,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, break; } - if (write) - size = min_t(u64, size, fsc->mount_options->wsize); - else - size = min_t(u64, size, fsc->mount_options->rsize); - - len = size; - pages = dio_get_pages_alloc(iter, len, &start, &num_pages); - if (IS_ERR(pages)) { + len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages); + if (len < 0) { ceph_osdc_put_request(req); - ret = PTR_ERR(pages); + ret = len; break; } + if (len != size) + osd_req_op_extent_update(req, 0, len); /* * To simplify error handling, allow AIO when IO within i_size @@ -977,8 +1017,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, req->r_mtime = mtime; } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, - false, false); + osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); if (aio_req) { aio_req->total_len += len; @@ -991,7 +1030,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs); pos += len; - iov_iter_advance(iter, len); continue; } @@ -1004,25 +1042,26 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (ret == -ENOENT) ret = 0; if (ret >= 0 && ret < len && pos + ret < size) { + struct iov_iter i; int zlen = min_t(size_t, len - ret, size - pos - ret); - ceph_zero_page_vector_range(start + ret, zlen, - pages); + + iov_iter_bvec(&i, ITER_BVEC, bvecs, num_pages, + len); + iov_iter_advance(&i, ret); + iov_iter_zero(zlen, &i); ret += zlen; } if (ret >= 0) len = ret; } - ceph_put_page_vector(pages, num_pages, should_dirty); - + put_bvecs(bvecs, num_pages, should_dirty); ceph_osdc_put_request(req); if (ret < 0) break; pos += len; - iov_iter_advance(iter, len); - if (!write && pos >= size) break; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8bf6025..ae05692 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -669,13 +669,15 @@ void ceph_fill_file_time(struct inode *inode, int issued, CEPH_CAP_FILE_BUFFER| CEPH_CAP_AUTH_EXCL| CEPH_CAP_XATTR_EXCL)) { - if (timespec_compare(ctime, &inode->i_ctime) > 0) { + if (ci->i_version == 0 || + timespec_compare(ctime, &inode->i_ctime) > 0) { dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, ctime->tv_sec, ctime->tv_nsec); inode->i_ctime = *ctime; } - if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { + if (ci->i_version == 0 || + ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { /* the MDS did a utimes() */ dout("mtime %ld.%09ld -> %ld.%09ld " "tw %d -> %d\n", @@ -795,7 +797,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page, new_issued = ~issued & le32_to_cpu(info->cap.caps); /* update inode */ - ci->i_version = le64_to_cpu(info->version); inode->i_rdev = le32_to_cpu(info->rdev); inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; @@ -868,6 +869,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page, xattr_blob = NULL; } + /* finally update i_version */ + ci->i_version = le64_to_cpu(info->version); + inode->i_mapping->a_ops = &ceph_aops; switch (inode->i_mode & S_IFMT) { diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 7e72348..315f7e6 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -228,7 +228,15 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci) { - return (ci->i_max_files || ci->i_max_bytes); + bool ret = false; + spin_lock(&ci->i_ceph_lock); + if ((ci->i_max_files || ci->i_max_bytes) && + ci->i_vino.snap == CEPH_NOSNAP && + ci->i_snap_realm && + ci->i_snap_realm->ino == ci->i_vino.ino) + ret = true; + spin_unlock(&ci->i_ceph_lock); + return ret; } static size_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val, @@ -1008,14 +1016,19 @@ int __ceph_setxattr(struct inode *inode, const char *name, char *newval = NULL; struct ceph_inode_xattr *xattr = NULL; int required_blob_size; + bool check_realm = false; bool lock_snap_rwsem = false; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; vxattr = ceph_match_vxattr(inode, name); - if (vxattr && vxattr->readonly) - return -EOPNOTSUPP; + if (vxattr) { + if (vxattr->readonly) + return -EOPNOTSUPP; + if (value && !strncmp(vxattr->name, "ceph.quota", 10)) + check_realm = true; + } /* pass any unhandled ceph.* xattrs through to the MDS */ if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) @@ -1109,6 +1122,15 @@ do_sync_unlocked: err = -EBUSY; } else { err = ceph_sync_setxattr(inode, name, value, size, flags); + if (err >= 0 && check_realm) { + /* check if snaprealm was created for quota inode */ + spin_lock(&ci->i_ceph_lock); + if ((ci->i_max_files || ci->i_max_bytes) && + !(ci->i_snap_realm && + ci->i_snap_realm->ino == ci->i_vino.ino)) + err = -EOPNOTSUPP; + spin_unlock(&ci->i_ceph_lock); + } } out: ceph_free_cap_flush(prealloc_cf); diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 741749a..5f132d5 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -197,7 +197,7 @@ config CIFS_SMB311 config CIFS_SMB_DIRECT bool "SMB Direct support (Experimental)" - depends on CIFS=m && INFINIBAND || CIFS=y && INFINIBAND=y + depends on CIFS=m && INFINIBAND && INFINIBAND_ADDR_TRANS || CIFS=y && INFINIBAND=y && INFINIBAND_ADDR_TRANS=y help Enables SMB Direct experimental support for SMB 3.0, 3.02 and 3.1.1. SMB Direct allows transferring SMB packets over RDMA. If unsure, diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index e35e711..9d69ea4 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -42,23 +42,6 @@ cifs_dump_mem(char *label, void *data, int length) data, length, true); } -#ifdef CONFIG_CIFS_DEBUG -void cifs_vfs_err(const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - pr_err_ratelimited("CIFS VFS: %pV", &vaf); - - va_end(args); -} -#endif - void cifs_dump_detail(void *buf) { #ifdef CONFIG_CIFS_DEBUG2 diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index c611ca2..0e74690 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -39,6 +39,7 @@ extern int cifsFYI; #else #define NOISY 0 #endif +#define ONCE 8 /* * debug ON @@ -46,19 +47,28 @@ extern int cifsFYI; */ #ifdef CONFIG_CIFS_DEBUG -__printf(1, 2) void cifs_vfs_err(const char *fmt, ...); - /* information message: e.g., configuration, major event */ -#define cifs_dbg(type, fmt, ...) \ -do { \ - if (type == FYI && cifsFYI & CIFS_INFO) { \ - pr_debug_ratelimited("%s: " \ - fmt, __FILE__, ##__VA_ARGS__); \ - } else if (type == VFS) { \ - cifs_vfs_err(fmt, ##__VA_ARGS__); \ - } else if (type == NOISY && type != 0) { \ - pr_debug_ratelimited(fmt, ##__VA_ARGS__); \ - } \ +#define cifs_dbg_func(ratefunc, type, fmt, ...) \ +do { \ + if ((type) & FYI && cifsFYI & CIFS_INFO) { \ + pr_debug_ ## ratefunc("%s: " \ + fmt, __FILE__, ##__VA_ARGS__); \ + } else if ((type) & VFS) { \ + pr_err_ ## ratefunc("CIFS VFS: " \ + fmt, ##__VA_ARGS__); \ + } else if ((type) & NOISY && (NOISY != 0)) { \ + pr_debug_ ## ratefunc(fmt, ##__VA_ARGS__); \ + } \ +} while (0) + +#define cifs_dbg(type, fmt, ...) \ +do { \ + if ((type) & ONCE) \ + cifs_dbg_func(once, \ + type, fmt, ##__VA_ARGS__); \ + else \ + cifs_dbg_func(ratelimited, \ + type, fmt, ##__VA_ARGS__); \ } while (0) /* diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index f715609..5a5a015 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1047,6 +1047,18 @@ out: return rc; } +/* + * Directory operations under CIFS/SMB2/SMB3 are synchronous, so fsync() + * is a dummy operation. + */ +static int cifs_dir_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + cifs_dbg(FYI, "Sync directory - name: %pD datasync: 0x%x\n", + file, datasync); + + return 0; +} + static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, size_t len, unsigned int flags) @@ -1181,6 +1193,7 @@ const struct file_operations cifs_dir_ops = { .copy_file_range = cifs_copy_file_range, .clone_file_range = cifs_clone_file_range, .llseek = generic_file_llseek, + .fsync = cifs_dir_fsync, }; static void diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 2282562..cb950a5 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -665,6 +665,8 @@ struct TCP_Server_Info { struct delayed_work echo; /* echo ping workqueue job */ char *smallbuf; /* pointer to current "small" buffer */ char *bigbuf; /* pointer to current "big" buffer */ + /* Total size of this PDU. Only valid from cifs_demultiplex_thread */ + unsigned int pdu_size; unsigned int total_read; /* total amount of data read in this pass */ #ifdef CONFIG_CIFS_FSCACHE struct fscache_cookie *fscache; /* client index cache cookie */ @@ -676,6 +678,7 @@ struct TCP_Server_Info { unsigned int max_read; unsigned int max_write; #ifdef CONFIG_CIFS_SMB311 + __le16 cipher_type; /* save initital negprot hash */ __u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; #endif /* 3.1.1 */ @@ -1373,6 +1376,7 @@ struct mid_q_entry { mid_handle_t *handle; /* call handle mid callback */ void *callback_data; /* general purpose pointer for callback */ void *resp_buf; /* pointer to received SMB header */ + unsigned int resp_buf_size; int mid_state; /* wish this were enum but can not pass to wait_event */ unsigned int mid_flags; __le16 command; /* smb command code */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 59c09a5..1529a08 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -206,8 +206,10 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) mutex_unlock(&ses->session_mutex); cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); - if (rc) + if (rc) { + printk_once(KERN_WARNING "reconnect tcon failed rc = %d\n", rc); goto out; + } atomic_inc(&tconInfoReconnectCount); @@ -453,6 +455,9 @@ cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required) server->sign = true; } + if (cifs_rdma_enabled(server) && server->sign) + cifs_dbg(VFS, "Signing is enabled, and RDMA read/write will be disabled"); + return 0; } @@ -1416,8 +1421,9 @@ openRetry: int cifs_discard_remaining_data(struct TCP_Server_Info *server) { - unsigned int rfclen = get_rfc1002_length(server->smallbuf); - int remaining = rfclen + 4 - server->total_read; + unsigned int rfclen = server->pdu_size; + int remaining = rfclen + server->vals->header_preamble_size - + server->total_read; while (remaining > 0) { int length; @@ -1454,7 +1460,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) unsigned int data_offset, data_len; struct cifs_readdata *rdata = mid->callback_data; char *buf = server->smallbuf; - unsigned int buflen = get_rfc1002_length(buf) + + unsigned int buflen = server->pdu_size + server->vals->header_preamble_size; bool use_rdma_mr = false; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4e0808f..7a10a5d 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -772,7 +772,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) { int length; char *buf = server->smallbuf; - unsigned int pdu_length = get_rfc1002_length(buf); + unsigned int pdu_length = server->pdu_size; /* make sure this will fit in a large buffer */ if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) - @@ -881,6 +881,7 @@ cifs_demultiplex_thread(void *p) * so we can now interpret the length field. */ pdu_length = get_rfc1002_length(buf); + server->pdu_size = pdu_length; cifs_dbg(FYI, "RFC1002 header 0x%x\n", pdu_length); if (!is_smb_response(server, buf[0])) @@ -927,6 +928,7 @@ cifs_demultiplex_thread(void *p) server->lstrp = jiffies; if (mid_entry != NULL) { + mid_entry->resp_buf_size = server->pdu_size; if ((mid_entry->mid_flags & MID_WAIT_CANCELLED) && mid_entry->mid_state == MID_RESPONSE_RECEIVED && server->ops->handle_cancelled_mid) @@ -1975,14 +1977,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } -#ifdef CONFIG_CIFS_SMB_DIRECT - if (vol->rdma && vol->sign) { - cifs_dbg(VFS, "Currently SMB direct doesn't support signing." - " This is being fixed\n"); - goto cifs_parse_mount_err; - } -#endif - #ifndef CONFIG_KEYS /* Muliuser mounts require CONFIG_KEYS support */ if (vol->multiuser) { @@ -2957,6 +2951,22 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) } } + if (volume_info->seal) { + if (ses->server->vals->protocol_id == 0) { + cifs_dbg(VFS, + "SMB3 or later required for encryption\n"); + rc = -EOPNOTSUPP; + goto out_fail; + } else if (tcon->ses->server->capabilities & + SMB2_GLOBAL_CAP_ENCRYPTION) + tcon->seal = true; + else { + cifs_dbg(VFS, "Encryption is not supported on share\n"); + rc = -EOPNOTSUPP; + goto out_fail; + } + } + /* * BB Do we need to wrap session_mutex around this TCon call and Unix * SetFS as we do on SessSetup and reconnect? @@ -3005,22 +3015,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) tcon->use_resilient = true; } - if (volume_info->seal) { - if (ses->server->vals->protocol_id == 0) { - cifs_dbg(VFS, - "SMB3 or later required for encryption\n"); - rc = -EOPNOTSUPP; - goto out_fail; - } else if (tcon->ses->server->capabilities & - SMB2_GLOBAL_CAP_ENCRYPTION) - tcon->seal = true; - else { - cifs_dbg(VFS, "Encryption is not supported on share\n"); - rc = -EOPNOTSUPP; - goto out_fail; - } - } - /* * We can have only one retry value for a connection to a share so for * resources mounted more than once to the same server share the last diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 81ba6e0..9258443 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -684,6 +684,9 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, goto mknod_out; } + if (!S_ISCHR(mode) && !S_ISBLK(mode)) + goto mknod_out; + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) goto mknod_out; @@ -692,10 +695,8 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); if (buf == NULL) { - kfree(full_path); rc = -ENOMEM; - free_xid(xid); - return rc; + goto mknod_out; } if (backup_cred(cifs_sb)) @@ -742,7 +743,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, pdev->minor = cpu_to_le64(MINOR(device_number)); rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, &bytes_written, iov, 1); - } /* else if (S_ISFIFO) */ + } tcon->ses->server->ops->close(xid, tcon, &fid); d_drop(direntry); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 4bcd4e8..23fd430 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3462,7 +3462,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) * If the page is mmap'ed into a process' page tables, then we need to make * sure that it doesn't change while being written back. */ -static int +static vm_fault_t cifs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index f856df4..3c371f7 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -710,7 +710,7 @@ cgfi_exit: /* Simple function to return a 64 bit hash of string. Rarely called */ static __u64 simple_hashstr(const char *str) { - const __u64 hash_mult = 1125899906842597L; /* a big enough prime */ + const __u64 hash_mult = 1125899906842597ULL; /* a big enough prime */ __u64 hash = 0; while (*str) diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 5406e95..68ea849 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -93,6 +93,43 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { /* SMB2_OPLOCK_BREAK */ cpu_to_le16(24) }; +#ifdef CONFIG_CIFS_SMB311 +static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, __u32 non_ctxlen, + size_t hdr_preamble_size) +{ + __u16 neg_count; + __u32 nc_offset, size_of_pad_before_neg_ctxts; + struct smb2_negotiate_rsp *pneg_rsp = (struct smb2_negotiate_rsp *)hdr; + + /* Negotiate contexts are only valid for latest dialect SMB3.11 */ + neg_count = le16_to_cpu(pneg_rsp->NegotiateContextCount); + if ((neg_count == 0) || + (pneg_rsp->DialectRevision != cpu_to_le16(SMB311_PROT_ID))) + return 0; + + /* Make sure that negotiate contexts start after gss security blob */ + nc_offset = le32_to_cpu(pneg_rsp->NegotiateContextOffset); + if (nc_offset < non_ctxlen - hdr_preamble_size /* RFC1001 len */) { + printk_once(KERN_WARNING "invalid negotiate context offset\n"); + return 0; + } + size_of_pad_before_neg_ctxts = nc_offset - + (non_ctxlen - hdr_preamble_size); + + /* Verify that at least minimal negotiate contexts fit within frame */ + if (len < nc_offset + (neg_count * sizeof(struct smb2_neg_context))) { + printk_once(KERN_WARNING "negotiate context goes beyond end\n"); + return 0; + } + + cifs_dbg(FYI, "length of negcontexts %d pad %d\n", + len - nc_offset, size_of_pad_before_neg_ctxts); + + /* length of negcontexts including pad from end of sec blob to them */ + return (len - nc_offset) + size_of_pad_before_neg_ctxts; +} +#endif /* CIFS_SMB311 */ + int smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) { @@ -198,6 +235,11 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) clc_len = smb2_calc_size(hdr); +#ifdef CONFIG_CIFS_SMB311 + if (shdr->Command == SMB2_NEGOTIATE) + clc_len += get_neg_ctxt_len(hdr, len, clc_len, + srvr->vals->header_preamble_size); +#endif /* SMB311 */ if (srvr->vals->header_preamble_size + len != clc_len) { cifs_dbg(FYI, "Calculated size %u length %zu mismatch mid %llu\n", clc_len, srvr->vals->header_preamble_size + len, mid); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 968b1d43..9c6d95f 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -252,9 +252,14 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); #ifdef CONFIG_CIFS_SMB_DIRECT - if (server->rdma) - wsize = min_t(unsigned int, + if (server->rdma) { + if (server->sign) + wsize = min_t(unsigned int, + wsize, server->smbd_conn->max_fragmented_send_size); + else + wsize = min_t(unsigned int, wsize, server->smbd_conn->max_readwrite_size); + } #endif if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); @@ -272,9 +277,14 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); #ifdef CONFIG_CIFS_SMB_DIRECT - if (server->rdma) - rsize = min_t(unsigned int, + if (server->rdma) { + if (server->sign) + rsize = min_t(unsigned int, + rsize, server->smbd_conn->max_fragmented_recv_size); + else + rsize = min_t(unsigned int, rsize, server->smbd_conn->max_readwrite_size); + } #endif if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) @@ -579,9 +589,15 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + /* + * If ea_name is NULL (listxattr) and there are no EAs, return 0 as it's + * not an error. Otherwise, the specified ea_name was not found. + */ if (!rc) rc = move_smb2_ea_to_cifs(ea_data, buf_size, smb2_data, SMB2_MAX_EA_BUF, ea_name); + else if (!ea_name && rc == -ENODATA) + rc = 0; kfree(smb2_data); return rc; @@ -1451,7 +1467,8 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_open_parms oparms; struct cifs_fid fid; - struct smb2_err_rsp *err_buf = NULL; + struct kvec err_iov = {NULL, 0}; + struct smb2_err_rsp *err_buf; struct smb2_symlink_err_rsp *symlink; unsigned int sub_len; unsigned int sub_offset; @@ -1473,15 +1490,16 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_buf); + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_iov); - if (!rc || !err_buf) { + if (!rc || !err_iov.iov_base) { kfree(utf16_path); return -ENOENT; } + err_buf = err_iov.iov_base; if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) || - get_rfc1002_length(err_buf) + server->vals->header_preamble_size < SMB2_SYMLINK_STRUCT_SIZE) { + err_iov.iov_len + server->vals->header_preamble_size < SMB2_SYMLINK_STRUCT_SIZE) { kfree(utf16_path); return -ENOENT; } @@ -1494,13 +1512,13 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, print_len = le16_to_cpu(symlink->PrintNameLength); print_offset = le16_to_cpu(symlink->PrintNameOffset); - if (get_rfc1002_length(err_buf) + server->vals->header_preamble_size < + if (err_iov.iov_len + server->vals->header_preamble_size < SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) { kfree(utf16_path); return -ENOENT; } - if (get_rfc1002_length(err_buf) + server->vals->header_preamble_size < + if (err_iov.iov_len + server->vals->header_preamble_size < SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) { kfree(utf16_path); return -ENOENT; @@ -2550,7 +2568,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid) unsigned int npages; struct page **pages; unsigned int len; - unsigned int buflen = get_rfc1002_length(buf) + server->vals->header_preamble_size; + unsigned int buflen = server->pdu_size + server->vals->header_preamble_size; int rc; int i = 0; @@ -2624,7 +2642,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server, { int length; char *buf = server->smallbuf; - unsigned int pdu_length = get_rfc1002_length(buf); + unsigned int pdu_length = server->pdu_size; unsigned int buf_size; struct mid_q_entry *mid_entry; @@ -2668,7 +2686,7 @@ static int smb3_receive_transform(struct TCP_Server_Info *server, struct mid_q_entry **mid) { char *buf = server->smallbuf; - unsigned int pdu_length = get_rfc1002_length(buf); + unsigned int pdu_length = server->pdu_size; struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf; unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize); @@ -2699,7 +2717,7 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid) { char *buf = server->large_buf ? server->bigbuf : server->smallbuf; - return handle_read_data(server, mid, buf, get_rfc1002_length(buf) + + return handle_read_data(server, mid, buf, server->pdu_size + server->vals->header_preamble_size, NULL, 0, 0); } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index f7741ce..0f48741 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -268,8 +268,11 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) mutex_unlock(&tcon->ses->session_mutex); cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); - if (rc) + if (rc) { + /* If sess reconnected but tcon didn't, something strange ... */ + printk_once(KERN_WARNING "reconnect tcon failed rc = %d\n", rc); goto out; + } if (smb2_command != SMB2_INTERNAL_CMD) queue_delayed_work(cifsiod_wq, &server->reconnect, 0); @@ -380,10 +383,10 @@ static void build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt) { pneg_ctxt->ContextType = SMB2_ENCRYPTION_CAPABILITIES; - pneg_ctxt->DataLength = cpu_to_le16(6); - pneg_ctxt->CipherCount = cpu_to_le16(2); - pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM; - pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM; + pneg_ctxt->DataLength = cpu_to_le16(4); /* Cipher Count + le16 cipher */ + pneg_ctxt->CipherCount = cpu_to_le16(1); +/* pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM;*/ /* not supported yet */ + pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_CCM; } static void @@ -403,6 +406,101 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, *total_len += 4 + sizeof(struct smb2_preauth_neg_context) + sizeof(struct smb2_encryption_neg_context); } + +static void decode_preauth_context(struct smb2_preauth_neg_context *ctxt) +{ + unsigned int len = le16_to_cpu(ctxt->DataLength); + + /* If invalid preauth context warn but use what we requested, SHA-512 */ + if (len < MIN_PREAUTH_CTXT_DATA_LEN) { + printk_once(KERN_WARNING "server sent bad preauth context\n"); + return; + } + if (le16_to_cpu(ctxt->HashAlgorithmCount) != 1) + printk_once(KERN_WARNING "illegal SMB3 hash algorithm count\n"); + if (ctxt->HashAlgorithms != SMB2_PREAUTH_INTEGRITY_SHA512) + printk_once(KERN_WARNING "unknown SMB3 hash algorithm\n"); +} + +static int decode_encrypt_ctx(struct TCP_Server_Info *server, + struct smb2_encryption_neg_context *ctxt) +{ + unsigned int len = le16_to_cpu(ctxt->DataLength); + + cifs_dbg(FYI, "decode SMB3.11 encryption neg context of len %d\n", len); + if (len < MIN_ENCRYPT_CTXT_DATA_LEN) { + printk_once(KERN_WARNING "server sent bad crypto ctxt len\n"); + return -EINVAL; + } + + if (le16_to_cpu(ctxt->CipherCount) != 1) { + printk_once(KERN_WARNING "illegal SMB3.11 cipher count\n"); + return -EINVAL; + } + cifs_dbg(FYI, "SMB311 cipher type:%d\n", le16_to_cpu(ctxt->Ciphers[0])); + if ((ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_CCM) && + (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_GCM)) { + printk_once(KERN_WARNING "invalid SMB3.11 cipher returned\n"); + return -EINVAL; + } + server->cipher_type = ctxt->Ciphers[0]; + server->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; + return 0; +} + +static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, + struct TCP_Server_Info *server) +{ + struct smb2_neg_context *pctx; + unsigned int offset = le32_to_cpu(rsp->NegotiateContextOffset); + unsigned int ctxt_cnt = le16_to_cpu(rsp->NegotiateContextCount); + unsigned int len_of_smb = be32_to_cpu(rsp->hdr.smb2_buf_length); + unsigned int len_of_ctxts, i; + int rc = 0; + + cifs_dbg(FYI, "decoding %d negotiate contexts\n", ctxt_cnt); + if (len_of_smb <= offset) { + cifs_dbg(VFS, "Invalid response: negotiate context offset\n"); + return -EINVAL; + } + + len_of_ctxts = len_of_smb - offset; + + for (i = 0; i < ctxt_cnt; i++) { + int clen; + /* check that offset is not beyond end of SMB */ + if (len_of_ctxts == 0) + break; + + if (len_of_ctxts < sizeof(struct smb2_neg_context)) + break; + + pctx = (struct smb2_neg_context *)(offset + + server->vals->header_preamble_size + (char *)rsp); + clen = le16_to_cpu(pctx->DataLength); + if (clen > len_of_ctxts) + break; + + if (pctx->ContextType == SMB2_PREAUTH_INTEGRITY_CAPABILITIES) + decode_preauth_context( + (struct smb2_preauth_neg_context *)pctx); + else if (pctx->ContextType == SMB2_ENCRYPTION_CAPABILITIES) + rc = decode_encrypt_ctx(server, + (struct smb2_encryption_neg_context *)pctx); + else + cifs_dbg(VFS, "unknown negcontext of type %d ignored\n", + le16_to_cpu(pctx->ContextType)); + + if (rc) + break; + /* offsets must be 8 byte aligned */ + clen = (clen + 7) & ~0x7; + offset += clen + sizeof(struct smb2_neg_context); + len_of_ctxts -= clen; + } + return rc; +} + #else static void assemble_neg_contexts(struct smb2_negotiate_req *req, unsigned int *total_len) @@ -616,6 +714,15 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) else if (rc == 0) rc = -EIO; } + +#ifdef CONFIG_CIFS_SMB311 + if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) { + if (rsp->NegotiateContextCount) + rc = smb311_decode_neg_context(rsp, server); + else + cifs_dbg(VFS, "Missing expected negotiate contexts\n"); + } +#endif /* CONFIG_CIFS_SMB311 */ neg_exit: free_rsp_buf(resp_buftype, rsp); return rc; @@ -623,19 +730,14 @@ neg_exit: int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) { - int rc = 0; - struct validate_negotiate_info_req vneg_inbuf; + int rc; + struct validate_negotiate_info_req *pneg_inbuf; struct validate_negotiate_info_rsp *pneg_rsp = NULL; u32 rsplen; u32 inbuflen; /* max of 4 dialects */ cifs_dbg(FYI, "validate negotiate\n"); -#ifdef CONFIG_CIFS_SMB_DIRECT - if (tcon->ses->server->rdma) - return 0; -#endif - /* In SMB3.11 preauth integrity supersedes validate negotiate */ if (tcon->ses->server->dialect == SMB311_PROT_ID) return 0; @@ -658,63 +760,69 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) if (tcon->ses->session_flags & SMB2_SESSION_FLAG_IS_NULL) cifs_dbg(VFS, "Unexpected null user (anonymous) auth flag sent by server\n"); - vneg_inbuf.Capabilities = + pneg_inbuf = kmalloc(sizeof(*pneg_inbuf), GFP_NOFS); + if (!pneg_inbuf) + return -ENOMEM; + + pneg_inbuf->Capabilities = cpu_to_le32(tcon->ses->server->vals->req_capabilities); - memcpy(vneg_inbuf.Guid, tcon->ses->server->client_guid, + memcpy(pneg_inbuf->Guid, tcon->ses->server->client_guid, SMB2_CLIENT_GUID_SIZE); if (tcon->ses->sign) - vneg_inbuf.SecurityMode = + pneg_inbuf->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED); else if (global_secflags & CIFSSEC_MAY_SIGN) - vneg_inbuf.SecurityMode = + pneg_inbuf->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED); else - vneg_inbuf.SecurityMode = 0; + pneg_inbuf->SecurityMode = 0; if (strcmp(tcon->ses->server->vals->version_string, SMB3ANY_VERSION_STRING) == 0) { - vneg_inbuf.Dialects[0] = cpu_to_le16(SMB30_PROT_ID); - vneg_inbuf.Dialects[1] = cpu_to_le16(SMB302_PROT_ID); - vneg_inbuf.DialectCount = cpu_to_le16(2); + pneg_inbuf->Dialects[0] = cpu_to_le16(SMB30_PROT_ID); + pneg_inbuf->Dialects[1] = cpu_to_le16(SMB302_PROT_ID); + pneg_inbuf->DialectCount = cpu_to_le16(2); /* structure is big enough for 3 dialects, sending only 2 */ - inbuflen = sizeof(struct validate_negotiate_info_req) - 2; + inbuflen = sizeof(*pneg_inbuf) - + sizeof(pneg_inbuf->Dialects[0]); } else if (strcmp(tcon->ses->server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0) { - vneg_inbuf.Dialects[0] = cpu_to_le16(SMB21_PROT_ID); - vneg_inbuf.Dialects[1] = cpu_to_le16(SMB30_PROT_ID); - vneg_inbuf.Dialects[2] = cpu_to_le16(SMB302_PROT_ID); - vneg_inbuf.DialectCount = cpu_to_le16(3); + pneg_inbuf->Dialects[0] = cpu_to_le16(SMB21_PROT_ID); + pneg_inbuf->Dialects[1] = cpu_to_le16(SMB30_PROT_ID); + pneg_inbuf->Dialects[2] = cpu_to_le16(SMB302_PROT_ID); + pneg_inbuf->DialectCount = cpu_to_le16(3); /* structure is big enough for 3 dialects */ - inbuflen = sizeof(struct validate_negotiate_info_req); + inbuflen = sizeof(*pneg_inbuf); } else { /* otherwise specific dialect was requested */ - vneg_inbuf.Dialects[0] = + pneg_inbuf->Dialects[0] = cpu_to_le16(tcon->ses->server->vals->protocol_id); - vneg_inbuf.DialectCount = cpu_to_le16(1); + pneg_inbuf->DialectCount = cpu_to_le16(1); /* structure is big enough for 3 dialects, sending only 1 */ - inbuflen = sizeof(struct validate_negotiate_info_req) - 4; + inbuflen = sizeof(*pneg_inbuf) - + sizeof(pneg_inbuf->Dialects[0]) * 2; } rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */, - (char *)&vneg_inbuf, sizeof(struct validate_negotiate_info_req), - (char **)&pneg_rsp, &rsplen); + (char *)pneg_inbuf, inbuflen, (char **)&pneg_rsp, &rsplen); if (rc != 0) { cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc); - return -EIO; + rc = -EIO; + goto out_free_inbuf; } - if (rsplen != sizeof(struct validate_negotiate_info_rsp)) { + rc = -EIO; + if (rsplen != sizeof(*pneg_rsp)) { cifs_dbg(VFS, "invalid protocol negotiate response size: %d\n", rsplen); /* relax check since Mac returns max bufsize allowed on ioctl */ - if ((rsplen > CIFSMaxBufSize) - || (rsplen < sizeof(struct validate_negotiate_info_rsp))) - goto err_rsp_free; + if (rsplen > CIFSMaxBufSize || rsplen < sizeof(*pneg_rsp)) + goto out_free_rsp; } /* check validate negotiate info response matches what we got earlier */ @@ -731,15 +839,17 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) goto vneg_out; /* validate negotiate successful */ + rc = 0; cifs_dbg(FYI, "validate negotiate info successful\n"); - kfree(pneg_rsp); - return 0; + goto out_free_rsp; vneg_out: cifs_dbg(VFS, "protocol revalidation - security settings mismatch\n"); -err_rsp_free: +out_free_rsp: kfree(pneg_rsp); - return -EIO; +out_free_inbuf: + kfree(pneg_inbuf); + return rc; } enum securityEnum @@ -1026,7 +1136,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) if (rc) goto out; - if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 != + if (offsetof(struct smb2_sess_setup_rsp, Buffer) - ses->server->vals->header_preamble_size != le16_to_cpu(rsp->SecurityBufferOffset)) { cifs_dbg(VFS, "Invalid security buffer offset %d\n", le16_to_cpu(rsp->SecurityBufferOffset)); @@ -1701,7 +1811,7 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len, int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, __u8 *oplock, struct smb2_file_all_info *buf, - struct smb2_err_rsp **err_buf) + struct kvec *err_iov) { struct smb2_create_req *req; struct smb2_create_rsp *rsp; @@ -1841,9 +1951,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, if (rc != 0) { cifs_stats_fail_inc(tcon, SMB2_CREATE_HE); - if (err_buf && rsp) - *err_buf = kmemdup(rsp, get_rfc1002_length(rsp) + 4, - GFP_KERNEL); + if (err_iov && rsp) { + *err_iov = rsp_iov; + resp_buftype = CIFS_NO_BUFFER; + rsp = NULL; + } goto creat_exit; } @@ -2098,13 +2210,13 @@ close_exit: } static int -validate_buf(unsigned int offset, unsigned int buffer_length, - struct smb2_hdr *hdr, unsigned int min_buf_size) - +validate_iov(struct TCP_Server_Info *server, + unsigned int offset, unsigned int buffer_length, + struct kvec *iov, unsigned int min_buf_size) { - unsigned int smb_len = be32_to_cpu(hdr->smb2_buf_length); - char *end_of_smb = smb_len + 4 /* RFC1001 length field */ + (char *)hdr; - char *begin_of_buf = 4 /* RFC1001 len field */ + offset + (char *)hdr; + unsigned int smb_len = iov->iov_len; + char *end_of_smb = smb_len + server->vals->header_preamble_size + (char *)iov->iov_base; + char *begin_of_buf = server->vals->header_preamble_size + offset + (char *)iov->iov_base; char *end_of_buf = begin_of_buf + buffer_length; @@ -2134,18 +2246,18 @@ validate_buf(unsigned int offset, unsigned int buffer_length, * Caller must free buffer. */ static int -validate_and_copy_buf(unsigned int offset, unsigned int buffer_length, - struct smb2_hdr *hdr, unsigned int minbufsize, +validate_and_copy_iov(struct TCP_Server_Info *server, + unsigned int offset, unsigned int buffer_length, + struct kvec *iov, unsigned int minbufsize, char *data) - { - char *begin_of_buf = 4 /* RFC1001 len field */ + offset + (char *)hdr; + char *begin_of_buf = server->vals->header_preamble_size + offset + (char *)(iov->iov_base); int rc; if (!data) return -EINVAL; - rc = validate_buf(offset, buffer_length, hdr, minbufsize); + rc = validate_iov(server, offset, buffer_length, iov, minbufsize); if (rc) return rc; @@ -2223,9 +2335,10 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, } } - rc = validate_and_copy_buf(le16_to_cpu(rsp->OutputBufferOffset), + rc = validate_and_copy_iov(ses->server, + le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), - &rsp->hdr, min_len, *data); + &rsp_iov, min_len, *data); qinf_exit: free_rsp_buf(resp_buftype, rsp); @@ -2481,7 +2594,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len, * If we want to do a RDMA write, fill in and append * smbd_buffer_descriptor_v1 to the end of read request */ - if (server->rdma && rdata && + if (server->rdma && rdata && !server->sign && rdata->bytes >= server->smbd_conn->rdma_readwrite_threshold) { struct smbd_buffer_descriptor_v1 *v1; @@ -2859,7 +2972,7 @@ smb2_async_writev(struct cifs_writedata *wdata, * If we want to do a server RDMA read, fill in and append * smbd_buffer_descriptor_v1 to the end of write request */ - if (server->rdma && wdata->bytes >= + if (server->rdma && !server->sign && wdata->bytes >= server->smbd_conn->rdma_readwrite_threshold) { struct smbd_buffer_descriptor_v1 *v1; @@ -3146,8 +3259,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, goto qdir_exit; } - rc = validate_buf(le16_to_cpu(rsp->OutputBufferOffset), - le32_to_cpu(rsp->OutputBufferLength), &rsp->hdr, + rc = validate_iov(server, + le16_to_cpu(rsp->OutputBufferOffset), + le32_to_cpu(rsp->OutputBufferLength), &rsp_iov, info_buf_size); if (rc) goto qdir_exit; @@ -3454,7 +3568,7 @@ static int build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level, int outbuf_len, u64 persistent_fid, u64 volatile_fid) { - struct TCP_Server_Info *server = tcon->ses->server; + struct TCP_Server_Info *server; int rc; struct smb2_query_info_req *req; unsigned int total_len; @@ -3464,6 +3578,8 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level, if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) return -EIO; + server = tcon->ses->server; + rc = smb2_plain_req_init(SMB2_QUERY_INFO, tcon, (void **) &req, &total_len); if (rc) @@ -3517,8 +3633,9 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, info = (struct smb2_fs_full_size_info *)(server->vals->header_preamble_size + le16_to_cpu(rsp->OutputBufferOffset) + (char *)&rsp->hdr); - rc = validate_buf(le16_to_cpu(rsp->OutputBufferOffset), - le32_to_cpu(rsp->OutputBufferLength), &rsp->hdr, + rc = validate_iov(server, + le16_to_cpu(rsp->OutputBufferOffset), + le32_to_cpu(rsp->OutputBufferLength), &rsp_iov, sizeof(struct smb2_fs_full_size_info)); if (!rc) copy_fs_info_to_kstatfs(info, fsdata); @@ -3574,7 +3691,7 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, rsp_len = le32_to_cpu(rsp->OutputBufferLength); offset = le16_to_cpu(rsp->OutputBufferOffset); - rc = validate_buf(offset, rsp_len, &rsp->hdr, min_len); + rc = validate_iov(server, offset, rsp_len, &rsp_iov, min_len); if (rc) goto qfsattr_exit; diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 253e2c7..d28f358 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -263,11 +263,19 @@ struct smb2_negotiate_req { #define SMB2_NT_FIND 0x00100000 #define SMB2_LARGE_FILES 0x00200000 +struct smb2_neg_context { + __le16 ContextType; + __le16 DataLength; + __le32 Reserved; + /* Followed by array of data */ +} __packed; + #define SMB311_SALT_SIZE 32 /* Hash Algorithm Types */ #define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001) #define SMB2_PREAUTH_HASH_SIZE 64 +#define MIN_PREAUTH_CTXT_DATA_LEN (SMB311_SALT_SIZE + 6) struct smb2_preauth_neg_context { __le16 ContextType; /* 1 */ __le16 DataLength; @@ -282,12 +290,14 @@ struct smb2_preauth_neg_context { #define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) #define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002) +/* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */ +#define MIN_ENCRYPT_CTXT_DATA_LEN 4 struct smb2_encryption_neg_context { __le16 ContextType; /* 2 */ __le16 DataLength; __le32 Reserved; __le16 CipherCount; /* AES-128-GCM and AES-128-CCM */ - __le16 Ciphers[2]; /* Ciphers[0] since only one used now */ + __le16 Ciphers[1]; /* Ciphers[0] since only one used now */ } __packed; struct smb2_negotiate_rsp { diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index cbcce3f..8ba24a9 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -122,7 +122,7 @@ extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon); extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, __u8 *oplock, struct smb2_file_all_info *buf, - struct smb2_err_rsp **err_buf); + struct kvec *err_iov); extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data, u32 indatalen, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index bf49cb7..8806f3f 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -604,7 +604,7 @@ int smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error) { - unsigned int len = get_rfc1002_length(mid->resp_buf); + unsigned int len = mid->resp_buf_size; struct kvec iov[2]; struct smb_rqst rqst = { .rq_iov = iov, .rq_nvec = 2 }; diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 5008af5..c62f7c9 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -1028,7 +1028,7 @@ static int smbd_post_send(struct smbd_connection *info, for (i = 0; i < request->num_sge; i++) { log_rdma_send(INFO, "rdma_request sge[%d] addr=%llu length=%u\n", - i, request->sge[0].addr, request->sge[0].length); + i, request->sge[i].addr, request->sge[i].length); ib_dma_sync_single_for_device( info->id->device, request->sge[i].addr, @@ -2086,7 +2086,7 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) int start, i, j; int max_iov_size = info->max_send_size - sizeof(struct smbd_data_transfer); - struct kvec iov[SMBDIRECT_MAX_SGE]; + struct kvec *iov; int rc; info->smbd_send_pending++; @@ -2096,32 +2096,20 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) } /* - * This usually means a configuration error - * We use RDMA read/write for packet size > rdma_readwrite_threshold - * as long as it's properly configured we should never get into this - * situation - */ - if (rqst->rq_nvec + rqst->rq_npages > SMBDIRECT_MAX_SGE) { - log_write(ERR, "maximum send segment %x exceeding %x\n", - rqst->rq_nvec + rqst->rq_npages, SMBDIRECT_MAX_SGE); - rc = -EINVAL; - goto done; - } - - /* - * Remove the RFC1002 length defined in MS-SMB2 section 2.1 - * It is used only for TCP transport + * Skip the RFC1002 length defined in MS-SMB2 section 2.1 + * It is used only for TCP transport in the iov[0] * In future we may want to add a transport layer under protocol * layer so this will only be issued to TCP transport */ - iov[0].iov_base = (char *)rqst->rq_iov[0].iov_base + 4; - iov[0].iov_len = rqst->rq_iov[0].iov_len - 4; - buflen += iov[0].iov_len; + + if (rqst->rq_iov[0].iov_len != 4) { + log_write(ERR, "expected the pdu length in 1st iov, but got %zu\n", rqst->rq_iov[0].iov_len); + return -EINVAL; + } + iov = &rqst->rq_iov[1]; /* total up iov array first */ - for (i = 1; i < rqst->rq_nvec; i++) { - iov[i].iov_base = rqst->rq_iov[i].iov_base; - iov[i].iov_len = rqst->rq_iov[i].iov_len; + for (i = 0; i < rqst->rq_nvec-1; i++) { buflen += iov[i].iov_len; } @@ -2139,6 +2127,10 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) goto done; } + cifs_dbg(FYI, "Sending smb (RDMA): smb_len=%u\n", buflen); + for (i = 0; i < rqst->rq_nvec-1; i++) + dump_smb(iov[i].iov_base, iov[i].iov_len); + remaining_data_length = buflen; log_write(INFO, "rqst->rq_nvec=%d rqst->rq_npages=%d rq_pagesz=%d " @@ -2194,12 +2186,14 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) goto done; } i++; + if (i == rqst->rq_nvec-1) + break; } start = i; buflen = 0; } else { i++; - if (i == rqst->rq_nvec) { + if (i == rqst->rq_nvec-1) { /* send out all remaining vecs */ remaining_data_length -= buflen; log_write(INFO, diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 279718d..927226a 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -753,7 +753,7 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, goto out; #ifdef CONFIG_CIFS_SMB311 - if (ses->status == CifsNew) + if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) smb311_update_preauth_hash(ses, rqst->rq_iov+1, rqst->rq_nvec-1); #endif @@ -790,7 +790,7 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, buf = (char *)midQ->resp_buf; resp_iov->iov_base = buf; - resp_iov->iov_len = get_rfc1002_length(buf) + + resp_iov->iov_len = midQ->resp_buf_size + ses->server->vals->header_preamble_size; if (midQ->large_buf) *resp_buf_type = CIFS_LARGE_BUFFER; @@ -798,7 +798,7 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, *resp_buf_type = CIFS_SMALL_BUFFER; #ifdef CONFIG_CIFS_SMB311 - if (ses->status == CifsNew) { + if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) { struct kvec iov = { .iov_base = buf + 4, .iov_len = get_rfc1002_length(buf) @@ -834,8 +834,11 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, if (n_vec + 1 > CIFS_MAX_IOV_SIZE) { new_iov = kmalloc(sizeof(struct kvec) * (n_vec + 1), GFP_KERNEL); - if (!new_iov) + if (!new_iov) { + /* otherwise cifs_send_recv below sets resp_buf_type */ + *resp_buf_type = CIFS_NO_BUFFER; return -ENOMEM; + } } else new_iov = s_iov; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 846ca15..4dd842f 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1997,6 +1997,16 @@ out: return rc; } +static bool is_dot_dotdot(const char *name, size_t name_size) +{ + if (name_size == 1 && name[0] == '.') + return true; + else if (name_size == 2 && name[0] == '.' && name[1] == '.') + return true; + + return false; +} + /** * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext * @plaintext_name: The plaintext name @@ -2021,13 +2031,21 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, size_t packet_size; int rc = 0; - if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) - && !(mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) - && (name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) - && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, - ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) { - const char *orig_name = name; - size_t orig_name_size = name_size; + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) && + !(mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)) { + if (is_dot_dotdot(name, name_size)) { + rc = ecryptfs_copy_filename(plaintext_name, + plaintext_name_size, + name, name_size); + goto out; + } + + if (name_size <= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE || + strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)) { + rc = -EINVAL; + goto out; + } name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; @@ -2047,12 +2065,9 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, decoded_name, decoded_name_size); if (rc) { - printk(KERN_INFO "%s: Could not parse tag 70 packet " - "from filename; copying through filename " - "as-is\n", __func__); - rc = ecryptfs_copy_filename(plaintext_name, - plaintext_name_size, - orig_name, orig_name_size); + ecryptfs_printk(KERN_DEBUG, + "%s: Could not parse tag 70 packet from filename\n", + __func__); goto out_free; } } else { diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index c74ed3c..b76a985 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -82,17 +82,28 @@ ecryptfs_filldir(struct dir_context *ctx, const char *lower_name, buf->sb, lower_name, lower_namelen); if (rc) { - printk(KERN_ERR "%s: Error attempting to decode and decrypt " - "filename [%s]; rc = [%d]\n", __func__, lower_name, - rc); - goto out; + if (rc != -EINVAL) { + ecryptfs_printk(KERN_DEBUG, + "%s: Error attempting to decode and decrypt filename [%s]; rc = [%d]\n", + __func__, lower_name, rc); + return rc; + } + + /* Mask -EINVAL errors as these are most likely due a plaintext + * filename present in the lower filesystem despite filename + * encryption being enabled. One unavoidable example would be + * the "lost+found" dentry in the root directory of an Ext4 + * filesystem. + */ + return 0; } + buf->caller->pos = buf->ctx.pos; rc = !dir_emit(buf->caller, name, name_size, ino, d_type); kfree(name); if (!rc) buf->entries_written++; -out: + return rc; } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 847904a..97d17ea 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -395,8 +395,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, mount_crypt_stat = &ecryptfs_superblock_to_private( ecryptfs_dentry->d_sb)->mount_crypt_stat; - if (mount_crypt_stat - && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) { + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { rc = ecryptfs_encrypt_and_encode_filename( &encrypted_and_encoded_name, &len, mount_crypt_stat, name, len); diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index c89a58cf..e74fe84 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1880,7 +1880,7 @@ find_next_matching_auth_tok: candidate_auth_tok = &auth_tok_list_item->auth_tok; if (unlikely(ecryptfs_verbosity > 0)) { ecryptfs_printk(KERN_DEBUG, - "Considering cadidate auth tok:\n"); + "Considering candidate auth tok:\n"); ecryptfs_dump_auth_tok(candidate_auth_tok); } rc = ecryptfs_get_auth_tok_sig(&candidate_auth_tok_sig, diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 329a5d1..645158d 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -435,6 +435,15 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, if (IS_ERR_OR_NULL(result)) return ERR_PTR(-ESTALE); + /* + * If no acceptance criteria was specified by caller, a disconnected + * dentry is also accepatable. Callers may use this mode to query if + * file handle is stale or to get a reference to an inode without + * risking the high overhead caused by directory reconnect. + */ + if (!acceptable) + return result; + if (d_is_dir(result)) { /* * This request is for a directory. diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 0964022..047c327 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -88,11 +88,11 @@ out_unlock: * The default page_lock and i_size verification done by non-DAX fault paths * is sufficient because ext2 doesn't support hole punching. */ -static int ext2_dax_fault(struct vm_fault *vmf) +static vm_fault_t ext2_dax_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); struct ext2_inode_info *ei = EXT2_I(inode); - int ret; + vm_fault_t ret; if (vmf->flags & FAULT_FLAG_WRITE) { sb_start_pagefault(inode->i_sb); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index a33d8fb..508b905 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -321,6 +321,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t offset; ext4_grpblk_t next_zero_bit; + ext4_grpblk_t max_bit = EXT4_CLUSTERS_PER_GROUP(sb); ext4_fsblk_t blk; ext4_fsblk_t group_first_block; @@ -338,7 +339,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, /* check whether block bitmap block number is set */ blk = ext4_block_bitmap(sb, desc); offset = blk - group_first_block; - if (offset < 0 || EXT4_B2C(sbi, offset) >= sb->s_blocksize || + if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; @@ -346,7 +347,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, /* check whether the inode bitmap block number is set */ blk = ext4_inode_bitmap(sb, desc); offset = blk - group_first_block; - if (offset < 0 || EXT4_B2C(sbi, offset) >= sb->s_blocksize || + if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; @@ -354,8 +355,8 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, /* check whether the inode table block number is set */ blk = ext4_inode_table(sb, desc); offset = blk - group_first_block; - if (offset < 0 || EXT4_B2C(sbi, offset) >= sb->s_blocksize || - EXT4_B2C(sbi, offset + sbi->s_itb_per_group) >= sb->s_blocksize) + if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || + EXT4_B2C(sbi, offset + sbi->s_itb_per_group) >= max_bit) return blk; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, EXT4_B2C(sbi, offset + sbi->s_itb_per_group), diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0a73159..c969275 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5329,8 +5329,9 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, stop = le32_to_cpu(extent->ee_block); /* - * In case of left shift, Don't start shifting extents until we make - * sure the hole is big enough to accommodate the shift. + * For left shifts, make sure the hole on the left is big enough to + * accommodate the shift. For right shifts, make sure the last extent + * won't be shifted beyond EXT_MAX_BLOCKS. */ if (SHIFT == SHIFT_LEFT) { path = ext4_find_extent(inode, start - 1, &path, @@ -5350,9 +5351,14 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, if ((start == ex_start && shift > ex_start) || (shift > start - ex_end)) { - ext4_ext_drop_refs(path); - kfree(path); - return -EINVAL; + ret = -EINVAL; + goto out; + } + } else { + if (shift > EXT_MAX_BLOCKS - + (stop + ext4_ext_get_actual_len(extent))) { + ret = -EINVAL; + goto out; } } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 185f7e6..eb104e8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5886,5 +5886,6 @@ static void __exit ext4_exit_fs(void) MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Fourth Extended Filesystem"); MODULE_LICENSE("GPL"); +MODULE_SOFTDEP("pre: crc32c"); module_init(ext4_init_fs) module_exit(ext4_exit_fs) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4b12ba7..471d863 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -745,11 +745,12 @@ int inode_congested(struct inode *inode, int cong_bits) */ if (inode && inode_to_wb_is_valid(inode)) { struct bdi_writeback *wb; - bool locked, congested; + struct wb_lock_cookie lock_cookie = {}; + bool congested; - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &lock_cookie); congested = wb_congested(wb, cong_bits); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &lock_cookie); return congested; } @@ -1960,7 +1961,7 @@ void wb_workfn(struct work_struct *work) } if (!list_empty(&wb->work_list)) - mod_delayed_work(bdi_wq, &wb->dwork, 0); + wb_wakeup(wb); else if (wb_has_dirty_io(wb) && dirty_writeback_interval) wb_wakeup_delayed(wb); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 685c305..278ed08 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1744,7 +1744,7 @@ do_grow_qunlock: * @newsize: the size to make the file * * The file size can grow, shrink, or stay the same size. This - * is called holding i_mutex and an exclusive glock on the inode + * is called holding i_rwsem and an exclusive glock on the inode * in question. * * Returns: errno diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 82fb558..097bd3c 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1923,28 +1923,37 @@ void gfs2_glock_exit(void) static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi, loff_t n) { - if (n == 0) - gi->gl = rhashtable_walk_peek(&gi->hti); - else { - gi->gl = rhashtable_walk_next(&gi->hti); - n--; + struct gfs2_glock *gl = gi->gl; + + if (gl) { + if (n == 0) + return; + if (!lockref_put_not_zero(&gl->gl_lockref)) + gfs2_glock_queue_put(gl); } for (;;) { - if (IS_ERR_OR_NULL(gi->gl)) { - if (!gi->gl) - return; - if (PTR_ERR(gi->gl) != -EAGAIN) { - gi->gl = NULL; - return; + gl = rhashtable_walk_next(&gi->hti); + if (IS_ERR_OR_NULL(gl)) { + if (gl == ERR_PTR(-EAGAIN)) { + n = 1; + continue; } - n = 0; - } else if (gi->sdp == gi->gl->gl_name.ln_sbd && - !__lockref_is_dead(&gi->gl->gl_lockref)) { - if (!n--) - break; + gl = NULL; + break; + } + if (gl->gl_name.ln_sbd != gi->sdp) + continue; + if (n <= 1) { + if (!lockref_get_not_dead(&gl->gl_lockref)) + continue; + break; + } else { + if (__lockref_is_dead(&gl->gl_lockref)) + continue; + n--; } - gi->gl = rhashtable_walk_next(&gi->hti); } + gi->gl = gl; } static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) @@ -1988,7 +1997,6 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) { struct gfs2_glock_iter *gi = seq->private; - gi->gl = NULL; rhashtable_walk_stop(&gi->hti); } @@ -2076,7 +2084,8 @@ static int gfs2_glocks_release(struct inode *inode, struct file *file) struct seq_file *seq = file->private_data; struct gfs2_glock_iter *gi = seq->private; - gi->gl = NULL; + if (gi->gl) + gfs2_glock_put(gi->gl); rhashtable_walk_exit(&gi->hti); return seq_release_private(inode, file); } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e6a0a8a..3ba3f16 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -825,7 +825,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) goto fail_rindex; } /* - * i_mutex on quota files is special. Since this inode is hidden system + * i_rwsem on quota files is special. Since this inode is hidden system * file, we are safe to define locking ourselves. */ lockdep_set_class(&sdp->sd_quota_inode->i_rwsem, diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 513c357..a6c0f54 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -588,6 +588,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) return 0; out_put_hidden_dir: + cancel_delayed_work_sync(&sbi->sync_work); iput(sbi->hidden_dir); out_put_root: dput(sb->s_root); diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 9bb2fe3..10205ec 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -20,6 +20,7 @@ #include <linux/init.h> #include <linux/bio.h> +#include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/zlib.h> @@ -59,7 +60,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, >> bufshift; int haveblocks; blkcnt_t blocknum; - struct buffer_head *bhs[needblocks + 1]; + struct buffer_head **bhs; int curbh, curpage; if (block_size > deflateBound(1UL << zisofs_block_shift)) { @@ -80,7 +81,11 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, /* Because zlib is not thread-safe, do all the I/O at the top. */ blocknum = block_start >> bufshift; - memset(bhs, 0, (needblocks + 1) * sizeof(struct buffer_head *)); + bhs = kcalloc(needblocks + 1, sizeof(*bhs), GFP_KERNEL); + if (!bhs) { + *errp = -ENOMEM; + return 0; + } haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks); ll_rw_block(REQ_OP_READ, 0, haveblocks, bhs); @@ -190,6 +195,7 @@ z_eio: b_eio: for (i = 0; i < haveblocks; i++) brelse(bhs[i]); + kfree(bhs); return stream.total_out; } @@ -305,7 +311,7 @@ static int zisofs_readpage(struct file *file, struct page *page) unsigned int zisofs_pages_per_cblock = PAGE_SHIFT <= zisofs_block_shift ? (1 << (zisofs_block_shift - PAGE_SHIFT)) : 0; - struct page *pages[max_t(unsigned, zisofs_pages_per_cblock, 1)]; + struct page **pages; pgoff_t index = page->index, end_index; end_index = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -330,6 +336,12 @@ static int zisofs_readpage(struct file *file, struct page *page) full_page = 0; pcount = 1; } + pages = kcalloc(max_t(unsigned int, zisofs_pages_per_cblock, 1), + sizeof(*pages), GFP_KERNEL); + if (!pages) { + unlock_page(page); + return -ENOMEM; + } pages[full_page] = page; for (i = 0; i < pcount; i++, index++) { @@ -357,6 +369,7 @@ static int zisofs_readpage(struct file *file, struct page *page) } /* At this point, err contains 0 or -EIO depending on the "critical" page */ + kfree(pages); return err; } diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index bc258a4..ec3fba7 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -394,7 +394,10 @@ static int parse_options(char *options, struct iso9660_options *popt) break; #ifdef CONFIG_JOLIET case Opt_iocharset: + kfree(popt->iocharset); popt->iocharset = match_strdup(&args[0]); + if (!popt->iocharset) + return 0; break; #endif case Opt_map_a: diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index ac31103..8aa4537 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -532,6 +532,7 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, */ ret = start_this_handle(journal, handle, GFP_NOFS); if (ret < 0) { + handle->h_journal = journal; jbd2_journal_free_reserved(handle); return ret; } diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index f60dee7..87bdf0f 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -342,7 +342,7 @@ static void jffs2_put_super (struct super_block *sb) static void jffs2_kill_sb(struct super_block *sb) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); - if (!sb_rdonly(sb)) + if (c && !sb_rdonly(sb)) jffs2_stop_garbage_collect_thread(c); kill_mtd_super(sb); kfree(c); diff --git a/fs/namespace.c b/fs/namespace.c index e398f32..5f75969 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1089,7 +1089,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, goto out_free; } - mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); + mnt->mnt.mnt_flags = old->mnt.mnt_flags; + mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); /* Don't allow unprivileged users to change mount flags */ if (flag & CL_UNPRIVILEGED) { mnt->mnt.mnt_flags |= MNT_LOCK_ATIME; @@ -2814,7 +2815,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, mnt_flags |= MNT_NODIRATIME; if (flags & MS_STRICTATIME) mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); - if (flags & SB_RDONLY) + if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; /* The default atime for remount is preservation */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 123c069..a813979 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -535,35 +535,10 @@ static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char return 0; } -#define CB_SUPPORTED_ATTR0 (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) -#define CB_SUPPORTED_ATTR1 (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) -static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, __be32 **savep) +static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, size_t sz) { - __be32 bm[2]; - __be32 *p; - - bm[0] = htonl(bitmap[0] & CB_SUPPORTED_ATTR0); - bm[1] = htonl(bitmap[1] & CB_SUPPORTED_ATTR1); - if (bm[1] != 0) { - p = xdr_reserve_space(xdr, 16); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); - *p++ = htonl(2); - *p++ = bm[0]; - *p++ = bm[1]; - } else if (bm[0] != 0) { - p = xdr_reserve_space(xdr, 12); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); - *p++ = htonl(1); - *p++ = bm[0]; - } else { - p = xdr_reserve_space(xdr, 8); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); - *p++ = htonl(0); - } - *savep = p; + if (xdr_stream_encode_uint32_array(xdr, bitmap, sz) < 0) + return cpu_to_be32(NFS4ERR_RESOURCE); return 0; } @@ -656,9 +631,13 @@ static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, if (unlikely(status != 0)) goto out; - status = encode_attr_bitmap(xdr, res->bitmap, &savep); + status = encode_attr_bitmap(xdr, res->bitmap, ARRAY_SIZE(res->bitmap)); if (unlikely(status != 0)) goto out; + status = cpu_to_be32(NFS4ERR_RESOURCE); + savep = xdr_reserve_space(xdr, sizeof(*savep)); + if (unlikely(!savep)) + goto out; status = encode_attr_change(xdr, res->bitmap, res->change_attr); if (unlikely(status != 0)) goto out; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index d8b4762..1819d0d 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -19,6 +19,7 @@ #include <linux/nfs_xdr.h> #include "nfs4_fs.h" +#include "nfs4session.h" #include "delegation.h" #include "internal.h" #include "nfs4trace.h" @@ -171,11 +172,15 @@ again: * nfs_inode_reclaim_delegation - process a delegation reclaim request * @inode: inode to process * @cred: credential to use for request - * @res: new delegation state from server + * @type: delegation type + * @stateid: delegation stateid + * @pagemod_limit: write delegation "space_limit" * */ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, - struct nfs_openres *res) + fmode_t type, + const nfs4_stateid *stateid, + unsigned long pagemod_limit) { struct nfs_delegation *delegation; struct rpc_cred *oldcred = NULL; @@ -185,9 +190,9 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, if (delegation != NULL) { spin_lock(&delegation->lock); if (delegation->inode != NULL) { - nfs4_stateid_copy(&delegation->stateid, &res->delegation); - delegation->type = res->delegation_type; - delegation->pagemod_limit = res->pagemod_limit; + nfs4_stateid_copy(&delegation->stateid, stateid); + delegation->type = type; + delegation->pagemod_limit = pagemod_limit; oldcred = delegation->cred; delegation->cred = get_rpccred(cred); clear_bit(NFS_DELEGATION_NEED_RECLAIM, @@ -195,14 +200,14 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, spin_unlock(&delegation->lock); rcu_read_unlock(); put_rpccred(oldcred); - trace_nfs4_reclaim_delegation(inode, res->delegation_type); + trace_nfs4_reclaim_delegation(inode, type); return; } /* We appear to have raced with a delegation return. */ spin_unlock(&delegation->lock); } rcu_read_unlock(); - nfs_inode_set_delegation(inode, cred, res); + nfs_inode_set_delegation(inode, cred, type, stateid, pagemod_limit); } static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) @@ -329,11 +334,16 @@ nfs_update_inplace_delegation(struct nfs_delegation *delegation, * nfs_inode_set_delegation - set up a delegation on an inode * @inode: inode to which delegation applies * @cred: cred to use for subsequent delegation processing - * @res: new delegation state from server + * @type: delegation type + * @stateid: delegation stateid + * @pagemod_limit: write delegation "space_limit" * * Returns zero on success, or a negative errno value. */ -int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) +int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, + fmode_t type, + const nfs4_stateid *stateid, + unsigned long pagemod_limit) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_client *clp = server->nfs_client; @@ -345,9 +355,9 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct delegation = kmalloc(sizeof(*delegation), GFP_NOFS); if (delegation == NULL) return -ENOMEM; - nfs4_stateid_copy(&delegation->stateid, &res->delegation); - delegation->type = res->delegation_type; - delegation->pagemod_limit = res->pagemod_limit; + nfs4_stateid_copy(&delegation->stateid, stateid); + delegation->type = type; + delegation->pagemod_limit = pagemod_limit; delegation->change_attr = inode_peek_iversion_raw(inode); delegation->cred = get_rpccred(cred); delegation->inode = inode; @@ -392,7 +402,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; - trace_nfs4_set_delegation(inode, res->delegation_type); + trace_nfs4_set_delegation(inode, type); out: spin_unlock(&clp->cl_lock); @@ -547,6 +557,22 @@ int nfs4_inode_return_delegation(struct inode *inode) return err; } +/** + * nfs4_inode_make_writeable + * @inode: pointer to inode + * + * Make the inode writeable by returning the delegation if necessary + * + * Returns zero on success, or a negative errno value. + */ +int nfs4_inode_make_writeable(struct inode *inode) +{ + if (!nfs4_has_session(NFS_SERVER(inode)->nfs_client) || + !nfs4_check_delegation(inode, FMODE_WRITE)) + return nfs4_inode_return_delegation(inode); + return 0; +} + static void nfs_mark_return_if_closed_delegation(struct nfs_server *server, struct nfs_delegation *delegation) { diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 185a09f..bb1ef8c 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -36,8 +36,10 @@ enum { NFS_DELEGATION_TEST_EXPIRED, }; -int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); -void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); +int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, + fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit); +void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, + fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit); int nfs4_inode_return_delegation(struct inode *inode); int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); void nfs_inode_return_delegation_noreclaim(struct inode *inode); @@ -70,6 +72,7 @@ int nfs4_check_delegation(struct inode *inode, fmode_t flags); bool nfs4_delegation_flush_on_close(const struct inode *inode); void nfs_inode_find_delegation_state_and_recover(struct inode *inode, const nfs4_stateid *stateid); +int nfs4_inode_make_writeable(struct inode *inode); #endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2f3f867..73f8b43 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1272,7 +1272,9 @@ static void nfs_drop_nlink(struct inode *inode) /* drop the inode if we're reasonably sure this is the last link */ if (inode->i_nlink == 1) clear_nlink(inode); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME + | NFS_INO_INVALID_OTHER; spin_unlock(&inode->i_lock); } @@ -1798,12 +1800,11 @@ static int nfs_safe_remove(struct dentry *dentry) trace_nfs_remove_enter(dir, dentry); if (inode != NULL) { - NFS_PROTO(inode)->return_delegation(inode); - error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); + error = NFS_PROTO(dir)->remove(dir, dentry); if (error == 0) nfs_drop_nlink(inode); } else - error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); + error = NFS_PROTO(dir)->remove(dir, dentry); if (error == -ENOENT) nfs_dentry_handle_enoent(dentry); trace_nfs_remove_exit(dir, dentry, error); @@ -1932,8 +1933,6 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) old_dentry, dentry); trace_nfs_link_enter(inode, dir, dentry); - NFS_PROTO(inode)->return_delegation(inode); - d_drop(dentry); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { @@ -2023,10 +2022,6 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, } } - NFS_PROTO(old_inode)->return_delegation(old_inode); - if (new_inode != NULL) - NFS_PROTO(new_inode)->return_delegation(new_inode); - task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); if (IS_ERR(task)) { error = PTR_ERR(task); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d17a90c..bd15d0b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -195,7 +195,10 @@ bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags) static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) { struct nfs_inode *nfsi = NFS_I(inode); + bool have_delegation = nfs_have_delegated_attributes(inode); + if (have_delegation) + flags &= ~(NFS_INO_INVALID_CHANGE|NFS_INO_REVAL_PAGECACHE); if (inode->i_mapping->nrpages == 0) flags &= ~NFS_INO_INVALID_DATA; nfsi->cache_validity |= flags; @@ -447,7 +450,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st inode->i_mode = fattr->mode; if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 && nfs_server_capable(inode, NFS_CAP_MODE)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. */ @@ -493,37 +496,35 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; else if (nfs_server_capable(inode, NFS_CAP_ATIME)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode->i_mtime = fattr->mtime; else if (nfs_server_capable(inode, NFS_CAP_MTIME)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; else if (nfs_server_capable(inode, NFS_CAP_CTIME)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode_set_iversion_raw(inode, fattr->change_attr); else - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR - | NFS_INO_REVAL_PAGECACHE); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE); if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); else - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR - | NFS_INO_REVAL_PAGECACHE); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_SIZE); if (fattr->valid & NFS_ATTR_FATTR_NLINK) set_nlink(inode, fattr->nlink); else if (nfs_server_capable(inode, NFS_CAP_NLINK)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; else if (nfs_server_capable(inode, NFS_CAP_OWNER)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (fattr->valid & NFS_ATTR_FATTR_GROUP) inode->i_gid = fattr->gid; else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { @@ -608,11 +609,6 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) goto out; } - /* - * Return any delegations if we're going to change ACLs - */ - if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) - NFS_PROTO(inode)->return_delegation(inode); error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); if (error == 0) error = nfs_refresh_inode(inode, fattr); @@ -645,6 +641,7 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) /* Optimisation */ if (offset == 0) NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; spin_unlock(&inode->i_lock); truncate_pagecache(inode, offset); @@ -657,6 +654,7 @@ out: * nfs_setattr_update_inode - Update inode metadata after a setattr call. * @inode: pointer to struct inode * @attr: pointer to struct iattr + * @fattr: pointer to struct nfs_fattr * * Note: we do this in the *proc.c in order to ensure that * it works for things like exclusive creates too. @@ -669,6 +667,8 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, spin_lock(&inode->i_lock); NFS_I(inode)->attr_gencount = fattr->gencount; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME); if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; @@ -683,13 +683,12 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, | NFS_INO_INVALID_ACL); } if ((attr->ia_valid & ATTR_SIZE) != 0) { + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); nfs_vmtruncate(inode, attr->ia_size); } if (fattr->valid) nfs_update_inode(inode, fattr); - else - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); @@ -1303,24 +1302,20 @@ static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi); } -static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) +static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { - unsigned long ret = 0; - if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) && (fattr->valid & NFS_ATTR_FATTR_CHANGE) && inode_eq_iversion_raw(inode, fattr->pre_change_attr)) { inode_set_iversion_raw(inode, fattr->change_attr); if (S_ISDIR(inode->i_mode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); - ret |= NFS_INO_INVALID_ATTR; } /* If we have atomic WCC data, we may update some attributes */ if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) && (fattr->valid & NFS_ATTR_FATTR_CTIME) && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - ret |= NFS_INO_INVALID_ATTR; } if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) @@ -1329,17 +1324,13 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); if (S_ISDIR(inode->i_mode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); - ret |= NFS_INO_INVALID_ATTR; } if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) && (fattr->valid & NFS_ATTR_FATTR_SIZE) && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) && !nfs_have_writebacks(inode)) { i_size_write(inode, nfs_size_to_loff_t(fattr->size)); - ret |= NFS_INO_INVALID_ATTR; } - - return ret; } /** @@ -1369,33 +1360,41 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (!nfs_file_has_buffered_writers(nfsi)) { /* Verify a few of the more important attributes */ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr)) - invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE; + invalid |= NFS_INO_INVALID_CHANGE + | NFS_INO_REVAL_PAGECACHE; if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) - invalid |= NFS_INO_INVALID_ATTR; + invalid |= NFS_INO_INVALID_MTIME; if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime)) - invalid |= NFS_INO_INVALID_ATTR; + invalid |= NFS_INO_INVALID_CTIME; if (fattr->valid & NFS_ATTR_FATTR_SIZE) { cur_size = i_size_read(inode); new_isize = nfs_size_to_loff_t(fattr->size); if (cur_size != new_isize) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + invalid |= NFS_INO_INVALID_SIZE + | NFS_INO_REVAL_PAGECACHE; } } /* Have any file permissions changed? */ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) - invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER; if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid)) - invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER; if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid)) - invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER; /* Has the link count changed? */ if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) - invalid |= NFS_INO_INVALID_ATTR; + invalid |= NFS_INO_INVALID_OTHER; if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; @@ -1597,10 +1596,9 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) } EXPORT_SYMBOL_GPL(nfs_refresh_inode); -static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +static int nfs_post_op_update_inode_locked(struct inode *inode, + struct nfs_fattr *fattr, unsigned int invalid) { - unsigned long invalid = NFS_INO_INVALID_ATTR; - if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; nfs_set_cache_invalid(inode, invalid); @@ -1629,7 +1627,9 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) spin_lock(&inode->i_lock); nfs_fattr_set_barrier(fattr); - status = nfs_post_op_update_inode_locked(inode, fattr); + status = nfs_post_op_update_inode_locked(inode, fattr, + NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME); spin_unlock(&inode->i_lock); return status; @@ -1681,7 +1681,10 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa fattr->valid |= NFS_ATTR_FATTR_PRESIZE; } out_noforce: - status = nfs_post_op_update_inode_locked(inode, fattr); + status = nfs_post_op_update_inode_locked(inode, fattr, + NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME + | NFS_INO_INVALID_MTIME); return status; } @@ -1789,7 +1792,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | NFS_INO_REVAL_PAGECACHE); /* Do atomic weak cache consistency updates */ - invalid |= nfs_wcc_update_inode(inode, fattr); + nfs_wcc_update_inode(inode, fattr); if (pnfs_layoutcommit_outstanding(inode)) { nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR; @@ -1803,17 +1806,25 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_sb->s_id, inode->i_ino); /* Could it be a race with writeback? */ if (!have_writers) { - invalid |= NFS_INO_INVALID_ATTR + invalid |= NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; + /* Force revalidate of all attributes */ + save_cache_validity |= NFS_INO_INVALID_CTIME + | NFS_INO_INVALID_MTIME + | NFS_INO_INVALID_SIZE + | NFS_INO_INVALID_OTHER; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); } inode_set_iversion_raw(inode, fattr->change_attr); } } else { - nfsi->cache_validity |= save_cache_validity; + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_CHANGE + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); cache_revalidated = false; } @@ -1821,7 +1832,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); } else if (server->caps & NFS_CAP_MTIME) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR + (NFS_INO_INVALID_MTIME | NFS_INO_REVAL_FORCED); cache_revalidated = false; } @@ -1830,7 +1841,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); } else if (server->caps & NFS_CAP_CTIME) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR + (NFS_INO_INVALID_CTIME | NFS_INO_REVAL_FORCED); cache_revalidated = false; } @@ -1845,7 +1856,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (!nfs_have_writebacks(inode) || new_isize > cur_isize) { i_size_write(inode, new_isize); if (!have_writers) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + invalid |= NFS_INO_INVALID_DATA; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", @@ -1856,7 +1867,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } } else { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR + (NFS_INO_INVALID_SIZE | NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); cache_revalidated = false; @@ -1877,55 +1888,61 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) umode_t newmode = inode->i_mode & S_IFMT; newmode |= fattr->mode & S_IALLUGO; inode->i_mode = newmode; - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER; } } else if (server->caps & NFS_CAP_MODE) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS + (NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_OWNER) { if (!uid_eq(inode->i_uid, fattr->uid)) { - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER; inode->i_uid = fattr->uid; } } else if (server->caps & NFS_CAP_OWNER) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS + (NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_GROUP) { if (!gid_eq(inode->i_gid, fattr->gid)) { - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER; inode->i_gid = fattr->gid; } } else if (server->caps & NFS_CAP_OWNER_GROUP) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS + (NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL + | NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_NLINK) { if (inode->i_nlink != fattr->nlink) { - invalid |= NFS_INO_INVALID_ATTR; + invalid |= NFS_INO_INVALID_OTHER; if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; set_nlink(inode, fattr->nlink); } } else if (server->caps & NFS_CAP_NLINK) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATTR + (NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); cache_revalidated = false; } @@ -1942,6 +1959,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Update attrtimeo value if we're out of the unstable period */ if (invalid & NFS_INO_INVALID_ATTR) { + invalid &= ~NFS_INO_INVALID_ATTR; nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; @@ -1962,10 +1980,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfsi->attr_gencount = fattr->gencount; } - /* Don't declare attrcache up to date if there were no attrs! */ - if (cache_revalidated) - invalid &= ~NFS_INO_INVALID_ATTR; - /* Don't invalidate the data if we were to blame */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 7327930..eadf1ab 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -138,8 +138,11 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, msg.rpc_cred = nfs_file_cred(sattr->ia_file); nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); - if (status == 0) + if (status == 0) { + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); nfs_setattr_update_inode(inode, sattr, fattr); + } dprintk("NFS reply setattr: %d\n", status); return status; } @@ -383,11 +386,11 @@ out: } static int -nfs3_proc_remove(struct inode *dir, const struct qstr *name) +nfs3_proc_remove(struct inode *dir, struct dentry *dentry) { struct nfs_removeargs arg = { .fh = NFS_FH(dir), - .name = *name, + .name = dentry->d_name, }; struct nfs_removeres res; struct rpc_message msg = { @@ -397,7 +400,7 @@ nfs3_proc_remove(struct inode *dir, const struct qstr *name) }; int status = -ENOMEM; - dprintk("NFS call remove %s\n", name->name); + dprintk("NFS call remove %pd2\n", dentry); res.dir_attr = nfs_alloc_fattr(); if (res.dir_attr == NULL) goto out; @@ -411,7 +414,7 @@ out: } static void -nfs3_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +nfs3_proc_unlink_setup(struct rpc_message *msg, struct dentry *dentry) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_REMOVE]; } @@ -433,7 +436,9 @@ nfs3_proc_unlink_done(struct rpc_task *task, struct inode *dir) } static void -nfs3_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +nfs3_proc_rename_setup(struct rpc_message *msg, + struct dentry *old_dentry, + struct dentry *new_dentry) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_RENAME]; } @@ -908,12 +913,6 @@ static int nfs3_have_delegation(struct inode *inode, fmode_t flags) return 0; } -static int nfs3_return_delegation(struct inode *inode) -{ - nfs_wb_all(inode); - return 0; -} - static const struct inode_operations nfs3_dir_inode_operations = { .create = nfs_create, .lookup = nfs_lookup, @@ -990,7 +989,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .clear_acl_cache = forget_all_cached_acls, .close_context = nfs_close_context, .have_delegation = nfs3_have_delegation, - .return_delegation = nfs3_return_delegation, .alloc_client = nfs_alloc_client, .init_client = nfs_init_client, .free_client = nfs_free_client, diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 6cd33bd..09ee36d 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1997,6 +1997,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_entry old = *entry; __be32 *p; int error; + u64 new_cookie; p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) @@ -2019,8 +2020,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (unlikely(error)) return error; - entry->prev_cookie = entry->cookie; - error = decode_cookie3(xdr, &entry->cookie); + error = decode_cookie3(xdr, &new_cookie); if (unlikely(error)) return error; @@ -2054,6 +2054,9 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, zero_nfs_fh3(entry->fh); } + entry->prev_cookie = entry->cookie; + entry->cookie = new_cookie; + return 0; out_overflow: diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 47f3c27..b71757e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1045,7 +1045,9 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo, struct nfs_inode *nfsi = NFS_I(dir); spin_lock(&dir->i_lock); - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + nfsi->cache_validity |= NFS_INO_INVALID_CTIME + | NFS_INO_INVALID_MTIME + | NFS_INO_INVALID_DATA; if (cinfo->atomic && cinfo->before == inode_peek_iversion_raw(dir)) { nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE; nfsi->attrtimeo_timestamp = jiffies; @@ -1669,6 +1671,7 @@ static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmo { struct nfs_delegation *delegation; + fmode &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); if (delegation == NULL || (delegation->type & fmode) == fmode) { @@ -1751,12 +1754,16 @@ nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state) } if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0) nfs_inode_set_delegation(state->inode, - data->owner->so_cred, - &data->o_res); + data->owner->so_cred, + data->o_res.delegation_type, + &data->o_res.delegation, + data->o_res.pagemod_limit); else nfs_inode_reclaim_delegation(state->inode, - data->owner->so_cred, - &data->o_res); + data->owner->so_cred, + data->o_res.delegation_type, + &data->o_res.delegation, + data->o_res.pagemod_limit); } /* @@ -2743,27 +2750,40 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st * fields corresponding to attributes that were used to store the verifier. * Make sure we clobber those fields in the later setattr call */ -static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, +static unsigned nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr, struct nfs4_label **label) { - const u32 *attrset = opendata->o_res.attrset; + const __u32 *bitmask = opendata->o_arg.server->exclcreat_bitmask; + __u32 attrset[3]; + unsigned ret; + unsigned i; - if ((attrset[1] & FATTR4_WORD1_TIME_ACCESS) && - !(sattr->ia_valid & ATTR_ATIME_SET)) - sattr->ia_valid |= ATTR_ATIME; + for (i = 0; i < ARRAY_SIZE(attrset); i++) { + attrset[i] = opendata->o_res.attrset[i]; + if (opendata->o_arg.createmode == NFS4_CREATE_EXCLUSIVE4_1) + attrset[i] &= ~bitmask[i]; + } + + ret = (opendata->o_arg.createmode == NFS4_CREATE_EXCLUSIVE) ? + sattr->ia_valid : 0; - if ((attrset[1] & FATTR4_WORD1_TIME_MODIFY) && - !(sattr->ia_valid & ATTR_MTIME_SET)) - sattr->ia_valid |= ATTR_MTIME; + if ((attrset[1] & (FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET))) { + if (sattr->ia_valid & ATTR_ATIME_SET) + ret |= ATTR_ATIME_SET; + else + ret |= ATTR_ATIME; + } - /* Except MODE, it seems harmless of setting twice. */ - if (opendata->o_arg.createmode != NFS4_CREATE_EXCLUSIVE && - (attrset[1] & FATTR4_WORD1_MODE || - attrset[2] & FATTR4_WORD2_MODE_UMASK)) - sattr->ia_valid &= ~ATTR_MODE; + if ((attrset[1] & (FATTR4_WORD1_TIME_MODIFY|FATTR4_WORD1_TIME_MODIFY_SET))) { + if (sattr->ia_valid & ATTR_MTIME_SET) + ret |= ATTR_MTIME_SET; + else + ret |= ATTR_MTIME; + } - if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL) + if (!(attrset[2] & FATTR4_WORD2_SECURITY_LABEL)) *label = NULL; + return ret; } static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, @@ -2892,12 +2912,15 @@ static int _nfs4_do_open(struct inode *dir, if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) && (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { - nfs4_exclusive_attrset(opendata, sattr, &label); + unsigned attrs = nfs4_exclusive_attrset(opendata, sattr, &label); /* * send create attributes which was not set by open * with an extra setattr. */ - if (sattr->ia_valid & NFS4_VALID_ATTRS) { + if (attrs || label) { + unsigned ia_old = sattr->ia_valid; + + sattr->ia_valid = attrs; nfs_fattr_init(opendata->o_res.f_attr); status = nfs4_do_setattr(state->inode, cred, opendata->o_res.f_attr, sattr, @@ -2907,6 +2930,7 @@ static int _nfs4_do_open(struct inode *dir, opendata->o_res.f_attr); nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); } + sattr->ia_valid = ia_old; } } if (opened && opendata->file_created) @@ -3874,6 +3898,10 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, if (IS_ERR(label)) return PTR_ERR(label); + /* Return any delegations if we're going to change ACLs */ + if ((sattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) + nfs4_inode_make_writeable(inode); + status = nfs4_do_setattr(inode, cred, fattr, sattr, ctx, NULL, label); if (status == 0) { nfs_setattr_update_inode(inode, sattr, fattr); @@ -4048,7 +4076,6 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry struct nfs_server *server = NFS_SERVER(inode); struct nfs4_accessargs args = { .fh = NFS_FH(inode), - .bitmask = server->cache_consistency_bitmask, .access = entry->mask, }; struct nfs4_accessres res = { @@ -4062,14 +4089,18 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry }; int status = 0; - res.fattr = nfs_alloc_fattr(); - if (res.fattr == NULL) - return -ENOMEM; + if (!nfs_have_delegated_attributes(inode)) { + res.fattr = nfs_alloc_fattr(); + if (res.fattr == NULL) + return -ENOMEM; + args.bitmask = server->cache_consistency_bitmask; + } status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (!status) { nfs_access_set_mask(entry, res.access); - nfs_refresh_inode(inode, res.fattr); + if (res.fattr) + nfs_refresh_inode(inode, res.fattr); } nfs_free_fattr(res.fattr); return status; @@ -4199,10 +4230,32 @@ static int _nfs4_proc_remove(struct inode *dir, const struct qstr *name) return status; } -static int nfs4_proc_remove(struct inode *dir, const struct qstr *name) +static int nfs4_proc_remove(struct inode *dir, struct dentry *dentry) +{ + struct nfs4_exception exception = { }; + struct inode *inode = d_inode(dentry); + int err; + + if (inode) { + if (inode->i_nlink == 1) + nfs4_inode_return_delegation(inode); + else + nfs4_inode_make_writeable(inode); + } + do { + err = _nfs4_proc_remove(dir, &dentry->d_name); + trace_nfs4_remove(dir, &dentry->d_name, err); + err = nfs4_handle_exception(NFS_SERVER(dir), err, + &exception); + } while (exception.retry); + return err; +} + +static int nfs4_proc_rmdir(struct inode *dir, const struct qstr *name) { struct nfs4_exception exception = { }; int err; + do { err = _nfs4_proc_remove(dir, name); trace_nfs4_remove(dir, name, err); @@ -4212,17 +4265,20 @@ static int nfs4_proc_remove(struct inode *dir, const struct qstr *name) return err; } -static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct dentry *dentry) { - struct nfs_server *server = NFS_SERVER(dir); struct nfs_removeargs *args = msg->rpc_argp; struct nfs_removeres *res = msg->rpc_resp; + struct inode *inode = d_inode(dentry); - res->server = server; + res->server = NFS_SB(dentry->d_sb); msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE]; nfs4_init_sequence(&args->seq_args, &res->seq_res, 1); nfs_fattr_init(res->dir_attr); + + if (inode) + nfs4_inode_return_delegation(inode); } static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) @@ -4248,14 +4304,21 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) return 1; } -static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +static void nfs4_proc_rename_setup(struct rpc_message *msg, + struct dentry *old_dentry, + struct dentry *new_dentry) { - struct nfs_server *server = NFS_SERVER(dir); struct nfs_renameargs *arg = msg->rpc_argp; struct nfs_renameres *res = msg->rpc_resp; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + if (old_inode) + nfs4_inode_make_writeable(old_inode); + if (new_inode) + nfs4_inode_return_delegation(new_inode); msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME]; - res->server = server; + res->server = NFS_SB(old_dentry->d_sb); nfs4_init_sequence(&arg->seq_args, &res->seq_res, 1); } @@ -4317,6 +4380,8 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct } arg.bitmask = nfs4_bitmask(server, res.label); + nfs4_inode_make_writeable(inode); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { update_changeattr(dir, &res.cinfo, res.fattr->time_start); @@ -5310,7 +5375,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl i = buf_to_pages_noslab(buf, buflen, arg.acl_pages); if (i < 0) return i; - nfs4_inode_return_delegation(inode); + nfs4_inode_make_writeable(inode); ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); /* @@ -5325,7 +5390,8 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl * so mark the attribute cache invalid. */ spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE + | NFS_INO_INVALID_CTIME; spin_unlock(&inode->i_lock); nfs_access_zap_cache(inode); nfs_zap_acl_cache(inode); @@ -6621,22 +6687,24 @@ static int nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key) { int ret; - struct cb_notify_lock_args *cbnl = key; struct nfs4_lock_waiter *waiter = wait->private; - struct nfs_lowner *lowner = &cbnl->cbnl_owner, - *wowner = waiter->owner; - /* Only wake if the callback was for the same owner */ - if (lowner->clientid != wowner->clientid || - lowner->id != wowner->id || - lowner->s_dev != wowner->s_dev) - return 0; + /* NULL key means to wake up everyone */ + if (key) { + struct cb_notify_lock_args *cbnl = key; + struct nfs_lowner *lowner = &cbnl->cbnl_owner, + *wowner = waiter->owner; - /* Make sure it's for the right inode */ - if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh)) - return 0; + /* Only wake if the callback was for the same owner. */ + if (lowner->id != wowner->id || lowner->s_dev != wowner->s_dev) + return 0; - waiter->notified = true; + /* Make sure it's for the right inode */ + if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh)) + return 0; + + waiter->notified = true; + } /* override "private" so we can use default_wake_function */ wait->private = waiter->task; @@ -6673,6 +6741,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) add_wait_queue(q, &wait); while(!signalled()) { + waiter.notified = false; status = nfs4_proc_setlk(state, cmd, request); if ((status != -EAGAIN) || IS_SETLK(cmd)) break; @@ -8414,6 +8483,8 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf { switch(task->tk_status) { case 0: + wake_up_all(&clp->cl_lock_waitq); + /* Fallthrough */ case -NFS4ERR_COMPLETE_ALREADY: case -NFS4ERR_WRONG_CRED: /* What to do here? */ break; @@ -9593,7 +9664,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .link = nfs4_proc_link, .symlink = nfs4_proc_symlink, .mkdir = nfs4_proc_mkdir, - .rmdir = nfs4_proc_remove, + .rmdir = nfs4_proc_rmdir, .readdir = nfs4_proc_readdir, .mknod = nfs4_proc_mknod, .statfs = nfs4_proc_statfs, @@ -9614,7 +9685,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .close_context = nfs4_close_context, .open_context = nfs4_atomic_open, .have_delegation = nfs4_have_delegation, - .return_delegation = nfs4_inode_return_delegation, .alloc_client = nfs4_alloc_client, .init_client = nfs4_init_client, .free_client = nfs4_free_client, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 91a4d4e..c10a422 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -428,7 +428,6 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new) struct rb_node **p = &server->state_owners.rb_node, *parent = NULL; struct nfs4_state_owner *sp; - int err; while (*p != NULL) { parent = *p; @@ -445,9 +444,6 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new) return sp; } } - err = ida_get_new(&server->openowner_id, &new->so_seqid.owner_id); - if (err) - return ERR_PTR(err); rb_link_node(&new->so_server_node, parent, p); rb_insert_color(&new->so_server_node, &server->state_owners); return new; @@ -460,7 +456,6 @@ nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp) if (!RB_EMPTY_NODE(&sp->so_server_node)) rb_erase(&sp->so_server_node, &server->state_owners); - ida_remove(&server->openowner_id, sp->so_seqid.owner_id); } static void @@ -495,6 +490,12 @@ nfs4_alloc_state_owner(struct nfs_server *server, sp = kzalloc(sizeof(*sp), gfp_flags); if (!sp) return NULL; + sp->so_seqid.owner_id = ida_simple_get(&server->openowner_id, 0, 0, + gfp_flags); + if (sp->so_seqid.owner_id < 0) { + kfree(sp); + return NULL; + } sp->so_server = server; sp->so_cred = get_rpccred(cred); spin_lock_init(&sp->so_lock); @@ -526,6 +527,7 @@ static void nfs4_free_state_owner(struct nfs4_state_owner *sp) { nfs4_destroy_seqid_counter(&sp->so_seqid); put_rpccred(sp->so_cred); + ida_simple_remove(&sp->so_server->openowner_id, sp->so_seqid.owner_id); kfree(sp); } @@ -576,13 +578,9 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, new = nfs4_alloc_state_owner(server, cred, gfp_flags); if (new == NULL) goto out; - do { - if (ida_pre_get(&server->openowner_id, gfp_flags) == 0) - break; - spin_lock(&clp->cl_lock); - sp = nfs4_insert_state_owner_locked(new); - spin_unlock(&clp->cl_lock); - } while (sp == ERR_PTR(-EAGAIN)); + spin_lock(&clp->cl_lock); + sp = nfs4_insert_state_owner_locked(new); + spin_unlock(&clp->cl_lock); if (sp != new) nfs4_free_state_owner(new); out: diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index b993ad2..9b73920 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -98,6 +98,7 @@ static int nfs4_stat_to_errno(int); ((3+NFS4_FHSIZE) >> 2)) #define nfs4_fattr_bitmap_maxsz 4 #define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) +#define nfstime4_maxsz (3) #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) #define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) @@ -112,7 +113,8 @@ static int nfs4_stat_to_errno(int); #define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8) /* This is based on getfattr, which uses the most attributes: */ #define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \ - 3 + 3 + 3 + nfs4_owner_maxsz + \ + 3*nfstime4_maxsz + \ + nfs4_owner_maxsz + \ nfs4_group_maxsz + nfs4_label_maxsz + \ decode_mdsthreshold_maxsz)) #define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \ @@ -123,7 +125,8 @@ static int nfs4_stat_to_errno(int); nfs4_owner_maxsz + \ nfs4_group_maxsz + \ nfs4_label_maxsz + \ - 4 + 4) + 1 + nfstime4_maxsz + \ + 1 + nfstime4_maxsz) #define encode_savefh_maxsz (op_encode_hdr_maxsz) #define decode_savefh_maxsz (op_decode_hdr_maxsz) #define encode_restorefh_maxsz (op_encode_hdr_maxsz) @@ -957,6 +960,35 @@ static void encode_uint64(struct xdr_stream *xdr, u64 n) WARN_ON_ONCE(xdr_stream_encode_u64(xdr, n) < 0); } +static ssize_t xdr_encode_bitmap4(struct xdr_stream *xdr, + const __u32 *bitmap, size_t len) +{ + ssize_t ret; + + /* Trim empty words */ + while (len > 0 && bitmap[len-1] == 0) + len--; + ret = xdr_stream_encode_uint32_array(xdr, bitmap, len); + if (WARN_ON_ONCE(ret < 0)) + return ret; + return len; +} + +static size_t mask_bitmap4(const __u32 *bitmap, const __u32 *mask, + __u32 *res, size_t len) +{ + size_t i; + __u32 tmp; + + while (len > 0 && (bitmap[len-1] == 0 || mask[len-1] == 0)) + len--; + for (i = len; i-- > 0;) { + tmp = bitmap[i] & mask[i]; + res[i] = tmp; + } + return len; +} + static void encode_nfs4_seqid(struct xdr_stream *xdr, const struct nfs_seqid *seqid) { @@ -1011,6 +1043,14 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE); } +static __be32 * +xdr_encode_nfstime4(__be32 *p, const struct timespec *t) +{ + p = xdr_encode_hyper(p, (__s64)t->tv_sec); + *p++ = cpu_to_be32(t->tv_nsec); + return p; +} + static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs4_label *label, const umode_t *umask, @@ -1022,9 +1062,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, int owner_namelen = 0; int owner_grouplen = 0; __be32 *p; - unsigned i; uint32_t len = 0; - uint32_t bmval_len; uint32_t bmval[3] = { 0 }; /* @@ -1072,7 +1110,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, if (attrmask[1] & FATTR4_WORD1_TIME_ACCESS_SET) { if (iap->ia_valid & ATTR_ATIME_SET) { bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; - len += 16; + len += 4 + (nfstime4_maxsz << 2); } else if (iap->ia_valid & ATTR_ATIME) { bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; len += 4; @@ -1081,7 +1119,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, if (attrmask[1] & FATTR4_WORD1_TIME_MODIFY_SET) { if (iap->ia_valid & ATTR_MTIME_SET) { bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; - len += 16; + len += 4 + (nfstime4_maxsz << 2); } else if (iap->ia_valid & ATTR_MTIME) { bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; len += 4; @@ -1093,19 +1131,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, bmval[2] |= FATTR4_WORD2_SECURITY_LABEL; } - if (bmval[2] != 0) - bmval_len = 3; - else if (bmval[1] != 0) - bmval_len = 2; - else - bmval_len = 1; - - p = reserve_space(xdr, 4 + (bmval_len << 2) + 4 + len); - - *p++ = cpu_to_be32(bmval_len); - for (i = 0; i < bmval_len; i++) - *p++ = cpu_to_be32(bmval[i]); - *p++ = cpu_to_be32(len); + xdr_encode_bitmap4(xdr, bmval, ARRAY_SIZE(bmval)); + xdr_stream_encode_opaque_inline(xdr, (void **)&p, len); if (bmval[0] & FATTR4_WORD0_SIZE) p = xdr_encode_hyper(p, iap->ia_size); @@ -1118,16 +1145,14 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { if (iap->ia_valid & ATTR_ATIME_SET) { *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec); - *p++ = cpu_to_be32(iap->ia_atime.tv_nsec); + p = xdr_encode_nfstime4(p, &iap->ia_atime); } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { if (iap->ia_valid & ATTR_MTIME_SET) { *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec); - *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); + p = xdr_encode_nfstime4(p, &iap->ia_mtime); } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } @@ -1199,85 +1224,45 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * create->server, create->server->attr_bitmask); } -static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) -{ - __be32 *p; - - encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); - p = reserve_space(xdr, 8); - *p++ = cpu_to_be32(1); - *p = cpu_to_be32(bitmap); -} - -static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr) -{ - __be32 *p; - - encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); - p = reserve_space(xdr, 12); - *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(bm0); - *p = cpu_to_be32(bm1); -} - -static void -encode_getattr_three(struct xdr_stream *xdr, - uint32_t bm0, uint32_t bm1, uint32_t bm2, - struct compound_hdr *hdr) +static void encode_getattr(struct xdr_stream *xdr, + const __u32 *bitmap, const __u32 *mask, size_t len, + struct compound_hdr *hdr) { - __be32 *p; + __u32 masked_bitmap[nfs4_fattr_bitmap_maxsz]; encode_op_hdr(xdr, OP_GETATTR, decode_getattr_maxsz, hdr); - if (bm2) { - p = reserve_space(xdr, 16); - *p++ = cpu_to_be32(3); - *p++ = cpu_to_be32(bm0); - *p++ = cpu_to_be32(bm1); - *p = cpu_to_be32(bm2); - } else if (bm1) { - p = reserve_space(xdr, 12); - *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(bm0); - *p = cpu_to_be32(bm1); - } else { - p = reserve_space(xdr, 8); - *p++ = cpu_to_be32(1); - *p = cpu_to_be32(bm0); + if (mask) { + if (WARN_ON_ONCE(len > ARRAY_SIZE(masked_bitmap))) + len = ARRAY_SIZE(masked_bitmap); + len = mask_bitmap4(bitmap, mask, masked_bitmap, len); + bitmap = masked_bitmap; } + xdr_encode_bitmap4(xdr, bitmap, len); } static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - encode_getattr_three(xdr, bitmask[0] & nfs4_fattr_bitmap[0], - bitmask[1] & nfs4_fattr_bitmap[1], - bitmask[2] & nfs4_fattr_bitmap[2], - hdr); + encode_getattr(xdr, nfs4_fattr_bitmap, bitmask, + ARRAY_SIZE(nfs4_fattr_bitmap), hdr); } static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask, const u32 *open_bitmap, struct compound_hdr *hdr) { - encode_getattr_three(xdr, - bitmask[0] & open_bitmap[0], - bitmask[1] & open_bitmap[1], - bitmask[2] & open_bitmap[2], - hdr); + encode_getattr(xdr, open_bitmap, bitmask, 3, hdr); } static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - encode_getattr_three(xdr, - bitmask[0] & nfs4_fsinfo_bitmap[0], - bitmask[1] & nfs4_fsinfo_bitmap[1], - bitmask[2] & nfs4_fsinfo_bitmap[2], - hdr); + encode_getattr(xdr, nfs4_fsinfo_bitmap, bitmask, + ARRAY_SIZE(nfs4_fsinfo_bitmap), hdr); } static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - encode_getattr_two(xdr, bitmask[0] & nfs4_fs_locations_bitmap[0], - bitmask[1] & nfs4_fs_locations_bitmap[1], hdr); + encode_getattr(xdr, nfs4_fs_locations_bitmap, bitmask, + ARRAY_SIZE(nfs4_fs_locations_bitmap), hdr); } static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr) @@ -2116,7 +2101,8 @@ static void nfs4_xdr_enc_access(struct rpc_rqst *req, struct xdr_stream *xdr, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); encode_access(xdr, args->access, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); + if (args->bitmask) + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -2558,13 +2544,17 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; + const __u32 nfs4_acl_bitmap[1] = { + [0] = FATTR4_WORD0_ACL, + }; uint32_t replen; encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); replen = hdr.replen + op_decode_hdr_maxsz; - encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); + encode_getattr(xdr, nfs4_acl_bitmap, NULL, + ARRAY_SIZE(nfs4_acl_bitmap), &hdr); xdr_inline_pages(&req->rq_rcv_buf, replen << 2, args->acl_pages, 0, args->acl_len); @@ -2643,8 +2633,8 @@ static void nfs4_xdr_enc_pathconf(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); - encode_getattr_one(xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0], - &hdr); + encode_getattr(xdr, nfs4_pathconf_bitmap, args->bitmask, + ARRAY_SIZE(nfs4_pathconf_bitmap), &hdr); encode_nops(&hdr); } @@ -2662,8 +2652,8 @@ static void nfs4_xdr_enc_statfs(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); - encode_getattr_two(xdr, args->bitmask[0] & nfs4_statfs_bitmap[0], - args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr); + encode_getattr(xdr, nfs4_statfs_bitmap, args->bitmask, + ARRAY_SIZE(nfs4_statfs_bitmap), &hdr); encode_nops(&hdr); } @@ -2683,7 +2673,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fhandle, &hdr); - encode_getattr_three(xdr, bitmask[0], bitmask[1], bitmask[2], &hdr); + encode_getattr(xdr, bitmask, NULL, 3, &hdr); encode_nops(&hdr); } @@ -3217,34 +3207,27 @@ static int decode_ace(struct xdr_stream *xdr, void *ace) return -EIO; } -static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) +static ssize_t +decode_bitmap4(struct xdr_stream *xdr, uint32_t *bitmap, size_t sz) { - uint32_t bmlen; - __be32 *p; - - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - bmlen = be32_to_cpup(p); + ssize_t ret; - bitmap[0] = bitmap[1] = bitmap[2] = 0; - p = xdr_inline_decode(xdr, (bmlen << 2)); - if (unlikely(!p)) - goto out_overflow; - if (bmlen > 0) { - bitmap[0] = be32_to_cpup(p++); - if (bmlen > 1) { - bitmap[1] = be32_to_cpup(p++); - if (bmlen > 2) - bitmap[2] = be32_to_cpup(p); - } - } - return 0; -out_overflow: + ret = xdr_stream_decode_uint32_array(xdr, bitmap, sz); + if (likely(ret >= 0)) + return ret; + if (ret == -EMSGSIZE) + return sz; print_overflow_msg(__func__, xdr); return -EIO; } +static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) +{ + ssize_t ret; + ret = decode_bitmap4(xdr, bitmap, 3); + return ret < 0 ? ret : 0; +} + static int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, unsigned int *savep) { __be32 *p; @@ -3980,7 +3963,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, bitmap[1] &= ~FATTR4_WORD1_OWNER; if (owner_name != NULL) { - len = decode_nfs4_string(xdr, owner_name, GFP_NOWAIT); + len = decode_nfs4_string(xdr, owner_name, GFP_NOIO); if (len <= 0) goto out; dprintk("%s: name=%s\n", __func__, owner_name->data); @@ -4015,7 +3998,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; if (group_name != NULL) { - len = decode_nfs4_string(xdr, group_name, GFP_NOWAIT); + len = decode_nfs4_string(xdr, group_name, GFP_NOIO); if (len <= 0) goto out; dprintk("%s: name=%s\n", __func__, group_name->data); @@ -4155,19 +4138,25 @@ out_overflow: return -EIO; } +static __be32 * +xdr_decode_nfstime4(__be32 *p, struct timespec *t) +{ + __u64 sec; + + p = xdr_decode_hyper(p, &sec); + t-> tv_sec = (time_t)sec; + t->tv_nsec = be32_to_cpup(p++); + return p; +} + static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) { __be32 *p; - uint64_t sec; - uint32_t nsec; - p = xdr_inline_decode(xdr, 12); + p = xdr_inline_decode(xdr, nfstime4_maxsz << 2); if (unlikely(!p)) goto out_overflow; - p = xdr_decode_hyper(p, &sec); - nsec = be32_to_cpup(p); - time->tv_sec = (time_t)sec; - time->tv_nsec = (long)nsec; + xdr_decode_nfstime4(p, time); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -5470,21 +5459,13 @@ decode_savefh(struct xdr_stream *xdr) static int decode_setattr(struct xdr_stream *xdr) { - __be32 *p; - uint32_t bmlen; int status; status = decode_op_hdr(xdr, OP_SETATTR); if (status) return status; - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - bmlen = be32_to_cpup(p); - p = xdr_inline_decode(xdr, bmlen << 2); - if (likely(p)) + if (decode_bitmap4(xdr, NULL, 0) >= 0) return 0; -out_overflow: print_overflow_msg(__func__, xdr); return -EIO; } @@ -6255,7 +6236,8 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_access(xdr, &res->supported, &res->access); if (status != 0) goto out; - decode_getfattr(xdr, res->fattr, res->server); + if (res->fattr) + decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -7535,6 +7517,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, unsigned int savep; uint32_t bitmap[3] = {0}; uint32_t len; + uint64_t new_cookie; __be32 *p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; @@ -7551,8 +7534,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) goto out_overflow; - entry->prev_cookie = entry->cookie; - p = xdr_decode_hyper(p, &entry->cookie); + p = xdr_decode_hyper(p, &new_cookie); entry->len = be32_to_cpup(p); p = xdr_inline_decode(xdr, entry->len); @@ -7586,6 +7568,9 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE) entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); + entry->prev_cookie = entry->cookie; + entry->cookie = new_cookie; + return 0; out_overflow: diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index f7fd919..4e93d63 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -300,11 +300,11 @@ out: } static int -nfs_proc_remove(struct inode *dir, const struct qstr *name) +nfs_proc_remove(struct inode *dir, struct dentry *dentry) { struct nfs_removeargs arg = { .fh = NFS_FH(dir), - .name = *name, + .name = dentry->d_name, }; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_REMOVE], @@ -312,7 +312,7 @@ nfs_proc_remove(struct inode *dir, const struct qstr *name) }; int status; - dprintk("NFS call remove %s\n", name->name); + dprintk("NFS call remove %pd2\n",dentry); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); @@ -321,7 +321,7 @@ nfs_proc_remove(struct inode *dir, const struct qstr *name) } static void -nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) +nfs_proc_unlink_setup(struct rpc_message *msg, struct dentry *dentry) { msg->rpc_proc = &nfs_procedures[NFSPROC_REMOVE]; } @@ -338,7 +338,9 @@ static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) } static void -nfs_proc_rename_setup(struct rpc_message *msg, struct inode *dir) +nfs_proc_rename_setup(struct rpc_message *msg, + struct dentry *old_dentry, + struct dentry *new_dentry) { msg->rpc_proc = &nfs_procedures[NFSPROC_RENAME]; } @@ -671,12 +673,6 @@ static int nfs_have_delegation(struct inode *inode, fmode_t flags) return 0; } -static int nfs_return_delegation(struct inode *inode) -{ - nfs_wb_all(inode); - return 0; -} - static const struct inode_operations nfs_dir_inode_operations = { .create = nfs_create, .lookup = nfs_lookup, @@ -741,7 +737,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .lock_check_bounds = nfs_lock_check_bounds, .close_context = nfs_close_context, .have_delegation = nfs_have_delegation, - .return_delegation = nfs_return_delegation, .alloc_client = nfs_alloc_client, .init_client = nfs_init_client, .free_client = nfs_free_client, diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 630b4a3..bf54fc9 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -105,7 +105,7 @@ static void nfs_do_call_unlink(struct nfs_unlinkdata *data) data->args.fh = NFS_FH(dir); nfs_fattr_init(data->res.dir_attr); - NFS_PROTO(dir)->unlink_setup(&msg, dir); + NFS_PROTO(dir)->unlink_setup(&msg, data->dentry); task_setup_data.rpc_client = NFS_CLIENT(dir); task = rpc_run_task(&task_setup_data); @@ -386,7 +386,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, nfs_sb_active(old_dir->i_sb); - NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dir); + NFS_PROTO(data->old_dir)->rename_setup(&msg, old_dentry, new_dentry); return rpc_run_task(&task_setup_data); } @@ -463,9 +463,6 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) fileid = NFS_FILEID(d_inode(dentry)); - /* Return delegation in anticipation of the rename */ - NFS_PROTO(d_inode(dentry))->return_delegation(d_inode(dentry)); - sdentry = NULL; do { int slen; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 6579f3b..0193053 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -231,6 +231,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c if (i_size >= end) goto out; i_size_write(inode, end); + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); out: spin_unlock(&inode->i_lock); @@ -1562,8 +1563,11 @@ static int nfs_writeback_done(struct rpc_task *task, } /* Deal with the suid/sgid bit corner case */ - if (nfs_should_remove_suid(inode)) - nfs_mark_for_revalidate(inode); + if (nfs_should_remove_suid(inode)) { + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_OTHER; + spin_unlock(&inode->i_lock); + } return 0; } diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index d51e1bb..d94e803 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -92,7 +92,7 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, u32 event_mask, const void *data, int data_type) { - __u32 marks_mask, marks_ignored_mask; + __u32 marks_mask = 0, marks_ignored_mask = 0; const struct path *path = data; pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" @@ -108,24 +108,20 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, !d_can_lookup(path->dentry)) return false; - if (inode_mark && vfsmnt_mark) { - marks_mask = (vfsmnt_mark->mask | inode_mark->mask); - marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask); - } else if (inode_mark) { - /* - * if the event is for a child and this inode doesn't care about - * events on the child, don't send it! - */ - if ((event_mask & FS_EVENT_ON_CHILD) && - !(inode_mark->mask & FS_EVENT_ON_CHILD)) - return false; - marks_mask = inode_mark->mask; - marks_ignored_mask = inode_mark->ignored_mask; - } else if (vfsmnt_mark) { - marks_mask = vfsmnt_mark->mask; - marks_ignored_mask = vfsmnt_mark->ignored_mask; - } else { - BUG(); + /* + * if the event is for a child and this inode doesn't care about + * events on the child, don't send it! + */ + if (inode_mark && + (!(event_mask & FS_EVENT_ON_CHILD) || + (inode_mark->mask & FS_EVENT_ON_CHILD))) { + marks_mask |= inode_mark->mask; + marks_ignored_mask |= inode_mark->ignored_mask; + } + + if (vfsmnt_mark) { + marks_mask |= vfsmnt_mark->mask; + marks_ignored_mask |= vfsmnt_mark->ignored_mask; } if (d_is_dir(path->dentry) && diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 219b269..613ec7e 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -192,8 +192,9 @@ static int send_to_group(struct inode *to_tell, struct fsnotify_iter_info *iter_info) { struct fsnotify_group *group = NULL; - __u32 inode_test_mask = 0; - __u32 vfsmount_test_mask = 0; + __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); + __u32 marks_mask = 0; + __u32 marks_ignored_mask = 0; if (unlikely(!inode_mark && !vfsmount_mark)) { BUG(); @@ -213,29 +214,25 @@ static int send_to_group(struct inode *to_tell, /* does the inode mark tell us to do something? */ if (inode_mark) { group = inode_mark->group; - inode_test_mask = (mask & ~FS_EVENT_ON_CHILD); - inode_test_mask &= inode_mark->mask; - inode_test_mask &= ~inode_mark->ignored_mask; + marks_mask |= inode_mark->mask; + marks_ignored_mask |= inode_mark->ignored_mask; } /* does the vfsmount_mark tell us to do something? */ if (vfsmount_mark) { - vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD); group = vfsmount_mark->group; - vfsmount_test_mask &= vfsmount_mark->mask; - vfsmount_test_mask &= ~vfsmount_mark->ignored_mask; - if (inode_mark) - vfsmount_test_mask &= ~inode_mark->ignored_mask; + marks_mask |= vfsmount_mark->mask; + marks_ignored_mask |= vfsmount_mark->ignored_mask; } pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p" - " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" + " vfsmount_mark=%p marks_mask=%x marks_ignored_mask=%x" " data=%p data_is=%d cookie=%d\n", - __func__, group, to_tell, mask, inode_mark, - inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, + __func__, group, to_tell, mask, inode_mark, vfsmount_mark, + marks_mask, marks_ignored_mask, data, data_is, cookie); - if (!inode_test_mask && !vfsmount_test_mask) + if (!(test_mask & marks_mask & ~marks_ignored_mask)) return 0; return group->ops->handle_event(group, to_tell, inode_mark, diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 01c6b38..7869622 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4250,10 +4250,11 @@ out: static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, bool preserve) { - int error; + int error, had_lock; struct inode *inode = d_inode(old_dentry); struct buffer_head *old_bh = NULL; struct inode *new_orphan_inode = NULL; + struct ocfs2_lock_holder oh; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) return -EOPNOTSUPP; @@ -4295,6 +4296,14 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, goto out; } + had_lock = ocfs2_inode_lock_tracker(new_orphan_inode, NULL, 1, + &oh); + if (had_lock < 0) { + error = had_lock; + mlog_errno(error); + goto out; + } + /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { error = ocfs2_init_security_and_acl(dir, new_orphan_inode, @@ -4302,14 +4311,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, if (error) mlog_errno(error); } -out: if (!error) { error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, new_dentry); if (error) mlog_errno(error); } + ocfs2_inode_unlock_tracker(new_orphan_inode, 1, &oh, had_lock); +out: if (new_orphan_inode) { /* * We need to open_unlock the inode no matter whether we diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index 3ae5fdb..10796d3 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -579,6 +579,11 @@ void orangefs_kill_sb(struct super_block *sb) /* provided sb cleanup */ kill_anon_super(sb); + if (!ORANGEFS_SB(sb)) { + mutex_lock(&orangefs_request_mutex); + mutex_unlock(&orangefs_request_mutex); + return; + } /* * issue the unmount to userspace to tell it to remove the * dynamic mount info it has for this superblock diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index ce6ff5a..1703263 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -86,3 +86,20 @@ config OVERLAY_FS_NFS_EXPORT case basis with the "nfs_export=on" mount option. Say N unless you fully understand the consequences. + +config OVERLAY_FS_XINO_AUTO + bool "Overlayfs: auto enable inode number mapping" + default n + depends on OVERLAY_FS + help + If this config option is enabled then overlay filesystems will use + unused high bits in undelying filesystem inode numbers to map all + inodes to a unified address space. The mapped 64bit inode numbers + might not be compatible with applications that expect 32bit inodes. + + If compatibility with applications that expect 32bit inodes is not an + issue, then it is safe and recommended to say Y here. + + For more information, see Documentation/filesystems/overlayfs.txt + + If unsure, say N. diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index d855f50..8bede07 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -232,7 +232,7 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) return err; } -struct ovl_fh *ovl_encode_fh(struct dentry *real, bool is_upper) +struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) { struct ovl_fh *fh; int fh_type, fh_len, dwords; @@ -300,7 +300,7 @@ int ovl_set_origin(struct dentry *dentry, struct dentry *lower, * up and a pure upper inode. */ if (ovl_can_decode_fh(lower->d_sb)) { - fh = ovl_encode_fh(lower, false); + fh = ovl_encode_real_fh(lower, false); if (IS_ERR(fh)) return PTR_ERR(fh); } @@ -321,7 +321,7 @@ static int ovl_set_upper_fh(struct dentry *upper, struct dentry *index) const struct ovl_fh *fh; int err; - fh = ovl_encode_fh(upper, true); + fh = ovl_encode_real_fh(upper, true); if (IS_ERR(fh)) return PTR_ERR(fh); diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 87bd414..425a946 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -228,8 +228,8 @@ static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen) goto fail; /* Encode an upper or lower file handle */ - fh = ovl_encode_fh(enc_lower ? ovl_dentry_lower(dentry) : - ovl_dentry_upper(dentry), !enc_lower); + fh = ovl_encode_real_fh(enc_lower ? ovl_dentry_lower(dentry) : + ovl_dentry_upper(dentry), !enc_lower); err = PTR_ERR(fh); if (IS_ERR(fh)) goto fail; @@ -267,8 +267,8 @@ static int ovl_dentry_to_fh(struct dentry *dentry, u32 *fid, int *max_len) return OVL_FILEID; } -static int ovl_encode_inode_fh(struct inode *inode, u32 *fid, int *max_len, - struct inode *parent) +static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, + struct inode *parent) { struct dentry *dentry; int type; @@ -305,15 +305,12 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb, if (d_is_dir(upper ?: lower)) return ERR_PTR(-EIO); - inode = ovl_get_inode(sb, dget(upper), lower, index, !!lower); + inode = ovl_get_inode(sb, dget(upper), lowerpath, index, !!lower); if (IS_ERR(inode)) { dput(upper); return ERR_CAST(inode); } - if (index) - ovl_set_flag(OVL_INDEX, inode); - dentry = d_find_any_alias(inode); if (!dentry) { dentry = d_alloc_anon(inode->i_sb); @@ -685,7 +682,7 @@ static struct dentry *ovl_upper_fh_to_d(struct super_block *sb, if (!ofs->upper_mnt) return ERR_PTR(-EACCES); - upper = ovl_decode_fh(fh, ofs->upper_mnt); + upper = ovl_decode_real_fh(fh, ofs->upper_mnt, true); if (IS_ERR_OR_NULL(upper)) return upper; @@ -703,25 +700,39 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, struct ovl_path *stack = &origin; struct dentry *dentry = NULL; struct dentry *index = NULL; - struct inode *inode = NULL; - bool is_deleted = false; + struct inode *inode; int err; - /* First lookup indexed upper by fh */ + /* First lookup overlay inode in inode cache by origin fh */ + err = ovl_check_origin_fh(ofs, fh, false, NULL, &stack); + if (err) + return ERR_PTR(err); + + if (!d_is_dir(origin.dentry) || + !(origin.dentry->d_flags & DCACHE_DISCONNECTED)) { + inode = ovl_lookup_inode(sb, origin.dentry, false); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_err; + if (inode) { + dentry = d_find_any_alias(inode); + iput(inode); + if (dentry) + goto out; + } + } + + /* Then lookup indexed upper/whiteout by origin fh */ if (ofs->indexdir) { index = ovl_get_index_fh(ofs, fh); err = PTR_ERR(index); if (IS_ERR(index)) { - if (err != -ESTALE) - return ERR_PTR(err); - - /* Found a whiteout index - treat as deleted inode */ - is_deleted = true; index = NULL; + goto out_err; } } - /* Then try to get upper dir by index */ + /* Then try to get a connected upper dir by index */ if (index && d_is_dir(index)) { struct dentry *upper = ovl_index_upper(ofs, index); @@ -734,24 +745,19 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, goto out; } - /* Then lookup origin by fh */ - err = ovl_check_origin_fh(ofs, fh, NULL, &stack); - if (err) { - goto out_err; - } else if (index) { - err = ovl_verify_origin(index, origin.dentry, false); + /* Otherwise, get a connected non-upper dir or disconnected non-dir */ + if (d_is_dir(origin.dentry) && + (origin.dentry->d_flags & DCACHE_DISCONNECTED)) { + dput(origin.dentry); + origin.dentry = NULL; + err = ovl_check_origin_fh(ofs, fh, true, NULL, &stack); if (err) goto out_err; - } else if (is_deleted) { - /* Lookup deleted non-dir by origin inode */ - if (!d_is_dir(origin.dentry)) - inode = ovl_lookup_inode(sb, origin.dentry, false); - err = -ESTALE; - if (!inode || atomic_read(&inode->i_count) == 1) + } + if (index) { + err = ovl_verify_origin(index, origin.dentry, false); + if (err) goto out_err; - - /* Deleted but still open? */ - index = dget(ovl_i_dentry_upper(inode)); } dentry = ovl_get_dentry(sb, NULL, &origin, index); @@ -759,7 +765,6 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, out: dput(origin.dentry); dput(index); - iput(inode); return dentry; out_err: @@ -829,7 +834,7 @@ static struct dentry *ovl_get_parent(struct dentry *dentry) } const struct export_operations ovl_export_operations = { - .encode_fh = ovl_encode_inode_fh, + .encode_fh = ovl_encode_fh, .fh_to_dentry = ovl_fh_to_dentry, .fh_to_parent = ovl_fh_to_parent, .get_name = ovl_get_name, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 3b1bd46..6e3815f 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -16,13 +16,6 @@ #include "overlayfs.h" -static dev_t ovl_get_pseudo_dev(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - return oe->lowerstack[0].layer->pseudo_dev; -} - int ovl_setattr(struct dentry *dentry, struct iattr *attr) { int err; @@ -66,6 +59,69 @@ out: return err; } +static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, + struct ovl_layer *lower_layer) +{ + bool samefs = ovl_same_sb(dentry->d_sb); + unsigned int xinobits = ovl_xino_bits(dentry->d_sb); + + if (samefs) { + /* + * When all layers are on the same fs, all real inode + * number are unique, so we use the overlay st_dev, + * which is friendly to du -x. + */ + stat->dev = dentry->d_sb->s_dev; + return 0; + } else if (xinobits) { + unsigned int shift = 64 - xinobits; + /* + * All inode numbers of underlying fs should not be using the + * high xinobits, so we use high xinobits to partition the + * overlay st_ino address space. The high bits holds the fsid + * (upper fsid is 0). This way overlay inode numbers are unique + * and all inodes use overlay st_dev. Inode numbers are also + * persistent for a given layer configuration. + */ + if (stat->ino >> shift) { + pr_warn_ratelimited("overlayfs: inode number too big (%pd2, ino=%llu, xinobits=%d)\n", + dentry, stat->ino, xinobits); + } else { + if (lower_layer) + stat->ino |= ((u64)lower_layer->fsid) << shift; + + stat->dev = dentry->d_sb->s_dev; + return 0; + } + } + + /* The inode could not be mapped to a unified st_ino address space */ + if (S_ISDIR(dentry->d_inode->i_mode)) { + /* + * Always use the overlay st_dev for directories, so 'find + * -xdev' will scan the entire overlay mount and won't cross the + * overlay mount boundaries. + * + * If not all layers are on the same fs the pair {real st_ino; + * overlay st_dev} is not unique, so use the non persistent + * overlay st_ino for directories. + */ + stat->dev = dentry->d_sb->s_dev; + stat->ino = dentry->d_inode->i_ino; + } else if (lower_layer && lower_layer->fsid) { + /* + * For non-samefs setup, if we cannot map all layers st_ino + * to a unified address space, we need to make sure that st_dev + * is unique per lower fs. Upper layer uses real st_dev and + * lower layers use the unique anonymous bdev assigned to the + * lower fs. + */ + stat->dev = lower_layer->fs->pseudo_dev; + } + + return 0; +} + int ovl_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -75,6 +131,7 @@ int ovl_getattr(const struct path *path, struct kstat *stat, const struct cred *old_cred; bool is_dir = S_ISDIR(dentry->d_inode->i_mode); bool samefs = ovl_same_sb(dentry->d_sb); + struct ovl_layer *lower_layer = NULL; int err; type = ovl_path_real(dentry, &realpath); @@ -84,14 +141,18 @@ int ovl_getattr(const struct path *path, struct kstat *stat, goto out; /* - * For non-dir or same fs, we use st_ino of the copy up origin, if we - * know it. This guaranties constant st_dev/st_ino across copy up. + * For non-dir or same fs, we use st_ino of the copy up origin. + * This guaranties constant st_dev/st_ino across copy up. + * With xino feature and non-samefs, we use st_ino of the copy up + * origin masked with high bits that represent the layer id. * - * If filesystem supports NFS export ops, this also guaranties + * If lower filesystem supports NFS file handles, this also guaranties * persistent st_ino across mount cycle. */ - if (!is_dir || samefs) { - if (OVL_TYPE_ORIGIN(type)) { + if (!is_dir || samefs || ovl_xino_bits(dentry->d_sb)) { + if (!OVL_TYPE_UPPER(type)) { + lower_layer = ovl_layer_lower(dentry); + } else if (OVL_TYPE_ORIGIN(type)) { struct kstat lowerstat; u32 lowermask = STATX_INO | (!is_dir ? STATX_NLINK : 0); @@ -118,43 +179,17 @@ int ovl_getattr(const struct path *path, struct kstat *stat, */ if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) || (!ovl_verify_lower(dentry->d_sb) && - (is_dir || lowerstat.nlink == 1))) + (is_dir || lowerstat.nlink == 1))) { stat->ino = lowerstat.ino; - - if (samefs) - WARN_ON_ONCE(stat->dev != lowerstat.dev); - else - stat->dev = ovl_get_pseudo_dev(dentry); - } - if (samefs) { - /* - * When all layers are on the same fs, all real inode - * number are unique, so we use the overlay st_dev, - * which is friendly to du -x. - */ - stat->dev = dentry->d_sb->s_dev; - } else if (!OVL_TYPE_UPPER(type)) { - /* - * For non-samefs setup, to make sure that st_dev/st_ino - * pair is unique across the system, we use a unique - * anonymous st_dev for lower layer inode. - */ - stat->dev = ovl_get_pseudo_dev(dentry); + lower_layer = ovl_layer_lower(dentry); + } } - } else { - /* - * Always use the overlay st_dev for directories, so 'find - * -xdev' will scan the entire overlay mount and won't cross the - * overlay mount boundaries. - * - * If not all layers are on the same fs the pair {real st_ino; - * overlay st_dev} is not unique, so use the non persistent - * overlay st_ino for directories. - */ - stat->dev = dentry->d_sb->s_dev; - stat->ino = dentry->d_inode->i_ino; } + err = ovl_map_dev_ino(dentry, stat, lower_layer); + if (err) + goto out; + /* * It's probably not worth it to count subdirs to get the * correct link count. nlink=1 seems to pacify 'find' and @@ -383,24 +418,18 @@ int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags) int ovl_update_time(struct inode *inode, struct timespec *ts, int flags) { - struct dentry *alias; - struct path upperpath; - - if (!(flags & S_ATIME)) - return 0; - - alias = d_find_any_alias(inode); - if (!alias) - return 0; - - ovl_path_upper(alias, &upperpath); - if (upperpath.dentry) { - touch_atime(&upperpath); - inode->i_atime = d_inode(upperpath.dentry)->i_atime; + if (flags & S_ATIME) { + struct ovl_fs *ofs = inode->i_sb->s_fs_info; + struct path upperpath = { + .mnt = ofs->upper_mnt, + .dentry = ovl_upperdentry_dereference(OVL_I(inode)), + }; + + if (upperpath.dentry) { + touch_atime(&upperpath); + inode->i_atime = d_inode(upperpath.dentry)->i_atime; + } } - - dput(alias); - return 0; } @@ -459,9 +488,27 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) #endif } -static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev) +static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev, + unsigned long ino, int fsid) { - inode->i_ino = get_next_ino(); + int xinobits = ovl_xino_bits(inode->i_sb); + + /* + * When NFS export is enabled and d_ino is consistent with st_ino + * (samefs or i_ino has enough bits to encode layer), set the same + * value used for d_ino to i_ino, because nfsd readdirplus compares + * d_ino values to i_ino values of child entries. When called from + * ovl_new_inode(), ino arg is 0, so i_ino will be updated to real + * upper inode i_ino on ovl_inode_init() or ovl_inode_update(). + */ + if (inode->i_sb->s_export_op && + (ovl_same_sb(inode->i_sb) || xinobits)) { + inode->i_ino = ino; + if (xinobits && fsid && !(ino >> (64 - xinobits))) + inode->i_ino |= (unsigned long)fsid << (64 - xinobits); + } else { + inode->i_ino = get_next_ino(); + } inode->i_mode = mode; inode->i_flags |= S_NOCMTIME; #ifdef CONFIG_FS_POSIX_ACL @@ -597,7 +644,7 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev) inode = new_inode(sb); if (inode) - ovl_fill_inode(inode, mode, rdev); + ovl_fill_inode(inode, mode, rdev, 0, 0); return inode; } @@ -703,13 +750,16 @@ static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper, } struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, - struct dentry *lowerdentry, struct dentry *index, + struct ovl_path *lowerpath, struct dentry *index, unsigned int numlower) { struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL; struct inode *inode; + struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL; bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, index); + int fsid = bylower ? lowerpath->layer->fsid : 0; bool is_dir; + unsigned long ino = 0; if (!realinode) realinode = d_inode(lowerdentry); @@ -748,18 +798,22 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, if (!is_dir) nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink); set_nlink(inode, nlink); + ino = key->i_ino; } else { /* Lower hardlink that will be broken on copy up */ inode = new_inode(sb); if (!inode) goto out_nomem; } - ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev); + ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino, fsid); ovl_inode_init(inode, upperdentry, lowerdentry); if (upperdentry && ovl_is_impuredir(upperdentry)) ovl_set_flag(OVL_IMPURE, inode); + if (index) + ovl_set_flag(OVL_INDEX, inode); + /* Check for non-merge dir that may have whiteouts */ if (is_dir) { if (((upperdentry && lowerdentry) || numlower > 1) || diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 70fcfcc..2dba29e 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -56,6 +56,15 @@ static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d, if (s == next) goto invalid; } + /* + * One of the ancestor path elements in an absolute path + * lookup in ovl_lookup_layer() could have been opaque and + * that will stop further lookup in lower layers (d->stop=true) + * But we have found an absolute redirect in decendant path + * element and that should force continue lookup in lower + * layers (reset d->stop). + */ + d->stop = false; } else { if (strchr(buf, '/') != NULL) goto invalid; @@ -171,7 +180,8 @@ invalid: goto out; } -struct dentry *ovl_decode_fh(struct ovl_fh *fh, struct vfsmount *mnt) +struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, + bool connected) { struct dentry *real; int bytes; @@ -186,7 +196,7 @@ struct dentry *ovl_decode_fh(struct ovl_fh *fh, struct vfsmount *mnt) bytes = (fh->len - offsetof(struct ovl_fh, fid)); real = exportfs_decode_fh(mnt, (struct fid *)fh->fid, bytes >> 2, (int)fh->type, - ovl_acceptable, mnt); + connected ? ovl_acceptable : NULL, mnt); if (IS_ERR(real)) { /* * Treat stale file handle to lower file as "origin unknown". @@ -220,6 +230,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, { struct dentry *this; int err; + bool last_element = !post[0]; this = lookup_one_len_unlocked(name, base, namelen); if (IS_ERR(this)) { @@ -245,11 +256,23 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, d->stop = true; if (d->is_dir) goto put_and_out; + + /* + * NB: handle failure to lookup non-last element when non-dir + * redirects become possible + */ + WARN_ON(!last_element); goto out; } - d->is_dir = true; - if (!d->last && ovl_is_opaquedir(this)) { - d->stop = d->opaque = true; + if (last_element) + d->is_dir = true; + if (d->last) + goto out; + + if (ovl_is_opaquedir(this)) { + d->stop = true; + if (last_element) + d->opaque = true; goto out; } err = ovl_check_redirect(this, d, prelen, post); @@ -310,14 +333,15 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, } -int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, +int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, struct dentry *upperdentry, struct ovl_path **stackp) { struct dentry *origin = NULL; int i; for (i = 0; i < ofs->numlower; i++) { - origin = ovl_decode_fh(fh, ofs->lower_layers[i].mnt); + origin = ovl_decode_real_fh(fh, ofs->lower_layers[i].mnt, + connected); if (origin) break; } @@ -361,7 +385,7 @@ static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry, if (IS_ERR_OR_NULL(fh)) return PTR_ERR(fh); - err = ovl_check_origin_fh(ofs, fh, upperdentry, stackp); + err = ovl_check_origin_fh(ofs, fh, false, upperdentry, stackp); kfree(fh); if (err) { @@ -415,7 +439,7 @@ int ovl_verify_set_fh(struct dentry *dentry, const char *name, struct ovl_fh *fh; int err; - fh = ovl_encode_fh(real, is_upper); + fh = ovl_encode_real_fh(real, is_upper); err = PTR_ERR(fh); if (IS_ERR(fh)) goto fail; @@ -451,7 +475,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index) if (IS_ERR_OR_NULL(fh)) return ERR_CAST(fh); - upper = ovl_decode_fh(fh, ofs->upper_mnt); + upper = ovl_decode_real_fh(fh, ofs->upper_mnt, true); kfree(fh); if (IS_ERR_OR_NULL(upper)) @@ -558,7 +582,7 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) /* Check if non-dir index is orphan and don't warn before cleaning it */ if (!d_is_dir(index) && d_inode(index)->i_nlink == 1) { - err = ovl_check_origin_fh(ofs, fh, index, &stack); + err = ovl_check_origin_fh(ofs, fh, false, index, &stack); if (err) goto fail; @@ -619,7 +643,7 @@ int ovl_get_index_name(struct dentry *origin, struct qstr *name) struct ovl_fh *fh; int err; - fh = ovl_encode_fh(origin, false); + fh = ovl_encode_real_fh(origin, false); if (IS_ERR(fh)) return PTR_ERR(fh); @@ -815,7 +839,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, .is_dir = false, .opaque = false, .stop = false, - .last = !poe->numlower, + .last = ofs->config.redirect_follow ? false : !poe->numlower, .redirect = NULL, }; @@ -873,7 +897,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, for (i = 0; !d.stop && i < poe->numlower; i++) { struct ovl_path lower = poe->lowerstack[i]; - d.last = i == poe->numlower - 1; + if (!ofs->config.redirect_follow) + d.last = i == poe->numlower - 1; + else + d.last = lower.layer->idx == roe->numlower; + err = ovl_lookup_layer(lower.dentry, &d, &this); if (err) goto out_put; @@ -976,17 +1004,18 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, upperdentry = dget(index); if (upperdentry || ctr) { - if (ctr) - origin = stack[0].dentry; - inode = ovl_get_inode(dentry->d_sb, upperdentry, origin, index, + inode = ovl_get_inode(dentry->d_sb, upperdentry, stack, index, ctr); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_free_oe; + /* + * NB: handle redirected hard links when non-dir redirects + * become possible + */ + WARN_ON(OVL_I(inode)->redirect); OVL_I(inode)->redirect = upperredirect; - if (index) - ovl_set_flag(OVL_INDEX, inode); } revert_creds(old_cred); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 225ff11..e0b7de7 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -202,7 +202,7 @@ void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); struct super_block *ovl_same_sb(struct super_block *sb); -bool ovl_can_decode_fh(struct super_block *sb); +int ovl_can_decode_fh(struct super_block *sb); struct dentry *ovl_indexdir(struct super_block *sb); bool ovl_index_all(struct super_block *sb); bool ovl_verify_lower(struct super_block *sb); @@ -215,6 +215,7 @@ void ovl_path_lower(struct dentry *dentry, struct path *path); enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); +struct ovl_layer *ovl_layer_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); struct dentry *ovl_i_dentry_upper(struct inode *inode); struct inode *ovl_inode_upper(struct inode *inode); @@ -263,11 +264,19 @@ static inline bool ovl_is_impuredir(struct dentry *dentry) return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE); } +static inline unsigned int ovl_xino_bits(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return ofs->xino_bits; +} + /* namei.c */ int ovl_check_fh_len(struct ovl_fh *fh, int fh_len); -struct dentry *ovl_decode_fh(struct ovl_fh *fh, struct vfsmount *mnt); -int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, +struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, + bool connected); +int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, struct dentry *upperdentry, struct ovl_path **stackp); int ovl_verify_set_fh(struct dentry *dentry, const char *name, struct dentry *real, bool is_upper, bool set); @@ -329,7 +338,7 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev); struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, bool is_upper); struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, - struct dentry *lowerdentry, struct dentry *index, + struct ovl_path *lowerpath, struct dentry *index, unsigned int numlower); static inline void ovl_copyattr(struct inode *from, struct inode *to) { @@ -361,7 +370,7 @@ int ovl_copy_up(struct dentry *dentry); int ovl_copy_up_flags(struct dentry *dentry, int flags); int ovl_copy_xattr(struct dentry *old, struct dentry *new); int ovl_set_attr(struct dentry *upper, struct kstat *stat); -struct ovl_fh *ovl_encode_fh(struct dentry *real, bool is_upper); +struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper); int ovl_set_origin(struct dentry *dentry, struct dentry *lower, struct dentry *upper); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index bfef6ed..41655a7 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -18,13 +18,21 @@ struct ovl_config { const char *redirect_mode; bool index; bool nfs_export; + int xino; +}; + +struct ovl_sb { + struct super_block *sb; + dev_t pseudo_dev; }; struct ovl_layer { struct vfsmount *mnt; - dev_t pseudo_dev; - /* Index of this layer in fs root (upper == 0) */ + struct ovl_sb *fs; + /* Index of this layer in fs root (upper idx == 0) */ int idx; + /* One fsid per unique underlying sb (upper fsid == 0) */ + int fsid; }; struct ovl_path { @@ -35,8 +43,11 @@ struct ovl_path { /* private information held for overlayfs's superblock */ struct ovl_fs { struct vfsmount *upper_mnt; - unsigned numlower; + unsigned int numlower; + /* Number of unique lower sb that differ from upper sb */ + unsigned int numlowerfs; struct ovl_layer *lower_layers; + struct ovl_sb *lower_fs; /* workbasedir is the path at workdir= mount option */ struct dentry *workbasedir; /* workdir is the 'work' directory under workbasedir */ @@ -50,11 +61,11 @@ struct ovl_fs { const struct cred *creator_cred; bool tmpfile; bool noxattr; - /* sb common to all layers */ - struct super_block *same_sb; /* Did we take the inuse lock? */ bool upperdir_locked; bool workdir_locked; + /* Inode numbers in all layers do not use the high xino_bits */ + unsigned int xino_bits; }; /* private information held for every overlayfs dentry */ diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index c11f5c0..ef1fe42 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -120,6 +120,10 @@ static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd, if (!rdd->dentry) return false; + /* Always recalc d_ino when remapping lower inode numbers */ + if (ovl_xino_bits(rdd->dentry->d_sb)) + return true; + /* Always recalc d_ino for parent */ if (strcmp(p->name, "..") == 0) return true; @@ -435,6 +439,19 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) return cache; } +/* Map inode number to lower fs unique range */ +static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid, + const char *name, int namelen) +{ + if (ino >> (64 - xinobits)) { + pr_warn_ratelimited("overlayfs: d_ino too big (%.*s, ino=%llu, xinobits=%d)\n", + namelen, name, ino, xinobits); + return ino; + } + + return ino | ((u64)fsid) << (64 - xinobits); +} + /* * Set d_ino for upper entries. Non-upper entries should always report * the uppermost real inode ino and should not call this function. @@ -452,9 +469,10 @@ static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p) struct dentry *this = NULL; enum ovl_path_type type; u64 ino = p->real_ino; + int xinobits = ovl_xino_bits(dir->d_sb); int err = 0; - if (!ovl_same_sb(dir->d_sb)) + if (!ovl_same_sb(dir->d_sb) && !xinobits) goto out; if (p->name[0] == '.') { @@ -491,6 +509,10 @@ get: WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev); ino = stat.ino; + } else if (xinobits && !OVL_TYPE_UPPER(type)) { + ino = ovl_remap_lower_ino(ino, xinobits, + ovl_layer_lower(this)->fsid, + p->name, p->len); } out: @@ -618,6 +640,8 @@ struct ovl_readdir_translate { struct ovl_dir_cache *cache; struct dir_context ctx; u64 parent_ino; + int fsid; + int xinobits; }; static int ovl_fill_real(struct dir_context *ctx, const char *name, @@ -628,14 +652,17 @@ static int ovl_fill_real(struct dir_context *ctx, const char *name, container_of(ctx, struct ovl_readdir_translate, ctx); struct dir_context *orig_ctx = rdt->orig_ctx; - if (rdt->parent_ino && strcmp(name, "..") == 0) + if (rdt->parent_ino && strcmp(name, "..") == 0) { ino = rdt->parent_ino; - else if (rdt->cache) { + } else if (rdt->cache) { struct ovl_cache_entry *p; p = ovl_cache_entry_find(&rdt->cache->root, name, namelen); if (p) ino = p->ino; + } else if (rdt->xinobits) { + ino = ovl_remap_lower_ino(ino, rdt->xinobits, rdt->fsid, + name, namelen); } return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type); @@ -646,11 +673,16 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx) int err; struct ovl_dir_file *od = file->private_data; struct dentry *dir = file->f_path.dentry; + struct ovl_layer *lower_layer = ovl_layer_lower(dir); struct ovl_readdir_translate rdt = { .ctx.actor = ovl_fill_real, .orig_ctx = ctx, + .xinobits = ovl_xino_bits(dir->d_sb), }; + if (rdt.xinobits && lower_layer) + rdt.fsid = lower_layer->fsid; + if (OVL_TYPE_MERGE(ovl_path_type(dir->d_parent))) { struct kstat stat; struct path statpath = file->f_path; @@ -693,9 +725,10 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) * dir is impure then need to adjust d_ino for copied up * entries. */ - if (ovl_same_sb(dentry->d_sb) && - (ovl_test_flag(OVL_IMPURE, d_inode(dentry)) || - OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent)))) { + if (ovl_xino_bits(dentry->d_sb) || + (ovl_same_sb(dentry->d_sb) && + (ovl_test_flag(OVL_IMPURE, d_inode(dentry)) || + OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) { return ovl_iterate_real(file, ctx); } return iterate_dir(od->realfile, ctx); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 7c24619..e8551c9 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -17,6 +17,7 @@ #include <linux/statfs.h> #include <linux/seq_file.h> #include <linux/posix_acl_xattr.h> +#include <linux/exportfs.h> #include "overlayfs.h" MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); @@ -50,6 +51,11 @@ module_param_named(nfs_export, ovl_nfs_export_def, bool, 0644); MODULE_PARM_DESC(ovl_nfs_export_def, "Default to on or off for the NFS export feature"); +static bool ovl_xino_auto_def = IS_ENABLED(CONFIG_OVERLAY_FS_XINO_AUTO); +module_param_named(xino_auto, ovl_xino_auto_def, bool, 0644); +MODULE_PARM_DESC(ovl_xino_auto_def, + "Auto enable xino feature"); + static void ovl_entry_stack_free(struct ovl_entry *oe) { unsigned int i; @@ -236,11 +242,12 @@ static void ovl_free_fs(struct ovl_fs *ofs) if (ofs->upperdir_locked) ovl_inuse_unlock(ofs->upper_mnt->mnt_root); mntput(ofs->upper_mnt); - for (i = 0; i < ofs->numlower; i++) { + for (i = 0; i < ofs->numlower; i++) mntput(ofs->lower_layers[i].mnt); - free_anon_bdev(ofs->lower_layers[i].pseudo_dev); - } + for (i = 0; i < ofs->numlowerfs; i++) + free_anon_bdev(ofs->lower_fs[i].pseudo_dev); kfree(ofs->lower_layers); + kfree(ofs->lower_fs); kfree(ofs->config.lowerdir); kfree(ofs->config.upperdir); @@ -325,6 +332,23 @@ static const char *ovl_redirect_mode_def(void) return ovl_redirect_dir_def ? "on" : "off"; } +enum { + OVL_XINO_OFF, + OVL_XINO_AUTO, + OVL_XINO_ON, +}; + +static const char * const ovl_xino_str[] = { + "off", + "auto", + "on", +}; + +static inline int ovl_xino_def(void) +{ + return ovl_xino_auto_def ? OVL_XINO_AUTO : OVL_XINO_OFF; +} + /** * ovl_show_options * @@ -350,6 +374,8 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) if (ofs->config.nfs_export != ovl_nfs_export_def) seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ? "on" : "off"); + if (ofs->config.xino != ovl_xino_def()) + seq_printf(m, ",xino=%s", ovl_xino_str[ofs->config.xino]); return 0; } @@ -384,6 +410,9 @@ enum { OPT_INDEX_OFF, OPT_NFS_EXPORT_ON, OPT_NFS_EXPORT_OFF, + OPT_XINO_ON, + OPT_XINO_OFF, + OPT_XINO_AUTO, OPT_ERR, }; @@ -397,6 +426,9 @@ static const match_table_t ovl_tokens = { {OPT_INDEX_OFF, "index=off"}, {OPT_NFS_EXPORT_ON, "nfs_export=on"}, {OPT_NFS_EXPORT_OFF, "nfs_export=off"}, + {OPT_XINO_ON, "xino=on"}, + {OPT_XINO_OFF, "xino=off"}, + {OPT_XINO_AUTO, "xino=auto"}, {OPT_ERR, NULL} }; @@ -511,6 +543,18 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->nfs_export = false; break; + case OPT_XINO_ON: + config->xino = OVL_XINO_ON; + break; + + case OPT_XINO_OFF: + config->xino = OVL_XINO_OFF; + break; + + case OPT_XINO_AUTO: + config->xino = OVL_XINO_AUTO; + break; + default: pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p); return -EINVAL; @@ -700,6 +744,7 @@ static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs, static int ovl_lower_dir(const char *name, struct path *path, struct ovl_fs *ofs, int *stack_depth, bool *remote) { + int fh_type; int err; err = ovl_mount_dir_noesc(name, path); @@ -719,15 +764,19 @@ static int ovl_lower_dir(const char *name, struct path *path, * The inodes index feature and NFS export need to encode and decode * file handles, so they require that all layers support them. */ + fh_type = ovl_can_decode_fh(path->dentry->d_sb); if ((ofs->config.nfs_export || - (ofs->config.index && ofs->config.upperdir)) && - !ovl_can_decode_fh(path->dentry->d_sb)) { + (ofs->config.index && ofs->config.upperdir)) && !fh_type) { ofs->config.index = false; ofs->config.nfs_export = false; pr_warn("overlayfs: fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n", name); } + /* Check if lower fs has 32bit inode numbers */ + if (fh_type != FILEID_INO32_GEN) + ofs->xino_bits = 0; + return 0; out_put: @@ -951,6 +1000,7 @@ static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath) { struct vfsmount *mnt = ofs->upper_mnt; struct dentry *temp; + int fh_type; int err; err = mnt_want_write(mnt); @@ -1000,12 +1050,16 @@ static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath) } /* Check if upper/work fs supports file handles */ - if (ofs->config.index && - !ovl_can_decode_fh(ofs->workdir->d_sb)) { + fh_type = ovl_can_decode_fh(ofs->workdir->d_sb); + if (ofs->config.index && !fh_type) { ofs->config.index = false; pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n"); } + /* Check if upper fs has 32bit inode numbers */ + if (fh_type != FILEID_INO32_GEN) + ofs->xino_bits = 0; + /* NFS export of r/w mount depends on index */ if (ofs->config.nfs_export && !ofs->config.index) { pr_warn("overlayfs: NFS export requires \"index=on\", falling back to nfs_export=off.\n"); @@ -1108,6 +1162,35 @@ out: return err; } +/* Get a unique fsid for the layer */ +static int ovl_get_fsid(struct ovl_fs *ofs, struct super_block *sb) +{ + unsigned int i; + dev_t dev; + int err; + + /* fsid 0 is reserved for upper fs even with non upper overlay */ + if (ofs->upper_mnt && ofs->upper_mnt->mnt_sb == sb) + return 0; + + for (i = 0; i < ofs->numlowerfs; i++) { + if (ofs->lower_fs[i].sb == sb) + return i + 1; + } + + err = get_anon_bdev(&dev); + if (err) { + pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n"); + return err; + } + + ofs->lower_fs[ofs->numlowerfs].sb = sb; + ofs->lower_fs[ofs->numlowerfs].pseudo_dev = dev; + ofs->numlowerfs++; + + return ofs->numlowerfs; +} + static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack, unsigned int numlower) { @@ -1119,23 +1202,27 @@ static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack, GFP_KERNEL); if (ofs->lower_layers == NULL) goto out; + + ofs->lower_fs = kcalloc(numlower, sizeof(struct ovl_sb), + GFP_KERNEL); + if (ofs->lower_fs == NULL) + goto out; + for (i = 0; i < numlower; i++) { struct vfsmount *mnt; - dev_t dev; + int fsid; - err = get_anon_bdev(&dev); - if (err) { - pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n"); + err = fsid = ovl_get_fsid(ofs, stack[i].mnt->mnt_sb); + if (err < 0) goto out; - } mnt = clone_private_mount(&stack[i]); err = PTR_ERR(mnt); if (IS_ERR(mnt)) { pr_err("overlayfs: failed to clone lowerpath\n"); - free_anon_bdev(dev); goto out; } + /* * Make lower layers R/O. That way fchmod/fchown on lower file * will fail instead of modifying lower fs. @@ -1143,16 +1230,41 @@ static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack, mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME; ofs->lower_layers[ofs->numlower].mnt = mnt; - ofs->lower_layers[ofs->numlower].pseudo_dev = dev; ofs->lower_layers[ofs->numlower].idx = i + 1; + ofs->lower_layers[ofs->numlower].fsid = fsid; + if (fsid) { + ofs->lower_layers[ofs->numlower].fs = + &ofs->lower_fs[fsid - 1]; + } ofs->numlower++; + } + + /* + * When all layers on same fs, overlay can use real inode numbers. + * With mount option "xino=on", mounter declares that there are enough + * free high bits in underlying fs to hold the unique fsid. + * If overlayfs does encounter underlying inodes using the high xino + * bits reserved for fsid, it emits a warning and uses the original + * inode number. + */ + if (!ofs->numlowerfs || (ofs->numlowerfs == 1 && !ofs->upper_mnt)) { + ofs->xino_bits = 0; + ofs->config.xino = OVL_XINO_OFF; + } else if (ofs->config.xino == OVL_XINO_ON && !ofs->xino_bits) { + /* + * This is a roundup of number of bits needed for numlowerfs+1 + * (i.e. ilog2(numlowerfs+1 - 1) + 1). fsid 0 is reserved for + * upper fs even with non upper overlay. + */ + BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 31); + ofs->xino_bits = ilog2(ofs->numlowerfs) + 1; + } - /* Check if all lower layers are on same sb */ - if (i == 0) - ofs->same_sb = mnt->mnt_sb; - else if (ofs->same_sb != mnt->mnt_sb) - ofs->same_sb = NULL; + if (ofs->xino_bits) { + pr_info("overlayfs: \"xino\" feature enabled using %d upper inode bits.\n", + ofs->xino_bits); } + err = 0; out: return err; @@ -1263,6 +1375,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ofs->config.index = ovl_index_def; ofs->config.nfs_export = ovl_nfs_export_def; + ofs->config.xino = ovl_xino_def(); err = ovl_parse_opt((char *) data, &ofs->config); if (err) goto out_err; @@ -1276,6 +1389,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_stack_depth = 0; sb->s_maxbytes = MAX_LFS_FILESIZE; + /* Assume underlaying fs uses 32bit inodes unless proven otherwise */ + if (ofs->config.xino != OVL_XINO_OFF) + ofs->xino_bits = BITS_PER_LONG - 32; + if (ofs->config.upperdir) { if (!ofs->config.workdir) { pr_err("overlayfs: missing 'workdir'\n"); @@ -1305,8 +1422,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) /* If the upper fs is nonexistent, we mark overlayfs r/o too */ if (!ofs->upper_mnt) sb->s_flags |= SB_RDONLY; - else if (ofs->upper_mnt->mnt_sb != ofs->same_sb) - ofs->same_sb = NULL; if (!(ovl_force_readonly(ofs)) && ofs->config.index) { err = ovl_get_indexdir(ofs, oe, &upperpath); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 930784a..6f10780 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -47,13 +47,29 @@ struct super_block *ovl_same_sb(struct super_block *sb) { struct ovl_fs *ofs = sb->s_fs_info; - return ofs->same_sb; + if (!ofs->numlowerfs) + return ofs->upper_mnt->mnt_sb; + else if (ofs->numlowerfs == 1 && !ofs->upper_mnt) + return ofs->lower_fs[0].sb; + else + return NULL; } -bool ovl_can_decode_fh(struct super_block *sb) +/* + * Check if underlying fs supports file handles and try to determine encoding + * type, in order to deduce maximum inode number used by fs. + * + * Return 0 if file handles are not supported. + * Return 1 (FILEID_INO32_GEN) if fs uses the default 32bit inode encoding. + * Return -1 if fs uses a non default encoding with unknown inode size. + */ +int ovl_can_decode_fh(struct super_block *sb) { - return (sb->s_export_op && sb->s_export_op->fh_to_dentry && - !uuid_is_null(&sb->s_uuid)); + if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry || + uuid_is_null(&sb->s_uuid)) + return 0; + + return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN; } struct dentry *ovl_indexdir(struct super_block *sb) @@ -172,6 +188,13 @@ struct dentry *ovl_dentry_lower(struct dentry *dentry) return oe->numlower ? oe->lowerstack[0].dentry : NULL; } +struct ovl_layer *ovl_layer_lower(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->numlower ? oe->lowerstack[0].layer : NULL; +} + struct dentry *ovl_dentry_real(struct dentry *dentry) { return ovl_dentry_upper(dentry) ?: ovl_dentry_lower(dentry); @@ -279,12 +302,16 @@ void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect) void ovl_inode_init(struct inode *inode, struct dentry *upperdentry, struct dentry *lowerdentry) { + struct inode *realinode = d_inode(upperdentry ?: lowerdentry); + if (upperdentry) OVL_I(inode)->__upperdentry = upperdentry; if (lowerdentry) OVL_I(inode)->lower = igrab(d_inode(lowerdentry)); - ovl_copyattr(d_inode(upperdentry ?: lowerdentry), inode); + ovl_copyattr(realinode, inode); + if (!inode->i_ino) + inode->i_ino = realinode->i_ino; } void ovl_inode_update(struct inode *inode, struct dentry *upperdentry) @@ -299,6 +326,8 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry) smp_wmb(); OVL_I(inode)->__upperdentry = upperdentry; if (inode_unhashed(inode)) { + if (!inode->i_ino) + inode->i_ino = upperinode->i_ino; inode->i_private = upperinode; __insert_inode_hash(inode, (unsigned long) upperinode); } diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 1ade120..0eaeb41 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -43,6 +43,21 @@ config PROC_VMCORE help Exports the dump image of crashed kernel in ELF format. +config PROC_VMCORE_DEVICE_DUMP + bool "Device Hardware/Firmware Log Collection" + depends on PROC_VMCORE + default n + help + After kernel panic, device drivers can collect the device + specific snapshot of their hardware or firmware before the + underlying devices are initialized in crash recovery kernel. + Note that the device driver must be present in the crash + recovery kernel's initramfs to collect its underlying device + snapshot. + + If you say Y here, the collected device dumps will be added + as ELF notes to /proc/vmcore. + config PROC_SYSCTL bool "Sysctl support (/proc/sys)" if EXPERT depends on PROC_FS diff --git a/fs/proc/base.c b/fs/proc/base.c index eafa39a..1a76d75 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -261,7 +261,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, * Inherently racy -- command line shares address space * with code and data. */ - rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0); + rv = access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_ANON); if (rv <= 0) goto out_free_page; @@ -279,7 +279,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, int nr_read; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -325,7 +325,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, bool final; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -946,7 +946,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, max_len = min_t(size_t, PAGE_SIZE, count); this_len = min(max_len, this_len); - retval = access_remote_vm(mm, (env_start + src), page, this_len, 0); + retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON); if (retval <= 0) { ret = retval; @@ -1693,6 +1693,12 @@ void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t uid; kgid_t gid; + if (unlikely(task->flags & PF_KTHREAD)) { + *ruid = GLOBAL_ROOT_UID; + *rgid = GLOBAL_ROOT_GID; + return; + } + /* Default to the tasks effective ownership */ rcu_read_lock(); cred = __task_cred(task); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 04c4804..2078e70 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -15,6 +15,7 @@ #include <linux/stat.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/namei.h> #include <linux/slab.h> #include <linux/printk.h> #include <linux/mount.h> @@ -217,6 +218,26 @@ void proc_free_inum(unsigned int inum) ida_simple_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); } +static int proc_misc_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + if (flags & LOOKUP_RCU) + return -ECHILD; + + if (atomic_read(&PDE(d_inode(dentry))->in_use) < 0) + return 0; /* revalidate */ + return 1; +} + +static int proc_misc_d_delete(const struct dentry *dentry) +{ + return atomic_read(&PDE(d_inode(dentry))->in_use) < 0; +} + +static const struct dentry_operations proc_misc_dentry_ops = { + .d_revalidate = proc_misc_d_revalidate, + .d_delete = proc_misc_d_delete, +}; + /* * Don't create negative dentries here, return -ENOENT by hand * instead. @@ -234,7 +255,7 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); - d_set_d_op(dentry, &simple_dentry_operations); + d_set_d_op(dentry, &proc_misc_dentry_ops); d_add(dentry, inode); return NULL; } diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index d1e8276..e64ecb9 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -209,25 +209,34 @@ kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg) { struct list_head *head = (struct list_head *)arg; struct kcore_list *ent; + struct page *p; + + if (!pfn_valid(pfn)) + return 1; + + p = pfn_to_page(pfn); + if (!memmap_valid_within(pfn, p, page_zone(p))) + return 1; ent = kmalloc(sizeof(*ent), GFP_KERNEL); if (!ent) return -ENOMEM; - ent->addr = (unsigned long)__va((pfn << PAGE_SHIFT)); + ent->addr = (unsigned long)page_to_virt(p); ent->size = nr_pages << PAGE_SHIFT; - /* Sanity check: Can happen in 32bit arch...maybe */ - if (ent->addr < (unsigned long) __va(0)) + if (!virt_addr_valid(ent->addr)) goto free_out; /* cut not-mapped area. ....from ppc-32 code. */ if (ULONG_MAX - ent->addr < ent->size) ent->size = ULONG_MAX - ent->addr; - /* cut when vmalloc() area is higher than direct-map area */ - if (VMALLOC_START > (unsigned long)__va(0)) { - if (ent->addr > VMALLOC_START) - goto free_out; + /* + * We've already checked virt_addr_valid so we know this address + * is a valid pointer, therefore we can check against it to determine + * if we need to trim + */ + if (VMALLOC_START > ent->addr) { if (VMALLOC_START - ent->addr < ent->size) ent->size = VMALLOC_START - ent->addr; } diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index a000d75..b572cc8 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -24,7 +24,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v) LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), nr_running(), nr_threads, - idr_get_cursor(&task_active_pid_ns(current)->idr)); + idr_get_cursor(&task_active_pid_ns(current)->idr) - 1); return 0; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 65ae546..c486ad4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1310,9 +1310,11 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION else if (is_swap_pmd(pmd)) { swp_entry_t entry = pmd_to_swp_entry(pmd); + unsigned long offset = swp_offset(entry); + offset += (addr & ~PMD_MASK) >> PAGE_SHIFT; frame = swp_type(entry) | - (swp_offset(entry) << MAX_SWAPFILES_SHIFT); + (offset << MAX_SWAPFILES_SHIFT); flags |= PM_SWAP; if (pmd_swp_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; @@ -1332,6 +1334,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, break; if (pm->show_pfn && (flags & PM_PRESENT)) frame++; + else if (flags & PM_SWAP) + frame += (1 << MAX_SWAPFILES_SHIFT); } spin_unlock(ptl); return err; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index a45f0af..cfb6674 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -20,6 +20,7 @@ #include <linux/init.h> #include <linux/crash_dump.h> #include <linux/list.h> +#include <linux/mutex.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/uaccess.h> @@ -38,12 +39,23 @@ static size_t elfcorebuf_sz_orig; static char *elfnotes_buf; static size_t elfnotes_sz; +/* Size of all notes minus the device dump notes */ +static size_t elfnotes_orig_sz; /* Total size of vmcore file. */ static u64 vmcore_size; static struct proc_dir_entry *proc_vmcore; +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +/* Device Dump list and mutex to synchronize access to list */ +static LIST_HEAD(vmcoredd_list); +static DEFINE_MUTEX(vmcoredd_mutex); +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + +/* Device Dump Size */ +static size_t vmcoredd_orig_sz; + /* * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error * The called function has to take care of module refcounting. @@ -178,6 +190,77 @@ static int copy_to(void *target, void *src, size_t size, int userbuf) return 0; } +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf) +{ + struct vmcoredd_node *dump; + u64 offset = 0; + int ret = 0; + size_t tsz; + char *buf; + + mutex_lock(&vmcoredd_mutex); + list_for_each_entry(dump, &vmcoredd_list, list) { + if (start < offset + dump->size) { + tsz = min(offset + (u64)dump->size - start, (u64)size); + buf = dump->buf + start - offset; + if (copy_to(dst, buf, tsz, userbuf)) { + ret = -EFAULT; + goto out_unlock; + } + + size -= tsz; + start += tsz; + dst += tsz; + + /* Leave now if buffer filled already */ + if (!size) + goto out_unlock; + } + offset += dump->size; + } + +out_unlock: + mutex_unlock(&vmcoredd_mutex); + return ret; +} + +static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst, + u64 start, size_t size) +{ + struct vmcoredd_node *dump; + u64 offset = 0; + int ret = 0; + size_t tsz; + char *buf; + + mutex_lock(&vmcoredd_mutex); + list_for_each_entry(dump, &vmcoredd_list, list) { + if (start < offset + dump->size) { + tsz = min(offset + (u64)dump->size - start, (u64)size); + buf = dump->buf + start - offset; + if (remap_vmalloc_range_partial(vma, dst, buf, tsz)) { + ret = -EFAULT; + goto out_unlock; + } + + size -= tsz; + start += tsz; + dst += tsz; + + /* Leave now if buffer filled already */ + if (!size) + goto out_unlock; + } + offset += dump->size; + } + +out_unlock: + mutex_unlock(&vmcoredd_mutex); + return ret; +} +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + /* Read from the ELF header and then the crash dump. On error, negative value is * returned otherwise number of bytes read are returned. */ @@ -215,10 +298,41 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, if (*fpos < elfcorebuf_sz + elfnotes_sz) { void *kaddr; + /* We add device dumps before other elf notes because the + * other elf notes may not fill the elf notes buffer + * completely and we will end up with zero-filled data + * between the elf notes and the device dumps. Tools will + * then try to decode this zero-filled data as valid notes + * and we don't want that. Hence, adding device dumps before + * the other elf notes ensure that zero-filled data can be + * avoided. + */ +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP + /* Read device dumps */ + if (*fpos < elfcorebuf_sz + vmcoredd_orig_sz) { + tsz = min(elfcorebuf_sz + vmcoredd_orig_sz - + (size_t)*fpos, buflen); + start = *fpos - elfcorebuf_sz; + if (vmcoredd_copy_dumps(buffer, start, tsz, userbuf)) + return -EFAULT; + + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (!buflen) + return acc; + } +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + + /* Read remaining elf notes */ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen); - kaddr = elfnotes_buf + *fpos - elfcorebuf_sz; + kaddr = elfnotes_buf + *fpos - elfcorebuf_sz - vmcoredd_orig_sz; if (copy_to(buffer, kaddr, tsz, userbuf)) return -EFAULT; + buflen -= tsz; *fpos += tsz; buffer += tsz; @@ -302,10 +416,8 @@ static const struct vm_operations_struct vmcore_mmap_ops = { }; /** - * alloc_elfnotes_buf - allocate buffer for ELF note segment in - * vmalloc memory - * - * @notes_sz: size of buffer + * vmcore_alloc_buf - allocate buffer in vmalloc memory + * @sizez: size of buffer * * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap * the buffer to user-space by means of remap_vmalloc_range(). @@ -313,12 +425,12 @@ static const struct vm_operations_struct vmcore_mmap_ops = { * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is * disabled and there's no need to allow users to mmap the buffer. */ -static inline char *alloc_elfnotes_buf(size_t notes_sz) +static inline char *vmcore_alloc_buf(size_t size) { #ifdef CONFIG_MMU - return vmalloc_user(notes_sz); + return vmalloc_user(size); #else - return vzalloc(notes_sz); + return vzalloc(size); #endif } @@ -446,11 +558,46 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) if (start < elfcorebuf_sz + elfnotes_sz) { void *kaddr; + /* We add device dumps before other elf notes because the + * other elf notes may not fill the elf notes buffer + * completely and we will end up with zero-filled data + * between the elf notes and the device dumps. Tools will + * then try to decode this zero-filled data as valid notes + * and we don't want that. Hence, adding device dumps before + * the other elf notes ensure that zero-filled data can be + * avoided. This also ensures that the device dumps and + * other elf notes can be properly mmaped at page aligned + * address. + */ +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP + /* Read device dumps */ + if (start < elfcorebuf_sz + vmcoredd_orig_sz) { + u64 start_off; + + tsz = min(elfcorebuf_sz + vmcoredd_orig_sz - + (size_t)start, size); + start_off = start - elfcorebuf_sz; + if (vmcoredd_mmap_dumps(vma, vma->vm_start + len, + start_off, tsz)) + goto fail; + + size -= tsz; + start += tsz; + len += tsz; + + /* leave now if filled buffer already */ + if (!size) + return 0; + } +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + + /* Read remaining elf notes */ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size); - kaddr = elfnotes_buf + start - elfcorebuf_sz; + kaddr = elfnotes_buf + start - elfcorebuf_sz - vmcoredd_orig_sz; if (remap_vmalloc_range_partial(vma, vma->vm_start + len, kaddr, tsz)) goto fail; + size -= tsz; start += tsz; len += tsz; @@ -502,8 +649,8 @@ static struct vmcore* __init get_new_element(void) return kzalloc(sizeof(struct vmcore), GFP_KERNEL); } -static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz, - struct list_head *vc_list) +static u64 get_vmcore_size(size_t elfsz, size_t elfnotesegsz, + struct list_head *vc_list) { u64 size; struct vmcore *m; @@ -665,7 +812,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, return rc; *notes_sz = roundup(phdr_sz, PAGE_SIZE); - *notes_buf = alloc_elfnotes_buf(*notes_sz); + *notes_buf = vmcore_alloc_buf(*notes_sz); if (!*notes_buf) return -ENOMEM; @@ -698,6 +845,11 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, /* Modify e_phnum to reflect merged headers. */ ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + /* Store the size of all notes. We need this to update the note + * header when the device dumps will be added. + */ + elfnotes_orig_sz = phdr.p_memsz; + return 0; } @@ -851,7 +1003,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, return rc; *notes_sz = roundup(phdr_sz, PAGE_SIZE); - *notes_buf = alloc_elfnotes_buf(*notes_sz); + *notes_buf = vmcore_alloc_buf(*notes_sz); if (!*notes_buf) return -ENOMEM; @@ -884,6 +1036,11 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, /* Modify e_phnum to reflect merged headers. */ ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + /* Store the size of all notes. We need this to update the note + * header when the device dumps will be added. + */ + elfnotes_orig_sz = phdr.p_memsz; + return 0; } @@ -976,8 +1133,8 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, } /* Sets offset fields of vmcore elements. */ -static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, - struct list_head *vc_list) +static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, + struct list_head *vc_list) { loff_t vmcore_off; struct vmcore *m; @@ -1145,6 +1302,202 @@ static int __init parse_crash_elf_headers(void) return 0; } +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +/** + * vmcoredd_write_header - Write vmcore device dump header at the + * beginning of the dump's buffer. + * @buf: Output buffer where the note is written + * @data: Dump info + * @size: Size of the dump + * + * Fills beginning of the dump's buffer with vmcore device dump header. + */ +static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data, + u32 size) +{ + struct vmcoredd_header *vdd_hdr = (struct vmcoredd_header *)buf; + + vdd_hdr->n_namesz = sizeof(vdd_hdr->name); + vdd_hdr->n_descsz = size + sizeof(vdd_hdr->dump_name); + vdd_hdr->n_type = NT_VMCOREDD; + + strncpy((char *)vdd_hdr->name, VMCOREDD_NOTE_NAME, + sizeof(vdd_hdr->name)); + memcpy(vdd_hdr->dump_name, data->dump_name, sizeof(vdd_hdr->dump_name)); +} + +/** + * vmcoredd_update_program_headers - Update all Elf program headers + * @elfptr: Pointer to elf header + * @elfnotesz: Size of elf notes aligned to page size + * @vmcoreddsz: Size of device dumps to be added to elf note header + * + * Determine type of Elf header (Elf64 or Elf32) and update the elf note size. + * Also update the offsets of all the program headers after the elf note header. + */ +static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz, + size_t vmcoreddsz) +{ + unsigned char *e_ident = (unsigned char *)elfptr; + u64 start, end, size; + loff_t vmcore_off; + u32 i; + + vmcore_off = elfcorebuf_sz + elfnotesz; + + if (e_ident[EI_CLASS] == ELFCLASS64) { + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfptr; + Elf64_Phdr *phdr = (Elf64_Phdr *)(elfptr + sizeof(Elf64_Ehdr)); + + /* Update all program headers */ + for (i = 0; i < ehdr->e_phnum; i++, phdr++) { + if (phdr->p_type == PT_NOTE) { + /* Update note size */ + phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz; + phdr->p_filesz = phdr->p_memsz; + continue; + } + + start = rounddown(phdr->p_offset, PAGE_SIZE); + end = roundup(phdr->p_offset + phdr->p_memsz, + PAGE_SIZE); + size = end - start; + phdr->p_offset = vmcore_off + (phdr->p_offset - start); + vmcore_off += size; + } + } else { + Elf32_Ehdr *ehdr = (Elf32_Ehdr *)elfptr; + Elf32_Phdr *phdr = (Elf32_Phdr *)(elfptr + sizeof(Elf32_Ehdr)); + + /* Update all program headers */ + for (i = 0; i < ehdr->e_phnum; i++, phdr++) { + if (phdr->p_type == PT_NOTE) { + /* Update note size */ + phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz; + phdr->p_filesz = phdr->p_memsz; + continue; + } + + start = rounddown(phdr->p_offset, PAGE_SIZE); + end = roundup(phdr->p_offset + phdr->p_memsz, + PAGE_SIZE); + size = end - start; + phdr->p_offset = vmcore_off + (phdr->p_offset - start); + vmcore_off += size; + } + } +} + +/** + * vmcoredd_update_size - Update the total size of the device dumps and update + * Elf header + * @dump_size: Size of the current device dump to be added to total size + * + * Update the total size of all the device dumps and update the Elf program + * headers. Calculate the new offsets for the vmcore list and update the + * total vmcore size. + */ +static void vmcoredd_update_size(size_t dump_size) +{ + vmcoredd_orig_sz += dump_size; + elfnotes_sz = roundup(elfnotes_orig_sz, PAGE_SIZE) + vmcoredd_orig_sz; + vmcoredd_update_program_headers(elfcorebuf, elfnotes_sz, + vmcoredd_orig_sz); + + /* Update vmcore list offsets */ + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); + + vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz, + &vmcore_list); + proc_vmcore->size = vmcore_size; +} + +/** + * vmcore_add_device_dump - Add a buffer containing device dump to vmcore + * @data: dump info. + * + * Allocate a buffer and invoke the calling driver's dump collect routine. + * Write Elf note at the beginning of the buffer to indicate vmcore device + * dump and add the dump to global list. + */ +int vmcore_add_device_dump(struct vmcoredd_data *data) +{ + struct vmcoredd_node *dump; + void *buf = NULL; + size_t data_size; + int ret; + + if (!data || !strlen(data->dump_name) || + !data->vmcoredd_callback || !data->size) + return -EINVAL; + + dump = vzalloc(sizeof(*dump)); + if (!dump) { + ret = -ENOMEM; + goto out_err; + } + + /* Keep size of the buffer page aligned so that it can be mmaped */ + data_size = roundup(sizeof(struct vmcoredd_header) + data->size, + PAGE_SIZE); + + /* Allocate buffer for driver's to write their dumps */ + buf = vmcore_alloc_buf(data_size); + if (!buf) { + ret = -ENOMEM; + goto out_err; + } + + vmcoredd_write_header(buf, data, data_size - + sizeof(struct vmcoredd_header)); + + /* Invoke the driver's dump collection routing */ + ret = data->vmcoredd_callback(data, buf + + sizeof(struct vmcoredd_header)); + if (ret) + goto out_err; + + dump->buf = buf; + dump->size = data_size; + + /* Add the dump to driver sysfs list */ + mutex_lock(&vmcoredd_mutex); + list_add_tail(&dump->list, &vmcoredd_list); + mutex_unlock(&vmcoredd_mutex); + + vmcoredd_update_size(data_size); + return 0; + +out_err: + if (buf) + vfree(buf); + + if (dump) + vfree(dump); + + return ret; +} +EXPORT_SYMBOL(vmcore_add_device_dump); +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + +/* Free all dumps in vmcore device dump list */ +static void vmcore_free_device_dumps(void) +{ +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP + mutex_lock(&vmcoredd_mutex); + while (!list_empty(&vmcoredd_list)) { + struct vmcoredd_node *dump; + + dump = list_first_entry(&vmcoredd_list, struct vmcoredd_node, + list); + list_del(&dump->list); + vfree(dump->buf); + vfree(dump); + } + mutex_unlock(&vmcoredd_mutex); +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ +} + /* Init function for vmcore module. */ static int __init vmcore_init(void) { @@ -1192,4 +1545,7 @@ void vmcore_cleanup(void) kfree(m); } free_elfcorebuf(); + + /* clear vmcore device dump list */ + vmcore_free_device_dumps(); } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 020c597..d88231e 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2966,7 +2966,7 @@ static int __init dquot_init(void) NULL); order = 0; - dquot_hash = (struct hlist_head *)__get_free_pages(GFP_ATOMIC, order); + dquot_hash = (struct hlist_head *)__get_free_pages(GFP_KERNEL, order); if (!dquot_hash) panic("Cannot create dquot hash table"); @@ -37,6 +37,7 @@ #include <linux/user_namespace.h> #include "internal.h" +static int thaw_super_locked(struct super_block *sb); static LIST_HEAD(super_blocks); static DEFINE_SPINLOCK(sb_lock); @@ -166,6 +167,7 @@ static void destroy_unused_super(struct super_block *s) security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); + free_prealloced_shrinker(&s->s_shrink); /* no delays needed */ destroy_super_work(&s->destroy_work); } @@ -251,6 +253,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_shrink.count_objects = super_cache_count; s->s_shrink.batch = 1024; s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; + if (prealloc_shrinker(&s->s_shrink)) + goto fail; return s; fail: @@ -517,11 +521,7 @@ retry: hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); - err = register_shrinker(&s->s_shrink); - if (err) { - deactivate_locked_super(s); - s = ERR_PTR(err); - } + register_shrinker_prepared(&s->s_shrink); return s; } @@ -574,6 +574,28 @@ void drop_super_exclusive(struct super_block *sb) } EXPORT_SYMBOL(drop_super_exclusive); +static void __iterate_supers(void (*f)(struct super_block *)) +{ + struct super_block *sb, *p = NULL; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (hlist_unhashed(&sb->s_instances)) + continue; + sb->s_count++; + spin_unlock(&sb_lock); + + f(sb); + + spin_lock(&sb_lock); + if (p) + __put_super(p); + p = sb; + } + if (p) + __put_super(p); + spin_unlock(&sb_lock); +} /** * iterate_supers - call function for all active superblocks * @f: function to call @@ -881,33 +903,22 @@ cancel_readonly: return retval; } -static void do_emergency_remount(struct work_struct *work) +static void do_emergency_remount_callback(struct super_block *sb) { - struct super_block *sb, *p = NULL; - - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; - sb->s_count++; - spin_unlock(&sb_lock); - down_write(&sb->s_umount); - if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) && - !sb_rdonly(sb)) { - /* - * What lock protects sb->s_flags?? - */ - do_remount_sb(sb, SB_RDONLY, NULL, 1); - } - up_write(&sb->s_umount); - spin_lock(&sb_lock); - if (p) - __put_super(p); - p = sb; + down_write(&sb->s_umount); + if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) && + !sb_rdonly(sb)) { + /* + * What lock protects sb->s_flags?? + */ + do_remount_sb(sb, SB_RDONLY, NULL, 1); } - if (p) - __put_super(p); - spin_unlock(&sb_lock); + up_write(&sb->s_umount); +} + +static void do_emergency_remount(struct work_struct *work) +{ + __iterate_supers(do_emergency_remount_callback); kfree(work); printk("Emergency Remount complete\n"); } @@ -923,6 +934,40 @@ void emergency_remount(void) } } +static void do_thaw_all_callback(struct super_block *sb) +{ + down_write(&sb->s_umount); + if (sb->s_root && sb->s_flags & MS_BORN) { + emergency_thaw_bdev(sb); + thaw_super_locked(sb); + } else { + up_write(&sb->s_umount); + } +} + +static void do_thaw_all(struct work_struct *work) +{ + __iterate_supers(do_thaw_all_callback); + kfree(work); + printk(KERN_WARNING "Emergency Thaw complete\n"); +} + +/** + * emergency_thaw_all -- forcibly thaw every frozen filesystem + * + * Used for emergency unfreeze of all filesystems via SysRq + */ +void emergency_thaw_all(void) +{ + struct work_struct *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + INIT_WORK(work, do_thaw_all); + schedule_work(work); + } +} + /* * Unnamed block devices are dummy devices used by virtual * filesystems which don't use real block-devices. -- jrs @@ -1492,11 +1537,10 @@ EXPORT_SYMBOL(freeze_super); * * Unlocks the filesystem and marks it writeable again after freeze_super(). */ -int thaw_super(struct super_block *sb) +static int thaw_super_locked(struct super_block *sb) { int error; - down_write(&sb->s_umount); if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) { up_write(&sb->s_umount); return -EINVAL; @@ -1527,4 +1571,10 @@ out: deactivate_locked_super(sb); return 0; } + +int thaw_super(struct super_block *sb) +{ + down_write(&sb->s_umount); + return thaw_super_locked(sb); +} EXPORT_SYMBOL(thaw_super); diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index f897e55..16a8ad2 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -28,6 +28,9 @@ #include "udf_sb.h" +#define SURROGATE_MASK 0xfffff800 +#define SURROGATE_PAIR 0x0000d800 + static int udf_uni2char_utf8(wchar_t uni, unsigned char *out, int boundlen) @@ -37,6 +40,9 @@ static int udf_uni2char_utf8(wchar_t uni, if (boundlen <= 0) return -ENAMETOOLONG; + if ((uni & SURROGATE_MASK) == SURROGATE_PAIR) + return -EINVAL; + if (uni < 0x80) { out[u_len++] = (unsigned char)uni; } else if (uni < 0x800) { diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 39387bd..4bcc095 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1947,7 +1947,7 @@ void xfs_alloc_compute_maxlevels( xfs_mount_t *mp) /* file system mount structure */ { - mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_alloc_mnr, + mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr, (mp->m_sb.sb_agblocks + 1) / 2); } @@ -1959,7 +1959,6 @@ xfs_alloc_compute_maxlevels( */ xfs_extlen_t xfs_alloc_longest_free_extent( - struct xfs_mount *mp, struct xfs_perag *pag, xfs_extlen_t need, xfs_extlen_t reserved) @@ -2038,8 +2037,7 @@ xfs_alloc_space_available( /* do we have enough contiguous free space for the allocation? */ alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop; - longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free, - reservation); + longest = xfs_alloc_longest_free_extent(pag, min_free, reservation); if (longest < alloc_len) return false; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index a311a24..cbf789e 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -116,9 +116,8 @@ xfs_alloc_allow_busy_reuse(int datatype) unsigned int xfs_alloc_set_aside(struct xfs_mount *mp); unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp); -xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, - struct xfs_perag *pag, xfs_extlen_t need, - xfs_extlen_t reserved); +xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_perag *pag, + xfs_extlen_t need, xfs_extlen_t reserved); unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, struct xfs_perag *pag); diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index ce4a34a..35a1244 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -511,7 +511,14 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) if (args->flags & ATTR_CREATE) return retval; retval = xfs_attr_shortform_remove(args); - ASSERT(retval == 0); + if (retval) + return retval; + /* + * Since we have removed the old attr, clear ATTR_REPLACE so + * that the leaf format add routine won't trip over the attr + * not being around. + */ + args->flags &= ~ATTR_REPLACE; } if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 3b03d88..040eeda 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -725,12 +725,16 @@ xfs_bmap_extents_to_btree( *logflagsp = 0; if ((error = xfs_alloc_vextent(&args))) { xfs_iroot_realloc(ip, -1, whichfork); + ASSERT(ifp->if_broot == NULL); + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { xfs_iroot_realloc(ip, -1, whichfork); + ASSERT(ifp->if_broot == NULL); + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return -ENOSPC; } @@ -3225,7 +3229,7 @@ xfs_bmap_longest_free_extent( } } - longest = xfs_alloc_longest_free_extent(mp, pag, + longest = xfs_alloc_longest_free_extent(pag, xfs_alloc_min_freelist(mp, pag), xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (*blen < longest) @@ -5667,7 +5671,6 @@ xfs_bmap_collapse_extents( xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, - xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, struct xfs_defer_ops *dfops) { diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index f3be641..2b766b3 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -228,7 +228,7 @@ void xfs_bmap_del_extent_cow(struct xfs_inode *ip, uint xfs_default_attroffset(struct xfs_inode *ip); int xfs_bmap_collapse_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, - bool *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, + bool *done, xfs_fsblock_t *firstblock, struct xfs_defer_ops *dfops); int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index edc0193..ac7d664 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4531,7 +4531,6 @@ xfs_btree_sblock_verify( */ uint xfs_btree_compute_maxlevels( - struct xfs_mount *mp, uint *limits, unsigned long len) { @@ -4839,7 +4838,6 @@ xfs_btree_query_all( */ xfs_extlen_t xfs_btree_calc_size( - struct xfs_mount *mp, uint *limits, unsigned long long len) { diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 58e30c0..9227159 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -481,10 +481,8 @@ xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp, xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp, unsigned int max_recs); -uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits, - unsigned long len); -xfs_extlen_t xfs_btree_calc_size(struct xfs_mount *mp, uint *limits, - unsigned long long len); +uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len); +xfs_extlen_t xfs_btree_calc_size(uint *limits, unsigned long long len); /* return codes */ #define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 0e2cf5f..de627fa 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2406,7 +2406,7 @@ xfs_ialloc_compute_maxlevels( uint inodes; inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG; - mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_inobt_mnr, + mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp->m_inobt_mnr, inodes); } diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index a2dd7f4..367e9a0 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -556,7 +556,7 @@ xfs_inobt_max_size( if (mp->m_inobt_mxr[0] == 0) return 0; - return xfs_btree_calc_size(mp, mp->m_inobt_mnr, + return xfs_btree_calc_size(mp->m_inobt_mnr, (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock / XFS_INODES_PER_CHUNK); } diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index ef68b1d..1201107 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -466,6 +466,8 @@ xfs_dinode_verify( return __this_address; if (di_size > XFS_DFORK_DSIZE(dip, mp)) return __this_address; + if (dip->di_nextents) + return __this_address; /* fall through */ case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: @@ -484,12 +486,31 @@ xfs_dinode_verify( if (XFS_DFORK_Q(dip)) { switch (dip->di_aformat) { case XFS_DINODE_FMT_LOCAL: + if (dip->di_anextents) + return __this_address; + /* fall through */ case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: break; default: return __this_address; } + } else { + /* + * If there is no fork offset, this may be a freshly-made inode + * in a new disk cluster, in which case di_aformat is zeroed. + * Otherwise, such an inode must be in EXTENTS format; this goes + * for freed inodes as well. + */ + switch (dip->di_aformat) { + case 0: + case XFS_DINODE_FMT_EXTENTS: + break; + default: + return __this_address; + } + if (dip->di_anextents) + return __this_address; } /* only version 3 or greater inodes are extensively verified here */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index bee68c2..560e284 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -351,7 +351,6 @@ xfs_refcount_merge_center_extents( struct xfs_refcount_irec *center, struct xfs_refcount_irec *right, unsigned long long extlen, - xfs_agblock_t *agbno, xfs_extlen_t *aglen) { int error; @@ -471,7 +470,6 @@ xfs_refcount_merge_right_extent( struct xfs_btree_cur *cur, struct xfs_refcount_irec *right, struct xfs_refcount_irec *cright, - xfs_agblock_t *agbno, xfs_extlen_t *aglen) { int error; @@ -749,7 +747,7 @@ xfs_refcount_merge_extents( ulen < MAXREFCEXTLEN) { *shape_changed = true; return xfs_refcount_merge_center_extents(cur, &left, &cleft, - &right, ulen, agbno, aglen); + &right, ulen, aglen); } /* Try to merge left and cleft. */ @@ -778,7 +776,7 @@ xfs_refcount_merge_extents( ulen < MAXREFCEXTLEN) { *shape_changed = true; return xfs_refcount_merge_right_extent(cur, &right, &cright, - agbno, aglen); + aglen); } return error; @@ -1356,9 +1354,7 @@ xfs_refcount_adjust_cow_extents( struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t aglen, - enum xfs_refc_adjust_op adj, - struct xfs_defer_ops *dfops, - struct xfs_owner_info *oinfo) + enum xfs_refc_adjust_op adj) { struct xfs_refcount_irec ext, tmp; int error; @@ -1437,8 +1433,7 @@ xfs_refcount_adjust_cow( struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t aglen, - enum xfs_refc_adjust_op adj, - struct xfs_defer_ops *dfops) + enum xfs_refc_adjust_op adj) { bool shape_changed; int error; @@ -1465,8 +1460,7 @@ xfs_refcount_adjust_cow( goto out_error; /* Now that we've taken care of the ends, adjust the middle extents */ - error = xfs_refcount_adjust_cow_extents(cur, agbno, aglen, adj, - dfops, NULL); + error = xfs_refcount_adjust_cow_extents(cur, agbno, aglen, adj); if (error) goto out_error; @@ -1493,7 +1487,7 @@ __xfs_refcount_cow_alloc( /* Add refcount btree reservation */ return xfs_refcount_adjust_cow(rcur, agbno, aglen, - XFS_REFCOUNT_ADJUST_COW_ALLOC, dfops); + XFS_REFCOUNT_ADJUST_COW_ALLOC); } /* @@ -1511,7 +1505,7 @@ __xfs_refcount_cow_free( /* Remove refcount btree reservation */ return xfs_refcount_adjust_cow(rcur, agbno, aglen, - XFS_REFCOUNT_ADJUST_COW_FREE, dfops); + XFS_REFCOUNT_ADJUST_COW_FREE); } /* Record a CoW staging extent in the refcount btree. */ @@ -1568,7 +1562,7 @@ struct xfs_refcount_recovery { /* Stuff an extent on the recovery list. */ STATIC int xfs_refcount_recover_extent( - struct xfs_btree_cur *cur, + struct xfs_btree_cur *cur, union xfs_btree_rec *rec, void *priv) { diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 265fdce..375abfe 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -373,7 +373,6 @@ xfs_refcountbt_init_cursor( */ int xfs_refcountbt_maxrecs( - struct xfs_mount *mp, int blocklen, bool leaf) { @@ -390,7 +389,7 @@ void xfs_refcountbt_compute_maxlevels( struct xfs_mount *mp) { - mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(mp, + mp->m_refc_maxlevels = xfs_btree_compute_maxlevels( mp->m_refc_mnr, mp->m_sb.sb_agblocks); } @@ -400,7 +399,7 @@ xfs_refcountbt_calc_size( struct xfs_mount *mp, unsigned long long len) { - return xfs_btree_calc_size(mp, mp->m_refc_mnr, len); + return xfs_btree_calc_size(mp->m_refc_mnr, len); } /* diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index 9db008b..2bc4694 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -60,8 +60,7 @@ struct xfs_mount; extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno, struct xfs_defer_ops *dfops); -extern int xfs_refcountbt_maxrecs(struct xfs_mount *mp, int blocklen, - bool leaf); +extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf); extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 79822cf..fba8d27 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -376,7 +376,6 @@ xfs_rmap_free_check_owner( struct xfs_mount *mp, uint64_t ltoff, struct xfs_rmap_irec *rec, - xfs_fsblock_t bno, xfs_filblks_t len, uint64_t owner, uint64_t offset, @@ -519,7 +518,7 @@ xfs_rmap_unmap( bno + len, out_error); /* Check owner information. */ - error = xfs_rmap_free_check_owner(mp, ltoff, <rec, bno, len, owner, + error = xfs_rmap_free_check_owner(mp, ltoff, <rec, len, owner, offset, flags); if (error) goto out_error; diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 8b0d0de..d756e0b 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -499,7 +499,6 @@ xfs_rmapbt_init_cursor( */ int xfs_rmapbt_maxrecs( - struct xfs_mount *mp, int blocklen, int leaf) { @@ -534,7 +533,7 @@ xfs_rmapbt_compute_maxlevels( if (xfs_sb_version_hasreflink(&mp->m_sb)) mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS; else - mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp, + mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels( mp->m_rmap_mnr, mp->m_sb.sb_agblocks); } @@ -544,7 +543,7 @@ xfs_rmapbt_calc_size( struct xfs_mount *mp, unsigned long long len) { - return xfs_btree_calc_size(mp, mp->m_rmap_mnr, len); + return xfs_btree_calc_size(mp->m_rmap_mnr, len); } /* diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index 19c08e9..d68d96e 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -55,7 +55,7 @@ struct xfs_mount; struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *bp, xfs_agnumber_t agno); -int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf); +int xfs_rmapbt_maxrecs(int blocklen, int leaf); extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 53433cc..d9b94bd 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -756,15 +756,13 @@ xfs_sb_mount_common( mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; - mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, 1); - mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 1); + mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 0); mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2; mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2; - mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, - true); - mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, - false); + mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, true); + mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, false); mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2; mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 5f17641..3bccdf7 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -734,8 +734,7 @@ xfs_calc_clear_agi_bucket_reservation( * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot) */ STATIC uint -xfs_calc_qm_setqlim_reservation( - struct xfs_mount *mp) +xfs_calc_qm_setqlim_reservation(void) { return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot)); } @@ -772,8 +771,7 @@ xfs_calc_qm_quotaoff_reservation( * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 */ STATIC uint -xfs_calc_qm_quotaoff_end_reservation( - struct xfs_mount *mp) +xfs_calc_qm_quotaoff_end_reservation(void) { return sizeof(struct xfs_qoff_logitem) * 2; } @@ -877,14 +875,14 @@ xfs_trans_resv_calc( * The following transactions are logged in logical format with * a default log count. */ - resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp); + resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(); resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT; resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp); resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; resp->tr_qm_equotaoff.tr_logres = - xfs_calc_qm_quotaoff_end_reservation(mp); + xfs_calc_qm_quotaoff_end_reservation(); resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp); diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index e5fb008..2203465 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -53,6 +53,25 @@ xfs_bui_item_free( kmem_zone_free(xfs_bui_zone, buip); } +/* + * Freeing the BUI requires that we remove it from the AIL if it has already + * been placed there. However, the BUI may not yet have been placed in the AIL + * when called by xfs_bui_release() from BUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the BUI. + */ +void +xfs_bui_release( + struct xfs_bui_log_item *buip) +{ + ASSERT(atomic_read(&buip->bui_refcount) > 0); + if (atomic_dec_and_test(&buip->bui_refcount)) { + xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_bui_item_free(buip); + } +} + + STATIC void xfs_bui_item_size( struct xfs_log_item *lip, @@ -142,7 +161,7 @@ xfs_bui_item_unlock( struct xfs_log_item *lip) { if (lip->li_flags & XFS_LI_ABORTED) - xfs_bui_item_free(BUI_ITEM(lip)); + xfs_bui_release(BUI_ITEM(lip)); } /* @@ -206,24 +225,6 @@ xfs_bui_init( return buip; } -/* - * Freeing the BUI requires that we remove it from the AIL if it has already - * been placed there. However, the BUI may not yet have been placed in the AIL - * when called by xfs_bui_release() from BUD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the reference - * count to ensure only the last caller frees the BUI. - */ -void -xfs_bui_release( - struct xfs_bui_log_item *buip) -{ - ASSERT(atomic_read(&buip->bui_refcount) > 0); - if (atomic_dec_and_test(&buip->bui_refcount)) { - xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_bui_item_free(buip); - } -} - static inline struct xfs_bud_log_item *BUD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_bud_log_item, bud_item); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 05dee8f..8cd8c41 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1326,7 +1326,6 @@ xfs_collapse_file_space( int error; struct xfs_defer_ops dfops; xfs_fsblock_t first_block; - xfs_fileoff_t stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size); xfs_fileoff_t next_fsb = XFS_B_TO_FSB(mp, offset + len); xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len); uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); @@ -1361,7 +1360,7 @@ xfs_collapse_file_space( xfs_defer_init(&dfops, &first_block); error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb, - &done, stop_fsb, &first_block, &dfops); + &done, &first_block, &dfops); if (error) goto out_bmap_cancel; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index ac669a1..55661cb 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1754,7 +1754,6 @@ xfs_buftarg_shrink_count( void xfs_free_buftarg( - struct xfs_mount *mp, struct xfs_buftarg *btp) { unregister_shrinker(&btp->bt_shrinker); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 2f4c914..edced16 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -388,7 +388,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) */ extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, struct block_device *, struct dax_device *); -extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); +extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index b2cde54..7b68e6c 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -50,19 +50,19 @@ xfs_trim_extents( pag = xfs_perag_get(mp, agno); - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); - if (error || !agbp) - goto out_put_perag; - - cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); - /* * Force out the log. This means any transactions that might have freed - * space before we took the AGF buffer lock are now on disk, and the + * space before we take the AGF buffer lock are now on disk, and the * volatile disk cache is flushed. */ xfs_log_force(mp, XFS_LOG_SYNC); + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error || !agbp) + goto out_put_perag; + + cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); + /* * Look up the longest btree in the AGF and start with it. */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 64da906..b5b1e56 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -51,6 +51,24 @@ xfs_efi_item_free( } /* + * Freeing the efi requires that we remove it from the AIL if it has already + * been placed there. However, the EFI may not yet have been placed in the AIL + * when called by xfs_efi_release() from EFD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the EFI. + */ +void +xfs_efi_release( + struct xfs_efi_log_item *efip) +{ + ASSERT(atomic_read(&efip->efi_refcount) > 0); + if (atomic_dec_and_test(&efip->efi_refcount)) { + xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); + xfs_efi_item_free(efip); + } +} + +/* * This returns the number of iovecs needed to log the given efi item. * We only need 1 iovec for an efi item. It just logs the efi_log_format * structure. @@ -151,7 +169,7 @@ xfs_efi_item_unlock( struct xfs_log_item *lip) { if (lip->li_flags & XFS_LI_ABORTED) - xfs_efi_item_free(EFI_ITEM(lip)); + xfs_efi_release(EFI_ITEM(lip)); } /* @@ -279,24 +297,6 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) return -EFSCORRUPTED; } -/* - * Freeing the efi requires that we remove it from the AIL if it has already - * been placed there. However, the EFI may not yet have been placed in the AIL - * when called by xfs_efi_release() from EFD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the reference - * count to ensure only the last caller frees the EFI. - */ -void -xfs_efi_release( - struct xfs_efi_log_item *efip) -{ - ASSERT(atomic_read(&efip->efi_refcount) > 0); - if (atomic_dec_and_test(&efip->efi_refcount)) { - xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); - xfs_efi_item_free(efip); - } -} - static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_efd_log_item, efd_item); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 299aee4..e70fb8c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -778,22 +778,26 @@ xfs_file_fallocate( if (error) goto out_unlock; } else if (mode & FALLOC_FL_INSERT_RANGE) { - unsigned int blksize_mask = i_blocksize(inode) - 1; + unsigned int blksize_mask = i_blocksize(inode) - 1; + loff_t isize = i_size_read(inode); - new_size = i_size_read(inode) + len; if (offset & blksize_mask || len & blksize_mask) { error = -EINVAL; goto out_unlock; } - /* check the new inode size does not wrap through zero */ - if (new_size > inode->i_sb->s_maxbytes) { + /* + * New inode size must not exceed ->s_maxbytes, accounting for + * possible signed overflow. + */ + if (inode->i_sb->s_maxbytes - isize < len) { error = -EFBIG; goto out_unlock; } + new_size = isize + len; /* Offset should be less than i_size */ - if (offset >= i_size_read(inode)) { + if (offset >= isize) { error = -EINVAL; goto out_unlock; } @@ -876,8 +880,18 @@ xfs_file_dedupe_range( struct file *dst_file, u64 dst_loff) { + struct inode *srci = file_inode(src_file); + u64 max_dedupe; int error; + /* + * Since we have to read all these pages in to compare them, cut + * it off at MAX_RW_COUNT/2 rounded down to the nearest block. + * That means we won't do more than MAX_RW_COUNT IO per request. + */ + max_dedupe = (MAX_RW_COUNT >> 1) & ~(i_blocksize(srci) - 1); + if (len > max_dedupe) + len = max_dedupe; error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff, len, true); if (error) diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 043ca380..3f8722e 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -34,7 +34,6 @@ struct xfs_fstrm_item { struct xfs_mru_cache_elem mru; - struct xfs_inode *ip; xfs_agnumber_t ag; /* AG in use for this directory */ }; @@ -122,14 +121,15 @@ xfs_filestream_put_ag( static void xfs_fstrm_free_func( + void *data, struct xfs_mru_cache_elem *mru) { + struct xfs_mount *mp = data; struct xfs_fstrm_item *item = container_of(mru, struct xfs_fstrm_item, mru); - xfs_filestream_put_ag(item->ip->i_mount, item->ag); - - trace_xfs_filestream_free(item->ip, item->ag); + xfs_filestream_put_ag(mp, item->ag); + trace_xfs_filestream_free(mp, mru->key, item->ag); kmem_free(item); } @@ -165,7 +165,7 @@ xfs_filestream_pick_ag( trylock = XFS_ALLOC_FLAG_TRYLOCK; for (nscan = 0; 1; nscan++) { - trace_xfs_filestream_scan(ip, ag); + trace_xfs_filestream_scan(mp, ip->i_ino, ag); pag = xfs_perag_get(mp, ag); @@ -198,7 +198,7 @@ xfs_filestream_pick_ag( goto next_ag; } - longest = xfs_alloc_longest_free_extent(mp, pag, + longest = xfs_alloc_longest_free_extent(pag, xfs_alloc_min_freelist(mp, pag), xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (((minlen && longest >= minlen) || @@ -265,7 +265,6 @@ next_ag: goto out_put_ag; item->ag = *agp; - item->ip = ip; err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru); if (err) { @@ -333,7 +332,7 @@ xfs_filestream_lookup_ag( ag = container_of(mru, struct xfs_fstrm_item, mru)->ag; xfs_mru_cache_done(mp->m_filestream); - trace_xfs_filestream_lookup(ip, ag); + trace_xfs_filestream_lookup(mp, ip->i_ino, ag); goto out; } @@ -399,7 +398,7 @@ xfs_filestream_new_ag( * Only free the item here so we skip over the old AG earlier. */ if (mru) - xfs_fstrm_free_func(mru); + xfs_fstrm_free_func(mp, mru); IRELE(pip); exit: @@ -426,8 +425,8 @@ xfs_filestream_mount( * timer tunable to within about 10 percent. This requires at least 10 * groups. */ - return xfs_mru_cache_create(&mp->m_filestream, xfs_fstrm_centisecs * 10, - 10, xfs_fstrm_free_func); + return xfs_mru_cache_create(&mp->m_filestream, mp, + xfs_fstrm_centisecs * 10, 10, xfs_fstrm_free_func); } void diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3e3aab3..2b70c8b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -972,10 +972,8 @@ xfs_dir_ialloc( xfs_nlink_t nlink, dev_t rdev, prid_t prid, /* project id */ - xfs_inode_t **ipp, /* pointer to inode; it will be + xfs_inode_t **ipp) /* pointer to inode; it will be locked. */ - int *committed) - { xfs_trans_t *tp; xfs_inode_t *ip; @@ -1050,8 +1048,6 @@ xfs_dir_ialloc( } code = xfs_trans_roll(&tp); - if (committed != NULL) - *committed = 1; /* * Re-attach the quota info that we detached from prev trx. @@ -1088,9 +1084,6 @@ xfs_dir_ialloc( } ASSERT(!ialloc_context && ip); - } else { - if (committed != NULL) - *committed = 0; } *ipp = ip; @@ -1217,8 +1210,7 @@ xfs_create( * entry pointing to them, but a directory also the "." entry * pointing to itself. */ - error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip, - NULL); + error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip); if (error) goto out_trans_cancel; @@ -1309,7 +1301,6 @@ xfs_create( int xfs_create_tmpfile( struct xfs_inode *dp, - struct dentry *dentry, umode_t mode, struct xfs_inode **ipp) { @@ -1351,7 +1342,7 @@ xfs_create_tmpfile( if (error) goto out_trans_cancel; - error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip, NULL); + error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip); if (error) goto out_trans_cancel; @@ -1611,13 +1602,15 @@ xfs_itruncate_extents( goto out; } - /* Remove all pending CoW reservations. */ - error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block, - last_block, true); - if (error) - goto out; + if (whichfork == XFS_DATA_FORK) { + /* Remove all pending CoW reservations. */ + error = xfs_reflink_cancel_cow_blocks(ip, &tp, + first_unmap_block, last_block, true); + if (error) + goto out; - xfs_itruncate_clear_reflink_flags(ip); + xfs_itruncate_clear_reflink_flags(ip); + } /* * Always re-log the inode so that our permanent transaction can keep @@ -2903,7 +2896,7 @@ xfs_rename_alloc_whiteout( struct xfs_inode *tmpfile; int error; - error = xfs_create_tmpfile(dp, NULL, S_IFCHR | WHITEOUT_MODE, &tmpfile); + error = xfs_create_tmpfile(dp, S_IFCHR | WHITEOUT_MODE, &tmpfile); if (error) return error; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 132d8aa..1eebc53 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -393,8 +393,8 @@ int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode, dev_t rdev, struct xfs_inode **ipp); -int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry, - umode_t mode, struct xfs_inode **ipp); +int xfs_create_tmpfile(struct xfs_inode *dp, umode_t mode, + struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, @@ -431,7 +431,7 @@ xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, xfs_nlink_t, dev_t, prid_t, - struct xfs_inode **, int *); + struct xfs_inode **); /* from xfs_file.c */ enum xfs_prealloc_flags { diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 154725b..a3ed3c8 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -177,7 +177,7 @@ xfs_generic_create( if (!tmpfile) { error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); } else { - error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); + error = xfs_create_tmpfile(XFS_I(dir), mode, &ip); } if (unlikely(error)) goto out_free_acl; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index b9c9c84..2fcd9ed 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -560,7 +560,6 @@ xfs_log_done( */ int xfs_log_notify( - struct xfs_mount *mp, struct xlog_in_core *iclog, xfs_log_callback_t *cb) { diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 7e2d629..fa8ad31 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -141,8 +141,7 @@ int xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); -int xfs_log_notify(struct xfs_mount *mp, - struct xlog_in_core *iclog, +int xfs_log_notify(struct xlog_in_core *iclog, struct xfs_log_callback *callback_entry); int xfs_log_release_iclog(struct xfs_mount *mp, struct xlog_in_core *iclog); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index cb376ac..4668403 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -848,7 +848,7 @@ restart: /* attach all the transactions w/ busy extents to iclog */ ctx->log_cb.cb_func = xlog_cil_committed; ctx->log_cb.cb_arg = ctx; - error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb); + error = xfs_log_notify(commit_iclog, &ctx->log_cb); if (error) goto out_abort; diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index f8a674d..70eea7a 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -112,6 +112,7 @@ struct xfs_mru_cache { xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */ struct delayed_work work; /* Workqueue data for reaping. */ unsigned int queued; /* work has been queued */ + void *data; }; static struct workqueue_struct *xfs_mru_reap_wq; @@ -259,7 +260,7 @@ _xfs_mru_cache_clear_reap_list( list_for_each_entry_safe(elem, next, &tmp, list_node) { list_del_init(&elem->list_node); - mru->free_func(elem); + mru->free_func(mru->data, elem); } spin_lock(&mru->lock); @@ -326,6 +327,7 @@ xfs_mru_cache_uninit(void) int xfs_mru_cache_create( struct xfs_mru_cache **mrup, + void *data, unsigned int lifetime_ms, unsigned int grp_count, xfs_mru_cache_free_func_t free_func) @@ -369,7 +371,7 @@ xfs_mru_cache_create( mru->grp_time = grp_time; mru->free_func = free_func; - + mru->data = data; *mrup = mru; exit: @@ -492,7 +494,7 @@ xfs_mru_cache_delete( elem = xfs_mru_cache_remove(mru, key); if (elem) - mru->free_func(elem); + mru->free_func(mru->data, elem); } /* diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h index fb5245b..b3f3fbd 100644 --- a/fs/xfs/xfs_mru_cache.h +++ b/fs/xfs/xfs_mru_cache.h @@ -26,13 +26,13 @@ struct xfs_mru_cache_elem { }; /* Function pointer type for callback to free a client's data pointer. */ -typedef void (*xfs_mru_cache_free_func_t)(struct xfs_mru_cache_elem *elem); +typedef void (*xfs_mru_cache_free_func_t)(void *, struct xfs_mru_cache_elem *); int xfs_mru_cache_init(void); void xfs_mru_cache_uninit(void); -int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms, - unsigned int grp_count, - xfs_mru_cache_free_func_t free_func); +int xfs_mru_cache_create(struct xfs_mru_cache **mrup, void *data, + unsigned int lifetime_ms, unsigned int grp_count, + xfs_mru_cache_free_func_t free_func); void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, struct xfs_mru_cache_elem *elem); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 5b848f4..ec39ae2 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -748,7 +748,6 @@ xfs_qm_qino_alloc( { xfs_trans_t *tp; int error; - int committed; bool need_alloc = true; *ip = NULL; @@ -788,8 +787,7 @@ xfs_qm_qino_alloc( return error; if (need_alloc) { - error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip, - &committed); + error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip); if (error) { xfs_trans_cancel(tp); return error; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 7a39f40..15c9393 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -52,6 +52,25 @@ xfs_cui_item_free( kmem_zone_free(xfs_cui_zone, cuip); } +/* + * Freeing the CUI requires that we remove it from the AIL if it has already + * been placed there. However, the CUI may not yet have been placed in the AIL + * when called by xfs_cui_release() from CUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the CUI. + */ +void +xfs_cui_release( + struct xfs_cui_log_item *cuip) +{ + ASSERT(atomic_read(&cuip->cui_refcount) > 0); + if (atomic_dec_and_test(&cuip->cui_refcount)) { + xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_cui_item_free(cuip); + } +} + + STATIC void xfs_cui_item_size( struct xfs_log_item *lip, @@ -141,7 +160,7 @@ xfs_cui_item_unlock( struct xfs_log_item *lip) { if (lip->li_flags & XFS_LI_ABORTED) - xfs_cui_item_free(CUI_ITEM(lip)); + xfs_cui_release(CUI_ITEM(lip)); } /* @@ -211,24 +230,6 @@ xfs_cui_init( return cuip; } -/* - * Freeing the CUI requires that we remove it from the AIL if it has already - * been placed there. However, the CUI may not yet have been placed in the AIL - * when called by xfs_cui_release() from CUD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the reference - * count to ensure only the last caller frees the CUI. - */ -void -xfs_cui_release( - struct xfs_cui_log_item *cuip) -{ - ASSERT(atomic_read(&cuip->cui_refcount) > 0); - if (atomic_dec_and_test(&cuip->cui_refcount)) { - xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_cui_item_free(cuip); - } -} - static inline struct xfs_cud_log_item *CUD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_cud_log_item, cud_item); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 49d3124..06a0784 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -52,6 +52,24 @@ xfs_rui_item_free( kmem_zone_free(xfs_rui_zone, ruip); } +/* + * Freeing the RUI requires that we remove it from the AIL if it has already + * been placed there. However, the RUI may not yet have been placed in the AIL + * when called by xfs_rui_release() from RUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the RUI. + */ +void +xfs_rui_release( + struct xfs_rui_log_item *ruip) +{ + ASSERT(atomic_read(&ruip->rui_refcount) > 0); + if (atomic_dec_and_test(&ruip->rui_refcount)) { + xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_rui_item_free(ruip); + } +} + STATIC void xfs_rui_item_size( struct xfs_log_item *lip, @@ -141,7 +159,7 @@ xfs_rui_item_unlock( struct xfs_log_item *lip) { if (lip->li_flags & XFS_LI_ABORTED) - xfs_rui_item_free(RUI_ITEM(lip)); + xfs_rui_release(RUI_ITEM(lip)); } /* @@ -233,24 +251,6 @@ xfs_rui_copy_format( return 0; } -/* - * Freeing the RUI requires that we remove it from the AIL if it has already - * been placed there. However, the RUI may not yet have been placed in the AIL - * when called by xfs_rui_release() from RUD processing due to the ordering of - * committed vs unpin operations in bulk insert operations. Hence the reference - * count to ensure only the last caller frees the RUI. - */ -void -xfs_rui_release( - struct xfs_rui_log_item *ruip) -{ - ASSERT(atomic_read(&ruip->rui_refcount) > 0); - if (atomic_dec_and_test(&ruip->rui_refcount)) { - xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); - xfs_rui_item_free(ruip); - } -} - static inline struct xfs_rud_log_item *RUD_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_rud_log_item, rud_item); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 612c1d5..d714240 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -722,7 +722,7 @@ xfs_close_devices( struct block_device *logdev = mp->m_logdev_targp->bt_bdev; struct dax_device *dax_logdev = mp->m_logdev_targp->bt_daxdev; - xfs_free_buftarg(mp, mp->m_logdev_targp); + xfs_free_buftarg(mp->m_logdev_targp); xfs_blkdev_put(logdev); fs_put_dax(dax_logdev); } @@ -730,11 +730,11 @@ xfs_close_devices( struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; struct dax_device *dax_rtdev = mp->m_rtdev_targp->bt_daxdev; - xfs_free_buftarg(mp, mp->m_rtdev_targp); + xfs_free_buftarg(mp->m_rtdev_targp); xfs_blkdev_put(rtdev); fs_put_dax(dax_rtdev); } - xfs_free_buftarg(mp, mp->m_ddev_targp); + xfs_free_buftarg(mp->m_ddev_targp); fs_put_dax(dax_ddev); } @@ -808,9 +808,9 @@ xfs_open_devices( out_free_rtdev_targ: if (mp->m_rtdev_targp) - xfs_free_buftarg(mp, mp->m_rtdev_targp); + xfs_free_buftarg(mp->m_rtdev_targp); out_free_ddev_targ: - xfs_free_buftarg(mp, mp->m_ddev_targp); + xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: xfs_blkdev_put(rtdev); fs_put_dax(dax_rtdev); @@ -1247,7 +1247,6 @@ xfs_quiesce_attr( STATIC int xfs_test_remount_options( struct super_block *sb, - struct xfs_mount *mp, char *options) { int error = 0; @@ -1278,7 +1277,7 @@ xfs_fs_remount( int error; /* First, check for complete junk; i.e. invalid options */ - error = xfs_test_remount_options(sb, mp, options); + error = xfs_test_remount_options(sb, options); if (error) return error; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 2e9e793..5b66ac1 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -264,7 +264,7 @@ xfs_symlink( * Allocate an inode for the symlink. */ error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, - prid, &ip, NULL); + prid, &ip); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a982c0b..8955254 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -506,8 +506,8 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); DECLARE_EVENT_CLASS(xfs_filestream_class, - TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), - TP_ARGS(ip, agno), + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, xfs_agnumber_t agno), + TP_ARGS(mp, ino, agno), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) @@ -515,10 +515,10 @@ DECLARE_EVENT_CLASS(xfs_filestream_class, __field(int, streams) ), TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; __entry->agno = agno; - __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno); + __entry->streams = xfs_filestream_peek_ag(mp, agno); ), TP_printk("dev %d:%d ino 0x%llx agno %u streams %d", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -528,8 +528,8 @@ DECLARE_EVENT_CLASS(xfs_filestream_class, ) #define DEFINE_FILESTREAM_EVENT(name) \ DEFINE_EVENT(xfs_filestream_class, name, \ - TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), \ - TP_ARGS(ip, agno)) + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, xfs_agnumber_t agno), \ + TP_ARGS(mp, ino, agno)) DEFINE_FILESTREAM_EVENT(xfs_filestream_free); DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup); DEFINE_FILESTREAM_EVENT(xfs_filestream_scan); |