From ef95d31e6de6be9602ce950b85fb7ab8af46ae42 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Mar 2009 20:33:17 -0400 Subject: NFS: Fix misparsing of nfsv4 fs_locations attribute (take 2) The changeset ea31a4437c59219bf3ea946d58984b01a45a289c (nfs: Fix misparsing of nfsv4 fs_locations attribute) causes the mountpath that is calculated at the beginning of try_location() to be clobbered when we later strncpy a non-nul terminated hostname using an incorrect buffer length. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4namespace.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 30befc3..2a2a0a7 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -21,7 +21,9 @@ #define NFSDBG_FACILITY NFSDBG_VFS /* - * Check if fs_root is valid + * Convert the NFSv4 pathname components into a standard posix path. + * + * Note that the resulting string will be placed at the end of the buffer */ static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname, char *buffer, ssize_t buflen) @@ -99,21 +101,20 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, { struct vfsmount *mnt = ERR_PTR(-ENOENT); char *mnt_path; - int page2len; + unsigned int maxbuflen; unsigned int s; mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); if (IS_ERR(mnt_path)) return mnt; mountdata->mnt_path = mnt_path; - page2 += strlen(mnt_path) + 1; - page2len = PAGE_SIZE - strlen(mnt_path) - 1; + maxbuflen = mnt_path - 1 - page2; for (s = 0; s < location->nservers; s++) { const struct nfs4_string *buf = &location->servers[s]; struct sockaddr_storage addr; - if (buf->len <= 0 || buf->len >= PAGE_SIZE) + if (buf->len <= 0 || buf->len >= maxbuflen) continue; mountdata->addr = (struct sockaddr *)&addr; @@ -126,8 +127,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, continue; nfs_set_port(mountdata->addr, NFS_PORT); - strncpy(page2, buf->data, page2len); - page2[page2len] = '\0'; + memcpy(page2, buf->data, buf->len); + page2[buf->len] = '\0'; mountdata->hostname = page2; snprintf(page, PAGE_SIZE, "%s:%s", -- cgit v1.1 From ae46141ff08f1965b17c531b571953c39ce8b9e2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Mar 2009 20:33:18 -0400 Subject: NFSv3: Fix posix ACL code Fix a memory leak due to allocation in the XDR layer. In cases where the RPC call needs to be retransmitted, we end up allocating new pages without clearing the old ones. Fix this by moving the allocation into nfs3_proc_setacls(). Also fix an issue discovered by Kevin Rudd, whereby the amount of memory reserved for the acls in the xdr_buf->head was miscalculated, and causing corruption. Signed-off-by: Trond Myklebust --- fs/nfs/nfs3acl.c | 27 +++++++++++++++++++++------ fs/nfs/nfs3xdr.c | 34 +++++++++++++--------------------- 2 files changed, 34 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index cef6255..6bbf0e6 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -292,7 +292,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, { struct nfs_server *server = NFS_SERVER(inode); struct nfs_fattr fattr; - struct page *pages[NFSACL_MAXPAGES] = { }; + struct page *pages[NFSACL_MAXPAGES]; struct nfs3_setaclargs args = { .inode = inode, .mask = NFS_ACL, @@ -303,7 +303,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, .rpc_argp = &args, .rpc_resp = &fattr, }; - int status, count; + int status; status = -EOPNOTSUPP; if (!nfs_server_capable(inode, NFS_CAP_ACLS)) @@ -319,6 +319,20 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, if (S_ISDIR(inode->i_mode)) { args.mask |= NFS_DFACL; args.acl_default = dfacl; + args.len = nfsacl_size(acl, dfacl); + } else + args.len = nfsacl_size(acl, NULL); + + if (args.len > NFS_ACL_INLINE_BUFSIZE) { + unsigned int npages = 1 + ((args.len - 1) >> PAGE_SHIFT); + + status = -ENOMEM; + do { + args.pages[args.npages] = alloc_page(GFP_KERNEL); + if (args.pages[args.npages] == NULL) + goto out_freepages; + args.npages++; + } while (args.npages < npages); } dprintk("NFS call setacl\n"); @@ -329,10 +343,6 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, nfs_zap_acl_cache(inode); dprintk("NFS reply setacl: %d\n", status); - /* pages may have been allocated at the xdr layer. */ - for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++) - __free_page(args.pages[count]); - switch (status) { case 0: status = nfs_refresh_inode(inode, &fattr); @@ -346,6 +356,11 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, case -ENOTSUPP: status = -EOPNOTSUPP; } +out_freepages: + while (args.npages != 0) { + args.npages--; + __free_page(args.pages[args.npages]); + } out: return status; } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 11cddde..6cdeacf 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -82,8 +82,10 @@ #define NFS3_commitres_sz (1+NFS3_wcc_data_sz+2) #define ACL3_getaclargs_sz (NFS3_fh_sz+1) -#define ACL3_setaclargs_sz (NFS3_fh_sz+1+2*(2+5*3)) -#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+2*(2+5*3)) +#define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \ + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) +#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \ + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) #define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) /* @@ -703,28 +705,18 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p, struct nfs3_setaclargs *args) { struct xdr_buf *buf = &req->rq_snd_buf; - unsigned int base, len_in_head, len = nfsacl_size( - (args->mask & NFS_ACL) ? args->acl_access : NULL, - (args->mask & NFS_DFACL) ? args->acl_default : NULL); - int count, err; + unsigned int base; + int err; p = xdr_encode_fhandle(p, NFS_FH(args->inode)); *p++ = htonl(args->mask); - base = (char *)p - (char *)buf->head->iov_base; - /* put as much of the acls into head as possible. */ - len_in_head = min_t(unsigned int, buf->head->iov_len - base, len); - len -= len_in_head; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p + (len_in_head >> 2)); - - for (count = 0; (count << PAGE_SHIFT) < len; count++) { - args->pages[count] = alloc_page(GFP_KERNEL); - if (!args->pages[count]) { - while (count) - __free_page(args->pages[--count]); - return -ENOMEM; - } - } - xdr_encode_pages(buf, args->pages, 0, len); + req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + base = req->rq_slen; + + if (args->npages != 0) + xdr_encode_pages(buf, args->pages, 0, args->len); + else + req->rq_slen += args->len; err = nfsacl_encode(buf, base, args->inode, (args->mask & NFS_ACL) ? -- cgit v1.1 From 57df675c60c5cf0748ddba9c7f85afde1530d74d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 10 Mar 2009 20:33:20 -0400 Subject: NLM: Fix GRANT callback address comparison when IPv6 is enabled The NFS mount command may pass an AF_INET server address to lockd. If lockd happens to be using a PF_INET6 listener, the nlm_cmp_addr() in nlmclnt_grant() will fail to match requests from that host because they will all have a mapped IPv4 AF_INET6 address. Adopt the same solution used in nfs_sockaddr_match_ipaddr() for NFSv4 callbacks: if either address is AF_INET, map it to an AF_INET6 address before doing the comparison. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/clntlock.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 1f3b0fc..aedc47a 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -139,6 +139,55 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) return 0; } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static const struct in6_addr *nlmclnt_map_v4addr(const struct sockaddr *sap, + struct in6_addr *addr_mapped) +{ + const struct sockaddr_in *sin = (const struct sockaddr_in *)sap; + + switch (sap->sa_family) { + case AF_INET6: + return &((const struct sockaddr_in6 *)sap)->sin6_addr; + case AF_INET: + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, addr_mapped); + return addr_mapped; + } + + return NULL; +} + +/* + * If lockd is using a PF_INET6 listener, all incoming requests appear + * to come from AF_INET6 remotes. The address of AF_INET remotes are + * mapped to AF_INET6 automatically by the network layer. In case the + * user passed an AF_INET server address at mount time, ensure both + * addresses are AF_INET6 before comparing them. + */ +static int nlmclnt_cmp_addr(const struct nlm_host *host, + const struct sockaddr *sap) +{ + const struct in6_addr *addr1; + const struct in6_addr *addr2; + struct in6_addr addr1_mapped; + struct in6_addr addr2_mapped; + + addr1 = nlmclnt_map_v4addr(nlm_addr(host), &addr1_mapped); + if (likely(addr1 != NULL)) { + addr2 = nlmclnt_map_v4addr(sap, &addr2_mapped); + if (likely(addr2 != NULL)) + return ipv6_addr_equal(addr1, addr2); + } + + return 0; +} +#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ +static int nlmclnt_cmp_addr(const struct nlm_host *host, + const struct sockaddr *sap) +{ + return nlm_cmp_addr(nlm_addr(host), sap); +} +#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ + /* * The server lockd has called us back to tell us the lock was granted */ @@ -166,7 +215,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) */ if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) continue; - if (!nlm_cmp_addr(nlm_addr(block->b_host), addr)) + if (!nlmclnt_cmp_addr(block->b_host, addr)) continue; if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) continue; -- cgit v1.1 From a71ee337b31271e701f689d544b6153b75609bc5 Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Tue, 10 Mar 2009 20:33:21 -0400 Subject: NFS: Handle -ESTALE error in access() Hi Trond, I have been looking at a bugreport where trying to open applications on KDE on a NFS mounted home fails temporarily. There have been multiple reports on different kernel versions pointing to this common issue: http://bugzilla.kernel.org/show_bug.cgi?id=12557 https://bugs.launchpad.net/ubuntu/+source/linux/+bug/269954 http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=508866.html This issue can be reproducible consistently by doing this on a NFS mounted home (KDE): 1. Open 2 xterm sessions 2. From one of the xterm session, do "ssh -X " 3. "stat ~/.Xauthority" on the remote SSH session 4. Close the two xterm sessions 5. On the server do a "stat ~/.Xauthority" 6. Now on the client, try to open xterm This will fail. Even if the filehandle had become stale, the NFS client should invalidate the cache/inode and should repeat LOOKUP. Looking at the packet capture when the failure occurs shows that there were two subsequent ACCESS() calls with the same filehandle and both fails with -ESTALE error. I have tested the fix below. Now the client issue a LOOKUP after the ACCESS() call fails with -ESTALE. If all this makes sense to you, can you consider this for inclusion? Thanks, If the server returns an -ESTALE error due to stale filehandle in response to an ACCESS() call, we need to invalidate the cache and inode so that LOOKUP() can be retried. Without this change, the nfs client retries ACCESS() with the same filehandle, fails again and could lead to temporary failure of applications running on nfs mounted home. Signed-off-by: Suresh Jayaraman Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e35c819..672368f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1892,8 +1892,14 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) cache.cred = cred; cache.jiffies = jiffies; status = NFS_PROTO(inode)->access(inode, &cache); - if (status != 0) + if (status != 0) { + if (status == -ESTALE) { + nfs_zap_caches(inode); + if (!S_ISDIR(inode->i_mode)) + set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + } return status; + } nfs_access_add_cache(inode, &cache); out: if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) -- cgit v1.1 From d7371c41b0cda782256b1df759df4e8d4724584c Mon Sep 17 00:00:00 2001 From: Ian Dall Date: Tue, 10 Mar 2009 20:33:22 -0400 Subject: Bug 11061, NFS mounts dropped Addresses: http://bugzilla.kernel.org/show_bug.cgi?id=11061 sockaddr structures can't be reliably compared using memcmp() because there are padding bytes in the structure which can't be guaranteed to be the same even when the sockaddr structures refer to the same socket. Instead compare all the relevant fields. In the case of IPv6 sin6_flowinfo is not compared because it only affects QoS and sin6_scope_id is only compared if the address is "link local" because "link local" addresses need only be unique to a specific link. Signed-off-by: Ian Dall Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 9b728f3..06654b8 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -273,6 +273,65 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, #endif /* + * Test if two ip4 socket addresses refer to the same socket, by + * comparing relevant fields. The padding bytes specifically, are + * not compared. + * + * The caller should ensure both socket addresses are AF_INET. + */ +static int nfs_sockaddr_cmp_ip4(const struct sockaddr_in * saddr1, + const struct sockaddr_in * saddr2) +{ + if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr) + return 0; + return saddr1->sin_port == saddr2->sin_port; +} + +/* + * Test if two ip6 socket addresses refer to the same socket by + * comparing relevant fields. The padding bytes specifically, are not + * compared. sin6_flowinfo is not compared because it only affects QoS + * and sin6_scope_id is only compared if the address is "link local" + * because "link local" addresses need only be unique to a specific + * link. Conversely, ordinary unicast addresses might have different + * sin6_scope_id. + * + * The caller should ensure both socket addresses are AF_INET6. + */ +static int nfs_sockaddr_cmp_ip6 (const struct sockaddr_in6 * saddr1, + const struct sockaddr_in6 * saddr2) +{ + if (!ipv6_addr_equal(&saddr1->sin6_addr, + &saddr1->sin6_addr)) + return 0; + if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && + saddr1->sin6_scope_id != saddr2->sin6_scope_id) + return 0; + return saddr1->sin6_port == saddr2->sin6_port; +} + +/* + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields. + */ +static int nfs_sockaddr_cmp(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + if (sa1->sa_family != sa2->sa_family) + return 0; + + switch (sa1->sa_family) { + case AF_INET: + return nfs_sockaddr_cmp_ip4((const struct sockaddr_in *) sa1, + (const struct sockaddr_in *) sa2); + case AF_INET6: + return nfs_sockaddr_cmp_ip6((const struct sockaddr_in6 *) sa1, + (const struct sockaddr_in6 *) sa2); + } + return 0; +} + +/* * Find a client by IP address and protocol version * - returns NULL if no such client */ @@ -344,8 +403,10 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp) static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) { struct nfs_client *clp; + const struct sockaddr *sap = data->addr; list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; /* Don't match clients that failed to initialise properly */ if (clp->cl_cons_state < 0) continue; @@ -358,7 +419,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat continue; /* Match the full socket address */ - if (memcmp(&clp->cl_addr, data->addr, sizeof(clp->cl_addr)) != 0) + if (!nfs_sockaddr_cmp(sap, clap)) continue; atomic_inc(&clp->cl_count); -- cgit v1.1 From 9f4c899c0d90e1b51b6864834f3877b47c161a0e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 12 Mar 2009 14:51:32 -0400 Subject: NFS: Fix the fix to Bugzilla #11061, when IPv6 isn't defined... Stephen Rothwell reports: Today's linux-next build (powerpc ppc64_defconfig) failed like this: fs/built-in.o: In function `.nfs_get_client': client.c:(.text+0x115010): undefined reference to `.__ipv6_addr_type' Fix by moving the IPV6 specific parts of commit d7371c41b0cda782256b1df759df4e8d4724584c ("Bug 11061, NFS mounts dropped") into the '#ifdef IPV6..." section. Also fix up a couple of formatting issues. Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 68 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 06654b8..574158a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -255,6 +255,32 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, } return 0; } + +/* + * Test if two ip6 socket addresses refer to the same socket by + * comparing relevant fields. The padding bytes specifically, are not + * compared. sin6_flowinfo is not compared because it only affects QoS + * and sin6_scope_id is only compared if the address is "link local" + * because "link local" addresses need only be unique to a specific + * link. Conversely, ordinary unicast addresses might have different + * sin6_scope_id. + * + * The caller should ensure both socket addresses are AF_INET6. + */ +static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in6 *saddr1 = (const struct sockaddr_in6 *)sa1; + const struct sockaddr_in6 *saddr2 = (const struct sockaddr_in6 *)sa2; + + if (!ipv6_addr_equal(&saddr1->sin6_addr, + &saddr1->sin6_addr)) + return 0; + if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && + saddr1->sin6_scope_id != saddr2->sin6_scope_id) + return 0; + return saddr1->sin6_port == saddr2->sin6_port; +} #else static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1, const struct sockaddr_in *sa2) @@ -270,6 +296,12 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1, (const struct sockaddr_in *)sa2); } + +static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1, + const struct sockaddr * sa2) +{ + return 0; +} #endif /* @@ -279,38 +311,18 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, * * The caller should ensure both socket addresses are AF_INET. */ -static int nfs_sockaddr_cmp_ip4(const struct sockaddr_in * saddr1, - const struct sockaddr_in * saddr2) +static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1, + const struct sockaddr *sa2) { + const struct sockaddr_in *saddr1 = (const struct sockaddr_in *)sa1; + const struct sockaddr_in *saddr2 = (const struct sockaddr_in *)sa2; + if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr) return 0; return saddr1->sin_port == saddr2->sin_port; } /* - * Test if two ip6 socket addresses refer to the same socket by - * comparing relevant fields. The padding bytes specifically, are not - * compared. sin6_flowinfo is not compared because it only affects QoS - * and sin6_scope_id is only compared if the address is "link local" - * because "link local" addresses need only be unique to a specific - * link. Conversely, ordinary unicast addresses might have different - * sin6_scope_id. - * - * The caller should ensure both socket addresses are AF_INET6. - */ -static int nfs_sockaddr_cmp_ip6 (const struct sockaddr_in6 * saddr1, - const struct sockaddr_in6 * saddr2) -{ - if (!ipv6_addr_equal(&saddr1->sin6_addr, - &saddr1->sin6_addr)) - return 0; - if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && - saddr1->sin6_scope_id != saddr2->sin6_scope_id) - return 0; - return saddr1->sin6_port == saddr2->sin6_port; -} - -/* * Test if two socket addresses represent the same actual socket, * by comparing (only) relevant fields. */ @@ -322,11 +334,9 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1, switch (sa1->sa_family) { case AF_INET: - return nfs_sockaddr_cmp_ip4((const struct sockaddr_in *) sa1, - (const struct sockaddr_in *) sa2); + return nfs_sockaddr_cmp_ip4(sa1, sa2); case AF_INET6: - return nfs_sockaddr_cmp_ip6((const struct sockaddr_in6 *) sa1, - (const struct sockaddr_in6 *) sa2); + return nfs_sockaddr_cmp_ip6(sa1, sa2); } return 0; } -- cgit v1.1 From 6c9fd1dc0a597e575617a7de7086c8a3efa8f524 Mon Sep 17 00:00:00 2001 From: Tiger Yang Date: Fri, 6 Mar 2009 10:19:30 +0800 Subject: ocfs2: reserve xattr block for new directory with inline data If this is a new directory with inline data, we choose to reserve the entire inline area for directory contents and force an external xattr block. Signed-off-by: Tiger Yang Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 4ddd788..c63efb5 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -547,8 +547,12 @@ int ocfs2_calc_xattr_init(struct inode *dir, * when blocksize = 512, may reserve one more cluser for * xattr bucket, otherwise reserve one metadata block * for them is ok. + * If this is a new directory with inline data, + * we choose to reserve the entire inline area for + * directory contents and force an external xattr block. */ if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || + (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) || (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); if (ret) { -- cgit v1.1 From d9ae49d6e2b1ac9166e58ae3c9345135604beaa6 Mon Sep 17 00:00:00 2001 From: Tiger Yang Date: Thu, 5 Mar 2009 11:06:15 +0800 Subject: ocfs2: tweak to get the maximum inline data size with xattr Replace max_inline_data with max_inline_data_with_xattr to ensure it correct when xattr inlined. Signed-off-by: Tiger Yang Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/aops.c | 7 +++++-- fs/ocfs2/namei.c | 3 ++- fs/ocfs2/ocfs2_fs.h | 6 ------ 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index a067a6c..8e1709a 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, size = i_size_read(inode); if (size > PAGE_CACHE_SIZE || - size > ocfs2_max_inline_data(inode->i_sb)) { + size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { ocfs2_error(inode->i_sb, "Inode %llu has with inline data has bad size: %Lu", (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -1555,6 +1555,7 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping, int ret, written = 0; loff_t end = pos + len; struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = NULL; mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n", (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos, @@ -1587,7 +1588,9 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping, /* * Check whether the write can fit. */ - if (mmap_page || end > ocfs2_max_inline_data(inode->i_sb)) + di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + if (mmap_page || + end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) return 0; do_inline_write: diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 084aba8..4b11762 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -532,7 +532,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); - fe->id2.i_data.id_count = cpu_to_le16(ocfs2_max_inline_data(osb->sb)); + fe->id2.i_data.id_count = cpu_to_le16( + ocfs2_max_inline_data_with_xattr(osb->sb, fe)); } else { fel = &fe->id2.i_list; fel->l_tree_depth = 0; diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index c7ae45a..2332ef7 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -1070,12 +1070,6 @@ static inline int ocfs2_fast_symlink_chars(struct super_block *sb) offsetof(struct ocfs2_dinode, id2.i_symlink); } -static inline int ocfs2_max_inline_data(struct super_block *sb) -{ - return sb->s_blocksize - - offsetof(struct ocfs2_dinode, id2.i_data.id_data); -} - static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb, struct ocfs2_dinode *di) { -- cgit v1.1 From 74e77eb30d0ecbb12964d005b439c8b84a505b84 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 12 Mar 2009 06:24:23 +0800 Subject: ocfs2: Fix a bug found by sparse check. We need to use le32_to_cpu to test rec->e_cpos in ocfs2_dinode_insert_check. Signed-off-by: Tao Ma Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 3a9e5de..19e3a96 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -176,7 +176,8 @@ static int ocfs2_dinode_insert_check(struct inode *inode, BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && - (OCFS2_I(inode)->ip_clusters != rec->e_cpos), + (OCFS2_I(inode)->ip_clusters != + le32_to_cpu(rec->e_cpos)), "Device %s, asking for sparse allocation: inode %llu, " "cpos %u, clusters %u\n", osb->dev_str, -- cgit v1.1 From 712e53e46a1da35fcd88c05aa0c675b10f7c0e9d Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 12 Mar 2009 08:37:34 +0800 Subject: ocfs2: Use xs->bucket to set xattr value outside A long time ago, xs->base is allocated a 4K size and all the contents in the bucket are copied to the it. Now we use ocfs2_xattr_bucket to abstract xattr bucket and xs->base is initialized to the start of the bu_bhs[0]. So xs->base + offset will overflow when the value root is stored outside the first block. Then why we can survive the xattr test by now? It is because we always read the bucket contiguously now and kernel mm allocate continguous memory for us. We are lucky, but we should fix it. So just get the right value root as other callers do. Signed-off-by: Tao Ma Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index c63efb5..2563df8 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -4795,19 +4795,33 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, char *val, int value_len) { - int offset; + int ret, offset, block_off; struct ocfs2_xattr_value_root *xv; struct ocfs2_xattr_entry *xe = xs->here; + struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket); + void *base; BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe)); - offset = le16_to_cpu(xe->xe_name_offset) + - OCFS2_XATTR_SIZE(xe->xe_name_len); + ret = ocfs2_xattr_bucket_get_name_value(inode, xh, + xe - xh->xh_entries, + &block_off, + &offset); + if (ret) { + mlog_errno(ret); + goto out; + } - xv = (struct ocfs2_xattr_value_root *)(xs->base + offset); + base = bucket_block(xs->bucket, block_off); + xv = (struct ocfs2_xattr_value_root *)(base + offset + + OCFS2_XATTR_SIZE(xe->xe_name_len)); - return __ocfs2_xattr_set_value_outside(inode, handle, - xv, val, value_len); + ret = __ocfs2_xattr_set_value_outside(inode, handle, + xv, val, value_len); + if (ret) + mlog_errno(ret); +out: + return ret; } static int ocfs2_rm_xattr_cluster(struct inode *inode, -- cgit v1.1 From 020fe22ff14320927f394de222cbb11708bcc7a8 Mon Sep 17 00:00:00 2001 From: Enrik Berkhan Date: Fri, 13 Mar 2009 13:51:56 -0700 Subject: nommu: ramfs: pages allocated to an inode's pagecache may get wrongly discarded The pages attached to a ramfs inode's pagecache by truncation from nothing - as done by SYSV SHM for example - may get discarded under memory pressure. The problem is that the pages are not marked dirty. Anything that creates data in an MMU-based ramfs will cause the pages holding that data will cause the set_page_dirty() aop to be called. For the NOMMU-based mmap, set_page_dirty() may be called by write(), but it won't be called by page-writing faults on writable mmaps, and it isn't called by ramfs_nommu_expand_for_mapping() when a file is being truncated from nothing to allocate a contiguous run. The solution is to mark the pages dirty at the point of allocation by the truncation code. Signed-off-by: Enrik Berkhan Signed-off-by: David Howells Cc: Peter Zijlstra Cc: Nick Piggin Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-nommu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index b9b567a..90d72be 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -114,6 +114,9 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) if (!pagevec_add(&lru_pvec, page)) __pagevec_lru_add_file(&lru_pvec); + /* prevent the page from being discarded on memory pressure */ + SetPageDirty(page); + unlock_page(page); } -- cgit v1.1 From 15e7b8767605dc0cb9bd4594caabfec392385210 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 13 Mar 2009 13:51:58 -0700 Subject: nommu: ramfs: don't leak pages when adding to page cache fails When a ramfs nommu mapping is expanded, contiguous pages are allocated and added to the pagecache. The caller's reference is then passed on by moving whole pagevecs to the file lru list. If the page cache adding fails, make sure that the error path also moves the pagevec contents which might still contain up to PAGEVEC_SIZE successfully added pages, of which we would leak references otherwise. Signed-off-by: Johannes Weiner Cc: David Howells Cc: Enrik Berkhan Cc: Nick Piggin Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-nommu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 90d72be..5d7c7ec 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -129,6 +129,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) return -EFBIG; add_error: + pagevec_lru_add_file(&lru_pvec); page_cache_release(pages + loop); for (loop++; loop < npages; loop++) __free_page(pages + loop); -- cgit v1.1 From 84814d642a4f1f294bd675ab11aae1ca54c6cedb Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 13 Mar 2009 13:51:59 -0700 Subject: eCryptfs: don't encrypt file key with filename key eCryptfs has file encryption keys (FEK), file encryption key encryption keys (FEKEK), and filename encryption keys (FNEK). The per-file FEK is encrypted with one or more FEKEKs and stored in the header of the encrypted file. I noticed that the FEK is also being encrypted by the FNEK. This is a problem if a user wants to use a different FNEK than their FEKEK, as their file contents will still be accessible with the FNEK. This is a minimalistic patch which prevents the FNEKs signatures from being copied to the inode signatures list. Ultimately, it keeps the FEK from being encrypted with a FNEK. Signed-off-by: Tyler Hicks Cc: Serge Hallyn Acked-by: Dustin Kirkland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/crypto.c | 2 ++ fs/ecryptfs/ecryptfs_kernel.h | 3 ++- fs/ecryptfs/keystore.c | 3 ++- fs/ecryptfs/main.c | 5 +++-- 4 files changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index f6caeb1..bdca1f4 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -946,6 +946,8 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs( list_for_each_entry(global_auth_tok, &mount_crypt_stat->global_auth_tok_list, mount_crypt_stat_list) { + if (global_auth_tok->flags & ECRYPTFS_AUTH_TOK_FNEK) + continue; rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig); if (rc) { printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index c11fc95..eb2267e 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -328,6 +328,7 @@ struct ecryptfs_dentry_info { */ struct ecryptfs_global_auth_tok { #define ECRYPTFS_AUTH_TOK_INVALID 0x00000001 +#define ECRYPTFS_AUTH_TOK_FNEK 0x00000002 u32 flags; struct list_head mount_crypt_stat_list; struct key *global_auth_tok_key; @@ -696,7 +697,7 @@ ecryptfs_write_header_metadata(char *virt, int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig); int ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, - char *sig); + char *sig, u32 global_auth_tok_flags); int ecryptfs_get_global_auth_tok_for_sig( struct ecryptfs_global_auth_tok **global_auth_tok, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig); diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index ff53942..e4a6223 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -2375,7 +2375,7 @@ struct kmem_cache *ecryptfs_global_auth_tok_cache; int ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, - char *sig) + char *sig, u32 global_auth_tok_flags) { struct ecryptfs_global_auth_tok *new_auth_tok; int rc = 0; @@ -2389,6 +2389,7 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, goto out; } memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX); + new_auth_tok->flags = global_auth_tok_flags; new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); list_add(&new_auth_tok->mount_crypt_stat_list, diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 789cf2e..aed56c2 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -319,7 +319,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) case ecryptfs_opt_ecryptfs_sig: sig_src = args[0].from; rc = ecryptfs_add_global_auth_tok(mount_crypt_stat, - sig_src); + sig_src, 0); if (rc) { printk(KERN_ERR "Error attempting to register " "global sig; rc = [%d]\n", rc); @@ -370,7 +370,8 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) ECRYPTFS_SIG_SIZE_HEX] = '\0'; rc = ecryptfs_add_global_auth_tok( mount_crypt_stat, - mount_crypt_stat->global_default_fnek_sig); + mount_crypt_stat->global_default_fnek_sig, + ECRYPTFS_AUTH_TOK_FNEK); if (rc) { printk(KERN_ERR "Error attempting to register " "global fnek sig [%s]; rc = [%d]\n", -- cgit v1.1 From 87092698c665e0a358caf9825ae13114343027e8 Mon Sep 17 00:00:00 2001 From: un'ichi Nomura Date: Mon, 9 Mar 2009 10:40:52 +0100 Subject: block: Add gfp_mask parameter to bio_integrity_clone() Stricter gfp_mask might be required for clone allocation. For example, request-based dm may clone bio in interrupt context so it has to use GFP_ATOMIC. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Acked-by: Martin K. Petersen Cc: Alasdair G Kergon Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 5 +++-- fs/bio.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 549b014..fe2b1aa 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -685,19 +685,20 @@ EXPORT_SYMBOL(bio_integrity_split); * bio_integrity_clone - Callback for cloning bios with integrity metadata * @bio: New bio * @bio_src: Original bio + * @gfp_mask: Memory allocation mask * @bs: bio_set to allocate bip from * * Description: Called to allocate a bip when cloning a bio */ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, - struct bio_set *bs) + gfp_t gfp_mask, struct bio_set *bs) { struct bio_integrity_payload *bip_src = bio_src->bi_integrity; struct bio_integrity_payload *bip; BUG_ON(bip_src == NULL); - bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs); + bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs); if (bip == NULL) return -EIO; diff --git a/fs/bio.c b/fs/bio.c index 124b95c..cf747378 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -463,7 +463,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) if (bio_integrity(bio)) { int ret; - ret = bio_integrity_clone(b, bio, fs_bio_set); + ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set); if (ret < 0) return NULL; -- cgit v1.1 From 059ea3318c8ede71851a52b4359fbf1ab0cec301 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 9 Mar 2009 10:42:45 +0100 Subject: block: fix memory leak in bio_clone() If bio_integrity_clone() fails, bio_clone() returns NULL without freeing the newly allocated bio. Signed-off-by: Li Zefan Signed-off-by: Jens Axboe --- fs/bio.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index cf747378..d4f0632 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -465,8 +465,10 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set); - if (ret < 0) + if (ret < 0) { + bio_put(b); return NULL; + } } return b; -- cgit v1.1