diff options
Diffstat (limited to 'fs')
104 files changed, 1575 insertions, 1194 deletions
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 88e3787..e298fe1 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -119,6 +119,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) const struct file_operations v9fs_dir_operations = { .read = generic_read_dir, + .llseek = generic_file_llseek, .readdir = v9fs_dir_readdir, .open = v9fs_file_open, .release = v9fs_dir_release, diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index c95295c..e83aa5e 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -626,8 +626,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, return NULL; error: - if (fid) - p9_client_clunk(fid); + p9_client_clunk(fid); return ERR_PTR(result); } @@ -1930,6 +1930,16 @@ config CIFS_WEAK_PW_HASH If unsure, say N. +config CIFS_UPCALL + bool "Kerberos/SPNEGO advanced session setup" + depends on CIFS && KEYS + help + Enables an upcall mechanism for CIFS which accesses + userspace helper utilities to provide SPNEGO packaged (RFC 4178) + Kerberos tickets which are needed to mount to certain secure servers + (for which more secure Kerberos authentication is required). If + unsure, say N. + config CIFS_XATTR bool "CIFS extended attributes" depends on CIFS @@ -1982,17 +1992,6 @@ config CIFS_EXPERIMENTAL (which is disabled by default). See the file fs/cifs/README for more details. If unsure, say N. -config CIFS_UPCALL - bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)" - depends on CIFS_EXPERIMENTAL - depends on KEYS - help - Enables an upcall mechanism for CIFS which accesses - userspace helper utilities to provide SPNEGO packaged (RFC 4178) - Kerberos tickets which are needed to mount to certain secure servers - (for which more secure Kerberos authentication is required). If - unsure, say N. - config CIFS_DFS_UPCALL bool "DFS feature support (EXPERIMENTAL)" depends on CIFS_EXPERIMENTAL diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index fc1a8dc..85a30e9 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -197,6 +197,7 @@ out: const struct file_operations adfs_dir_operations = { .read = generic_read_dir, + .llseek = generic_file_llseek, .readdir = adfs_readdir, .fsync = file_fsync, }; diff --git a/fs/affs/dir.c b/fs/affs/dir.c index 6e3f282..7b36904 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -19,6 +19,7 @@ static int affs_readdir(struct file *, void *, filldir_t); const struct file_operations affs_dir_operations = { .read = generic_read_dir, + .llseek = generic_file_llseek, .readdir = affs_readdir, .fsync = file_fsync, }; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index bcfb2dc..2a41c2a 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -36,6 +36,7 @@ const struct file_operations autofs4_root_operations = { .release = dcache_dir_close, .read = generic_read_dir, .readdir = dcache_readdir, + .llseek = dcache_dir_lseek, .ioctl = autofs4_root_ioctl, }; @@ -44,6 +45,7 @@ const struct file_operations autofs4_dir_operations = { .release = dcache_dir_close, .read = generic_read_dir, .readdir = dcache_readdir, + .llseek = dcache_dir_lseek, }; const struct inode_operations autofs4_indirect_root_inode_operations = { diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 02c6e62b..740f536 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -66,6 +66,7 @@ static struct kmem_cache *befs_inode_cachep; static const struct file_operations befs_dir_operations = { .read = generic_read_dir, .readdir = befs_readdir, + .llseek = generic_file_llseek, }; static const struct inode_operations befs_dir_inode_operations = { diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 87ee5cc..ed8feb0 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -125,8 +125,8 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode, inode->i_ino); if (err) { inode_dec_link_count(inode); - iput(inode); mutex_unlock(&info->bfs_lock); + iput(inode); return err; } mutex_unlock(&info->bfs_lock); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 56372ec..dfc0197 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -914,7 +914,9 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) /* Stash our initial stack pointer into the mm structure */ current->mm->start_stack = (unsigned long )sp; - +#ifdef FLAT_PLAT_INIT + FLAT_PLAT_INIT(regs); +#endif DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n", (int)regs, (int)start_addr, (int)current->mm->start_stack); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 7562053..8d7e88e 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -120,8 +120,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (bprm->misc_bang) goto _ret; - bprm->misc_bang = 1; - /* to keep locking time low, we copy the interpreter string */ read_lock(&entries_lock); fmt = check_file(bprm); @@ -199,6 +197,8 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (retval < 0) goto _error; + bprm->misc_bang = 1; + retval = search_binary_handler (bprm, regs); if (retval < 0) goto _error; @@ -469,20 +469,21 @@ static void bio_free_map_data(struct bio_map_data *bmd) kfree(bmd); } -static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count) +static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, + gfp_t gfp_mask) { - struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL); + struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask); if (!bmd) return NULL; - bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL); + bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask); if (!bmd->iovecs) { kfree(bmd); return NULL; } - bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, GFP_KERNEL); + bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask); if (bmd->sgvecs) return bmd; @@ -491,8 +492,8 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count) return NULL; } -static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, - int uncopy) +static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, + struct sg_iovec *iov, int iov_count, int uncopy) { int ret = 0, i; struct bio_vec *bvec; @@ -502,7 +503,7 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, __bio_for_each_segment(bvec, bio, i, 0) { char *bv_addr = page_address(bvec->bv_page); - unsigned int bv_len = bvec->bv_len; + unsigned int bv_len = iovecs[i].bv_len; while (bv_len && iov_idx < iov_count) { unsigned int bytes; @@ -554,7 +555,7 @@ int bio_uncopy_user(struct bio *bio) struct bio_map_data *bmd = bio->bi_private; int ret; - ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, 1); + ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, bmd->nr_sgvecs, 1); bio_free_map_data(bmd); bio_put(bio); @@ -596,7 +597,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, len += iov[i].iov_len; } - bmd = bio_alloc_map_data(nr_pages, iov_count); + bmd = bio_alloc_map_data(nr_pages, iov_count, GFP_KERNEL); if (!bmd) return ERR_PTR(-ENOMEM); @@ -633,7 +634,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, * success */ if (!write_to_vm) { - ret = __bio_copy_iov(bio, iov, iov_count, 0); + ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0); if (ret) goto cleanup; } @@ -942,19 +943,22 @@ static void bio_copy_kern_endio(struct bio *bio, int err) { struct bio_vec *bvec; const int read = bio_data_dir(bio) == READ; - char *p = bio->bi_private; + struct bio_map_data *bmd = bio->bi_private; int i; + char *p = bmd->sgvecs[0].iov_base; __bio_for_each_segment(bvec, bio, i, 0) { char *addr = page_address(bvec->bv_page); + int len = bmd->iovecs[i].bv_len; if (read && !err) - memcpy(p, addr, bvec->bv_len); + memcpy(p, addr, len); __free_page(bvec->bv_page); - p += bvec->bv_len; + p += len; } + bio_free_map_data(bmd); bio_put(bio); } @@ -978,11 +982,21 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, const int nr_pages = end - start; struct bio *bio; struct bio_vec *bvec; + struct bio_map_data *bmd; int i, ret; + struct sg_iovec iov; + + iov.iov_base = data; + iov.iov_len = len; + + bmd = bio_alloc_map_data(nr_pages, 1, gfp_mask); + if (!bmd) + return ERR_PTR(-ENOMEM); + ret = -ENOMEM; bio = bio_alloc(gfp_mask, nr_pages); if (!bio) - return ERR_PTR(-ENOMEM); + goto out_bmd; while (len) { struct page *page; @@ -1016,14 +1030,18 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, } } - bio->bi_private = data; + bio->bi_private = bmd; bio->bi_end_io = bio_copy_kern_endio; + + bio_set_map_data(bmd, bio, &iov, 1); return bio; cleanup: bio_for_each_segment(bvec, bio, i) __free_page(bvec->bv_page); bio_put(bio); +out_bmd: + bio_free_map_data(bmd); return ERR_PTR(ret); } diff --git a/fs/buffer.c b/fs/buffer.c index 38653e3..ac78d4c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2926,14 +2926,17 @@ int submit_bh(int rw, struct buffer_head * bh) BUG_ON(!buffer_mapped(bh)); BUG_ON(!bh->b_end_io); - if (buffer_ordered(bh) && (rw == WRITE)) - rw = WRITE_BARRIER; + /* + * Mask in barrier bit for a write (could be either a WRITE or a + * WRITE_SYNC + */ + if (buffer_ordered(bh) && (rw & WRITE)) + rw |= WRITE_BARRIER; /* - * Only clear out a write error when rewriting, should this - * include WRITE_SYNC as well? + * Only clear out a write error when rewriting */ - if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER)) + if (test_set_buffer_req(bh) && (rw & WRITE)) clear_buffer_write_io_error(bh); /* diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index f5d0083..06e521a 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -4,7 +4,15 @@ Fix premature write failure on congested networks (we would give up on EAGAIN from the socket too quickly on large writes). Cifs_mkdir and cifs_create now respect the setgid bit on parent dir. Fix endian problems in acl (mode from/to cifs acl) on bigendian -architectures. +architectures. Fix problems with preserving timestamps on copying open +files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit +on parent directory when server supports Unix Extensions but not POSIX +create. Update cifs.upcall version to handle new Kerberos sec flags +(this requires update of cifs.upcall program from Samba). Fix memory leak +on dns_upcall (resolving DFS referralls). Fix plain text password +authentication (requires setting SecurityFlags to 0x30030 to enable +lanman and plain text though). Fix writes to be at correct offset when +file is open with O_APPEND and file is on a directio (forcediretio) mount. Version 1.53 ------------ diff --git a/fs/cifs/README b/fs/cifs/README index 2bd6fe5..bd2343d 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -542,10 +542,20 @@ SecurityFlags Flags which control security negotiation and hashing mechanisms (as "must use") on the other hand does not make much sense. Default flags are 0x07007 - (NTLM, NTLMv2 and packet signing allowed). Maximum + (NTLM, NTLMv2 and packet signing allowed). The maximum allowable flags if you want to allow mounts to servers using weaker password hashes is 0x37037 (lanman, - plaintext, ntlm, ntlmv2, signing allowed): + plaintext, ntlm, ntlmv2, signing allowed). Some + SecurityFlags require the corresponding menuconfig + options to be enabled (lanman and plaintext require + CONFIG_CIFS_WEAK_PW_HASH for example). Enabling + plaintext authentication currently requires also + enabling lanman authentication in the security flags + because the cifs module only supports sending + laintext passwords using the older lanman dialect + form of the session setup SMB. (e.g. for authentication + using plain text passwords, set the SecurityFlags + to 0x30030): may use packet signing 0x00001 must use packet signing 0x01001 @@ -642,8 +652,30 @@ The statistics for the number of total SMBs and oplock breaks are different in that they represent all for that share, not just those for which the server returned success. -Also note that "cat /proc/fs/cifs/DebugData" will display information about +Also note that "cat /proc/fs/cifs/DebugData" will display information about the active sessions and the shares that are mounted. -Enabling Kerberos (extended security) works when CONFIG_CIFS_EXPERIMENTAL is -on but requires a user space helper (from the Samba project). NTLM and NTLMv2 and -LANMAN support do not require this helper. + +Enabling Kerberos (extended security) works but requires version 1.2 or later +of the helper program cifs.upcall to be present and to be configured in the +/etc/request-key.conf file. The cifs.upcall helper program is from the Samba +project(http://www.samba.org). NTLM and NTLMv2 and LANMAN support do not +require this helper. Note that NTLMv2 security (which does not require the +cifs.upcall helper program), instead of using Kerberos, is sufficient for +some use cases. + +Enabling DFS support (used to access shares transparently in an MS-DFS +global name space) requires that CONFIG_CIFS_EXPERIMENTAL be enabled. In +addition, DFS support for target shares which are specified as UNC +names which begin with host names (rather than IP addresses) requires +a user space helper (such as cifs.upcall) to be present in order to +translate host names to ip address, and the user space helper must also +be configured in the file /etc/request-key.conf + +To use cifs Kerberos and DFS support, the Linux keyutils package should be +installed and something like the following lines should be added to the +/etc/request-key.conf file: + +create cifs.spnego * * /usr/local/sbin/cifs.upcall %k +create dns_resolver * * /usr/local/sbin/cifs.upcall %k + + diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index 5fabd2c..1b09f16 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -476,6 +476,7 @@ decode_negTokenInit(unsigned char *security_blob, int length, unsigned int cls, con, tag, oidlen, rc; bool use_ntlmssp = false; bool use_kerberos = false; + bool use_mskerberos = false; *secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/ @@ -574,10 +575,12 @@ decode_negTokenInit(unsigned char *security_blob, int length, *(oid + 1), *(oid + 2), *(oid + 3))); if (compare_oid(oid, oidlen, MSKRB5_OID, - MSKRB5_OID_LEN)) - use_kerberos = true; + MSKRB5_OID_LEN) && + !use_kerberos) + use_mskerberos = true; else if (compare_oid(oid, oidlen, KRB5_OID, - KRB5_OID_LEN)) + KRB5_OID_LEN) && + !use_mskerberos) use_kerberos = true; else if (compare_oid(oid, oidlen, NTLMSSP_OID, NTLMSSP_OID_LEN)) @@ -630,6 +633,8 @@ decode_negTokenInit(unsigned char *security_blob, int length, if (use_kerberos) *secType = Kerberos; + else if (use_mskerberos) + *secType = MSKerberos; else if (use_ntlmssp) *secType = NTLMSSP; diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 2434ab0..117ef4b 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -114,9 +114,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) dp = description + strlen(description); - /* for now, only sec=krb5 is valid */ + /* for now, only sec=krb5 and sec=mskrb5 are valid */ if (server->secType == Kerberos) sprintf(dp, ";sec=krb5"); + else if (server->secType == MSKerberos) + sprintf(dp, ";sec=mskrb5"); else goto out; diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h index 05a34b1..e4041ec 100644 --- a/fs/cifs/cifs_spnego.h +++ b/fs/cifs/cifs_spnego.h @@ -23,7 +23,7 @@ #ifndef _CIFS_SPNEGO_H #define _CIFS_SPNEGO_H -#define CIFS_SPNEGO_UPCALL_VERSION 1 +#define CIFS_SPNEGO_UPCALL_VERSION 2 /* * The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION. diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 83fd40d..bd5f13d 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -294,6 +294,7 @@ void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key) if ((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0) if (extended_security & CIFSSEC_MAY_PLNTXT) { + memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE); memcpy(lnm_session_key, password_with_pad, CIFS_ENCPWD_SIZE); return; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index e8da4ee..25ecbd5 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -175,6 +175,8 @@ out_no_root: if (inode) iput(inode); + cifs_umount(sb, cifs_sb); + out_mount_failed: if (cifs_sb) { #ifdef CONFIG_CIFS_DFS_UPCALL diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 7e1cf26..8dfd6f2 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -80,7 +80,8 @@ enum securityEnum { NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ RawNTLMSSP, /* NTLMSSP without SPNEGO */ NTLMSSP, /* NTLMSSP via SPNEGO */ - Kerberos /* Kerberos via SPNEGO */ + Kerberos, /* Kerberos via SPNEGO */ + MSKerberos, /* MS Kerberos via SPNEGO */ }; enum protocolEnum { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0711db6..4c13bcd 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3598,19 +3598,21 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, char ntlm_session_key[CIFS_SESS_KEY_SIZE]; bool ntlmv2_flag = false; int first_time = 0; + struct TCP_Server_Info *server = pSesInfo->server; /* what if server changes its buffer size after dropping the session? */ - if (pSesInfo->server->maxBuf == 0) /* no need to send on reconnect */ { + if (server->maxBuf == 0) /* no need to send on reconnect */ { rc = CIFSSMBNegotiate(xid, pSesInfo); - if (rc == -EAGAIN) /* retry only once on 1st time connection */ { + if (rc == -EAGAIN) { + /* retry only once on 1st time connection */ rc = CIFSSMBNegotiate(xid, pSesInfo); if (rc == -EAGAIN) rc = -EHOSTDOWN; } if (rc == 0) { spin_lock(&GlobalMid_Lock); - if (pSesInfo->server->tcpStatus != CifsExiting) - pSesInfo->server->tcpStatus = CifsGood; + if (server->tcpStatus != CifsExiting) + server->tcpStatus = CifsGood; else rc = -EHOSTDOWN; spin_unlock(&GlobalMid_Lock); @@ -3623,23 +3625,22 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, goto ss_err_exit; pSesInfo->flags = 0; - pSesInfo->capabilities = pSesInfo->server->capabilities; + pSesInfo->capabilities = server->capabilities; if (linuxExtEnabled == 0) pSesInfo->capabilities &= (~CAP_UNIX); /* pSesInfo->sequence_number = 0;*/ cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", - pSesInfo->server->secMode, - pSesInfo->server->capabilities, - pSesInfo->server->timeAdj)); + server->secMode, server->capabilities, server->timeAdj)); + if (experimEnabled < 2) rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info); else if (extended_security && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) - && (pSesInfo->server->secType == NTLMSSP)) { + && (server->secType == NTLMSSP)) { rc = -EOPNOTSUPP; } else if (extended_security && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) - && (pSesInfo->server->secType == RawNTLMSSP)) { + && (server->secType == RawNTLMSSP)) { cFYI(1, ("NTLMSSP sesssetup")); rc = CIFSNTLMSSPNegotiateSessSetup(xid, pSesInfo, &ntlmv2_flag, nls_info); @@ -3668,12 +3669,12 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, } else { SMBNTencrypt(pSesInfo->password, - pSesInfo->server->cryptKey, + server->cryptKey, ntlm_session_key); if (first_time) cifs_calculate_mac_key( - &pSesInfo->server->mac_signing_key, + &server->mac_signing_key, ntlm_session_key, pSesInfo->password); } @@ -3686,13 +3687,13 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, nls_info); } } else { /* old style NTLM 0.12 session setup */ - SMBNTencrypt(pSesInfo->password, pSesInfo->server->cryptKey, + SMBNTencrypt(pSesInfo->password, server->cryptKey, ntlm_session_key); if (first_time) - cifs_calculate_mac_key( - &pSesInfo->server->mac_signing_key, - ntlm_session_key, pSesInfo->password); + cifs_calculate_mac_key(&server->mac_signing_key, + ntlm_session_key, + pSesInfo->password); rc = CIFSSessSetup(xid, pSesInfo, ntlm_session_key, nls_info); } diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index f730ef3..a2e0673 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -47,11 +47,18 @@ static int dns_resolver_instantiate(struct key *key, const void *data, return rc; } +static void +dns_resolver_destroy(struct key *key) +{ + kfree(key->payload.data); +} + struct key_type key_type_dns_resolver = { .name = "dns_resolver", .def_datalen = sizeof(struct in_addr), .describe = user_describe, .instantiate = dns_resolver_instantiate, + .destroy = dns_resolver_destroy, .match = user_match, }; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ff14d14..cbefe1f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -833,6 +833,10 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, return -EBADF; open_file = (struct cifsFileInfo *) file->private_data; + rc = generic_write_checks(file, poffset, &write_size, 0); + if (rc) + return rc; + xid = GetXid(); if (*poffset > file->f_path.dentry->d_inode->i_size) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 28a2209..9c548f1 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -546,7 +546,8 @@ int cifs_get_inode_info(struct inode **pinode, if ((inode->i_mode & S_IWUGO) == 0 && (attr & ATTR_READONLY) == 0) inode->i_mode |= (S_IWUGO & default_mode); - inode->i_mode &= ~S_IFMT; + + inode->i_mode &= ~S_IFMT; } /* clear write bits if ATTR_READONLY is set */ if (attr & ATTR_READONLY) @@ -649,6 +650,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino) inode->i_fop = &simple_dir_operations; inode->i_uid = cifs_sb->mnt_uid; inode->i_gid = cifs_sb->mnt_gid; + } else if (rc) { _FreeXid(xid); iget_failed(inode); return ERR_PTR(rc); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index ed150ef..252fdc0 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -409,6 +409,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, #ifdef CONFIG_CIFS_WEAK_PW_HASH char lnm_session_key[CIFS_SESS_KEY_SIZE]; + pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; + /* no capabilities flags in old lanman negotiation */ pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); @@ -505,7 +507,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); } else ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else if (type == Kerberos) { + } else if (type == Kerberos || type == MSKerberos) { #ifdef CONFIG_CIFS_UPCALL struct cifs_spnego_msg *msg; spnego_key = cifs_get_spnego_key(ses); @@ -516,6 +518,15 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, } msg = spnego_key->payload.data; + /* check version field to make sure that cifs.upcall is + sending us a response in an expected form */ + if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { + cERROR(1, ("incorrect version of cifs.upcall (expected" + " %d but got %d)", + CIFS_SPNEGO_UPCALL_VERSION, msg->version)); + rc = -EKEYREJECTED; + goto ssetup_exit; + } /* bail out if key is too long */ if (msg->sesskey_len > sizeof(ses->server->mac_signing_key.data.krb5)) { diff --git a/fs/compat.c b/fs/compat.c index c9d1472..075d050 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -792,8 +792,10 @@ static int compat_fillonedir(void *__buf, const char *name, int namlen, if (buf->result) return -EINVAL; d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->result = -EOVERFLOW; return -EOVERFLOW; + } buf->result++; dirent = buf->dirent; if (!access_ok(VERIFY_WRITE, dirent, @@ -862,8 +864,10 @@ static int compat_filldir(void *__buf, const char *name, int namlen, if (reclen > buf->count) return -EINVAL; d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->error = -EOVERFLOW; return -EOVERFLOW; + } dirent = buf->previous; if (dirent) { if (__put_user(offset, &dirent->d_off)) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 7a8db78..8e93341 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1311,16 +1311,18 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) * Ensure that no racing symlink() will make detach_prep() fail while * the new link is temporarily attached */ - mutex_lock(&configfs_symlink_mutex); - spin_lock(&configfs_dirent_lock); do { struct mutex *wait_mutex; + mutex_lock(&configfs_symlink_mutex); + spin_lock(&configfs_dirent_lock); ret = configfs_detach_prep(dentry, &wait_mutex); - if (ret) { + if (ret) configfs_detach_rollback(dentry); - spin_unlock(&configfs_dirent_lock); - mutex_unlock(&configfs_symlink_mutex); + spin_unlock(&configfs_dirent_lock); + mutex_unlock(&configfs_symlink_mutex); + + if (ret) { if (ret != -EAGAIN) { config_item_put(parent_item); return ret; @@ -1329,13 +1331,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) /* Wait until the racing operation terminates */ mutex_lock(wait_mutex); mutex_unlock(wait_mutex); - - mutex_lock(&configfs_symlink_mutex); - spin_lock(&configfs_dirent_lock); } } while (ret == -EAGAIN); - spin_unlock(&configfs_dirent_lock); - mutex_unlock(&configfs_symlink_mutex); /* Get a working ref for the duration of this function */ item = configfs_get_config_item(dentry); diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 0c3b618..f40423e 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -43,58 +43,13 @@ static DEFINE_MUTEX(read_mutex); static int cramfs_iget5_test(struct inode *inode, void *opaque) { struct cramfs_inode *cramfs_inode = opaque; - - if (inode->i_ino != CRAMINO(cramfs_inode)) - return 0; /* does not match */ - - if (inode->i_ino != 1) - return 1; - - /* all empty directories, char, block, pipe, and sock, share inode #1 */ - - if ((inode->i_mode != cramfs_inode->mode) || - (inode->i_gid != cramfs_inode->gid) || - (inode->i_uid != cramfs_inode->uid)) - return 0; /* does not match */ - - if ((S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) && - (inode->i_rdev != old_decode_dev(cramfs_inode->size))) - return 0; /* does not match */ - - return 1; /* matches */ + return inode->i_ino == CRAMINO(cramfs_inode) && inode->i_ino != 1; } static int cramfs_iget5_set(struct inode *inode, void *opaque) { - static struct timespec zerotime; struct cramfs_inode *cramfs_inode = opaque; - inode->i_mode = cramfs_inode->mode; - inode->i_uid = cramfs_inode->uid; - inode->i_size = cramfs_inode->size; - inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; - inode->i_gid = cramfs_inode->gid; - /* Struct copy intentional */ - inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; inode->i_ino = CRAMINO(cramfs_inode); - /* inode->i_nlink is left 1 - arguably wrong for directories, - but it's the best we can do without reading the directory - contents. 1 yields the right result in GNU find, even - without -noleaf option. */ - if (S_ISREG(inode->i_mode)) { - inode->i_fop = &generic_ro_fops; - inode->i_data.a_ops = &cramfs_aops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &cramfs_dir_inode_operations; - inode->i_fop = &cramfs_directory_operations; - } else if (S_ISLNK(inode->i_mode)) { - inode->i_op = &page_symlink_inode_operations; - inode->i_data.a_ops = &cramfs_aops; - } else { - inode->i_size = 0; - inode->i_blocks = 0; - init_special_inode(inode, inode->i_mode, - old_decode_dev(cramfs_inode->size)); - } return 0; } @@ -104,12 +59,48 @@ static struct inode *get_cramfs_inode(struct super_block *sb, struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode), cramfs_iget5_test, cramfs_iget5_set, cramfs_inode); + static struct timespec zerotime; + if (inode && (inode->i_state & I_NEW)) { + inode->i_mode = cramfs_inode->mode; + inode->i_uid = cramfs_inode->uid; + inode->i_size = cramfs_inode->size; + inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; + inode->i_gid = cramfs_inode->gid; + /* Struct copy intentional */ + inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; + /* inode->i_nlink is left 1 - arguably wrong for directories, + but it's the best we can do without reading the directory + contents. 1 yields the right result in GNU find, even + without -noleaf option. */ + if (S_ISREG(inode->i_mode)) { + inode->i_fop = &generic_ro_fops; + inode->i_data.a_ops = &cramfs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &cramfs_dir_inode_operations; + inode->i_fop = &cramfs_directory_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_data.a_ops = &cramfs_aops; + } else { + inode->i_size = 0; + inode->i_blocks = 0; + init_special_inode(inode, inode->i_mode, + old_decode_dev(cramfs_inode->size)); + } unlock_new_inode(inode); } return inode; } +static void cramfs_drop_inode(struct inode *inode) +{ + if (inode->i_ino == 1) + generic_delete_inode(inode); + else + generic_drop_inode(inode); +} + /* * We have our own block cache: don't fill up the buffer cache * with the rom-image, because the way the filesystem is set @@ -534,6 +525,7 @@ static const struct super_operations cramfs_ops = { .put_super = cramfs_put_super, .remount_fs = cramfs_remount, .statfs = cramfs_statfs, + .drop_inode = cramfs_drop_inode, }; static int cramfs_get_sb(struct file_system_type *fs_type, diff --git a/fs/dcache.c b/fs/dcache.c index 101663d..e7a1a99 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1236,7 +1236,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) * If no entry exists with the exact case name, allocate new dentry with * the exact case, and return the spliced entry. */ -struct dentry *d_add_ci(struct inode *inode, struct dentry *dentry, +struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, struct qstr *name) { int error; @@ -1395,6 +1395,10 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) if (dentry->d_parent != parent) goto next; + /* non-existing due to RCU? */ + if (d_unhashed(dentry)) + goto next; + /* * It is safe to compare names since d_move() cannot * change the qstr (protected by d_lock). @@ -1410,10 +1414,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) goto next; } - if (!d_unhashed(dentry)) { - atomic_inc(&dentry->d_count); - found = dentry; - } + atomic_inc(&dentry->d_count); + found = dentry; spin_unlock(&dentry->d_lock); break; next: diff --git a/fs/efs/namei.c b/fs/efs/namei.c index 3a404e7..291abb1 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -74,8 +74,7 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei } unlock_kernel(); - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } static struct inode *efs_nfs_get_inode(struct super_block *sb, u64 ino, @@ -752,11 +752,11 @@ static int exec_mmap(struct mm_struct *mm) tsk->active_mm = mm; activate_mm(active_mm, mm); task_unlock(tsk); - mm_update_next_owner(old_mm); arch_pick_mmap_layout(mm); if (old_mm) { up_read(&old_mm->mmap_sem); BUG_ON(active_mm != old_mm); + mm_update_next_owner(old_mm); mmput(old_mm); return 0; } diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1ae5004..e9fa960 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -1626,6 +1626,9 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, free_blocks = percpu_counter_sum_and_set(&sbi->s_freeblocks_counter); #endif + if (free_blocks <= root_blocks) + /* we don't have free space */ + return 0; if (free_blocks - root_blocks < nblocks) return free_blocks - root_blocks; return nblocks; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index d3d23d7..ec8e33b 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -411,7 +411,7 @@ static int call_filldir(struct file * filp, void * dirent, get_dtype(sb, fname->file_type)); if (error) { filp->f_pos = curr_pos; - info->extra_fname = fname->next; + info->extra_fname = fname; return error; } fname = fname->next; @@ -450,11 +450,21 @@ static int ext4_dx_readdir(struct file * filp, * If there are any leftover names on the hash collision * chain, return them first. */ - if (info->extra_fname && - call_filldir(filp, dirent, filldir, info->extra_fname)) - goto finished; + if (info->extra_fname) { + if (call_filldir(filp, dirent, filldir, info->extra_fname)) + goto finished; - if (!info->curr_node) + info->extra_fname = NULL; + info->curr_node = rb_next(info->curr_node); + if (!info->curr_node) { + if (info->next_hash == ~0) { + filp->f_pos = EXT4_HTREE_EOF; + goto finished; + } + info->curr_hash = info->next_hash; + info->curr_minor_hash = 0; + } + } else if (!info->curr_node) info->curr_node = rb_first(&info->root); while (1) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6c7924d..2950032 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1072,6 +1072,8 @@ extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); @@ -1227,6 +1229,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations; /* extents.c */ extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, + int chunk); extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned long max_blocks, struct buffer_head *bh_result, diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 6c166c0..d33dc56 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -216,7 +216,9 @@ extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); extern int ext4_extent_tree_init(handle_t *, struct inode *); -extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); extern int ext4_ext_try_to_merge(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index eb8bc3a..b455c68 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -51,6 +51,14 @@ EXT4_XATTR_TRANS_BLOCKS - 2 + \ 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) +/* + * Define the number of metadata blocks we need to account to modify data. + * + * This include super block, inode block, quota blocks and xattr blocks + */ +#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ + 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + /* Delete operations potentially hit one directory's namespace plus an * entire inode, plus arbitrary amounts of bitmap/indirection data. Be * generous. We can grow the delete transaction later if necessary. */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 612c3d2..b24d3c5 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1747,54 +1747,61 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, } /* - * ext4_ext_calc_credits_for_insert: - * This routine returns max. credits that the extent tree can consume. - * It should be OK for low-performance paths like ->writepage() - * To allow many writing processes to fit into a single transaction, - * the caller should calculate credits under i_data_sem and - * pass the actual path. + * ext4_ext_calc_credits_for_single_extent: + * This routine returns max. credits that needed to insert an extent + * to the extent tree. + * When pass the actual path, the caller should calculate credits + * under i_data_sem. */ -int ext4_ext_calc_credits_for_insert(struct inode *inode, +int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, struct ext4_ext_path *path) { - int depth, needed; - if (path) { + int depth = ext_depth(inode); + int ret = 0; + /* probably there is space in leaf? */ - depth = ext_depth(inode); if (le16_to_cpu(path[depth].p_hdr->eh_entries) - < le16_to_cpu(path[depth].p_hdr->eh_max)) - return 1; - } + < le16_to_cpu(path[depth].p_hdr->eh_max)) { - /* - * given 32-bit logical block (4294967296 blocks), max. tree - * can be 4 levels in depth -- 4 * 340^4 == 53453440000. - * Let's also add one more level for imbalance. - */ - depth = 5; - - /* allocation of new data block(s) */ - needed = 2; + /* + * There are some space in the leaf tree, no + * need to account for leaf block credit + * + * bitmaps and block group descriptor blocks + * and other metadat blocks still need to be + * accounted. + */ + /* 1 bitmap, 1 block group descriptor */ + ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); + } + } - /* - * tree can be full, so it would need to grow in depth: - * we need one credit to modify old root, credits for - * new root will be added in split accounting - */ - needed += 1; + return ext4_chunk_trans_blocks(inode, nrblocks); +} - /* - * Index split can happen, we would need: - * allocate intermediate indexes (bitmap + group) - * + change two blocks at each level, but root (already included) - */ - needed += (depth * 2) + (depth * 2); +/* + * How many index/leaf blocks need to change/allocate to modify nrblocks? + * + * if nrblocks are fit in a single extent (chunk flag is 1), then + * in the worse case, each tree level index/leaf need to be changed + * if the tree split due to insert a new extent, then the old tree + * index/leaf need to be updated too + * + * If the nrblocks are discontiguous, they could cause + * the whole tree split more than once, but this is really rare. + */ +int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int index; + int depth = ext_depth(inode); - /* any allocation modifies superblock */ - needed += 1; + if (chunk) + index = depth * 2; + else + index = depth * 3; - return needed; + return index; } static int ext4_remove_blocks(handle_t *handle, struct inode *inode, @@ -1921,9 +1928,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, correct_index = 1; credits += (ext_depth(inode)) + 1; } -#ifdef CONFIG_QUOTA credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif err = ext4_ext_journal_restart(handle, credits); if (err) @@ -2805,7 +2810,7 @@ void ext4_ext_truncate(struct inode *inode) /* * probably first extent we're gonna free will be last in block */ - err = ext4_writepage_trans_blocks(inode) + 3; + err = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, err); if (IS_ERR(handle)) return; @@ -2819,7 +2824,7 @@ void ext4_ext_truncate(struct inode *inode) down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_invalidate_cache(inode); - ext4_mb_discard_inode_preallocations(inode); + ext4_discard_reservation(inode); /* * TODO: optimization is possible here. @@ -2858,27 +2863,6 @@ out_stop: ext4_journal_stop(handle); } -/* - * ext4_ext_writepage_trans_blocks: - * calculate max number of blocks we could modify - * in order to allocate new block for an inode - */ -int ext4_ext_writepage_trans_blocks(struct inode *inode, int num) -{ - int needed; - - needed = ext4_ext_calc_credits_for_insert(inode, NULL); - - /* caller wants to allocate num blocks, but note it includes sb */ - needed = needed * num - (num - 1); - -#ifdef CONFIG_QUOTA - needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif - - return needed; -} - static void ext4_falloc_update_inode(struct inode *inode, int mode, loff_t new_size, int update_ctime) { @@ -2939,10 +2923,9 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - block; /* - * credits to insert 1 extent into extent tree + buffers to be able to - * modify 1 super block, 1 block bitmap and 1 group descriptor. + * credits to insert 1 extent into extent tree */ - credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; + credits = ext4_chunk_trans_blocks(inode, max_blocks); mutex_lock(&inode->i_mutex); retry: while (ret >= 0 && ret < max_blocks) { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 655e760..f344834 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -351,7 +351,7 @@ find_close_to_parent: goto found_flexbg; } - if (best_flex < 0 || + if (flex_group[best_flex].free_inodes == 0 || (flex_group[i].free_blocks > flex_group[best_flex].free_blocks && flex_group[i].free_inodes)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 59fbbe8..7e91913e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -41,6 +41,8 @@ #include "acl.h" #include "ext4_extents.h" +#define MPAGE_DA_EXTENT_TAIL 0x01 + static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { @@ -1005,6 +1007,9 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) */ static int ext4_calc_metadata_amount(struct inode *inode, int blocks) { + if (!blocks) + return 0; + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) return ext4_ext_calc_metadata_amount(inode, blocks); @@ -1041,18 +1046,6 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); } -/* Maximum number of blocks we map for direct IO at once. */ -#define DIO_MAX_BLOCKS 4096 -/* - * Number of credits we need for writing DIO_MAX_BLOCKS: - * We need sb + group descriptor + bitmap + inode -> 4 - * For B blocks with A block pointers per block we need: - * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect). - * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25. - */ -#define DIO_CREDITS 25 - - /* * The ext4_get_blocks_wrap() function try to look up the requested blocks, * and returns if the blocks are already mapped. @@ -1164,19 +1157,23 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, return retval; } +/* Maximum number of blocks we map for direct IO at once. */ +#define DIO_MAX_BLOCKS 4096 + static int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { handle_t *handle = ext4_journal_current_handle(); int ret = 0, started = 0; unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + int dio_credits; if (create && !handle) { /* Direct IO write... */ if (max_blocks > DIO_MAX_BLOCKS) max_blocks = DIO_MAX_BLOCKS; - handle = ext4_journal_start(inode, DIO_CREDITS + - 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)); + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); + handle = ext4_journal_start(inode, dio_credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -1559,7 +1556,25 @@ static void ext4_da_release_space(struct inode *inode, int to_free) struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int total, mdb, mdb_free, release; + if (!to_free) + return; /* Nothing to release, exit */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + + if (!EXT4_I(inode)->i_reserved_data_blocks) { + /* + * if there is no reserved blocks, but we try to free some + * then the counter is messed up somewhere. + * but since this function is called from invalidate + * page, it's harmless to return without any action + */ + printk(KERN_INFO "ext4 delalloc try to release %d reserved " + "blocks for inode %lu, but there is no reserved " + "data blocks\n", to_free, inode->i_ino); + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return; + } + /* recalculate the number of metablocks still need to be reserved */ total = EXT4_I(inode)->i_reserved_data_blocks - to_free; mdb = ext4_calc_metadata_amount(inode, total); @@ -1613,11 +1628,13 @@ struct mpage_da_data { unsigned long first_page, next_page; /* extent of pages */ get_block_t *get_block; struct writeback_control *wbc; + int io_done; + long pages_written; }; /* * mpage_da_submit_io - walks through extent of pages and try to write - * them with __mpage_writepage() + * them with writepage() call back * * @mpd->inode: inode * @mpd->first_page: first page of the extent @@ -1632,18 +1649,11 @@ struct mpage_da_data { static int mpage_da_submit_io(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; - struct mpage_data mpd_pp = { - .bio = NULL, - .last_block_in_bio = 0, - .get_block = mpd->get_block, - .use_writepage = 1, - }; int ret = 0, err, nr_pages, i; unsigned long index, end; struct pagevec pvec; BUG_ON(mpd->next_page <= mpd->first_page); - pagevec_init(&pvec, 0); index = mpd->first_page; end = mpd->next_page - 1; @@ -1661,8 +1671,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) break; index++; - err = __mpage_writepage(page, mpd->wbc, &mpd_pp); - + err = mapping->a_ops->writepage(page, mpd->wbc); + if (!err) + mpd->pages_written++; /* * In error case, we have to continue because * remaining pages are still locked @@ -1673,9 +1684,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) } pagevec_release(&pvec); } - if (mpd_pp.bio) - mpage_bio_submit(WRITE, mpd_pp.bio); - return ret; } @@ -1698,7 +1706,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, int blocks = exbh->b_size >> inode->i_blkbits; sector_t pblock = exbh->b_blocknr, cur_logical; struct buffer_head *head, *bh; - unsigned long index, end; + pgoff_t index, end; struct pagevec pvec; int nr_pages, i; @@ -1741,6 +1749,13 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, if (buffer_delay(bh)) { bh->b_blocknr = pblock; clear_buffer_delay(bh); + bh->b_bdev = inode->i_sb->s_bdev; + } else if (buffer_unwritten(bh)) { + bh->b_blocknr = pblock; + clear_buffer_unwritten(bh); + set_buffer_mapped(bh); + set_buffer_new(bh); + bh->b_bdev = inode->i_sb->s_bdev; } else if (buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); @@ -1776,13 +1791,11 @@ static inline void __unmap_underlying_blocks(struct inode *inode, * * The function skips space we know is already mapped to disk blocks. * - * The function ignores errors ->get_block() returns, thus real - * error handling is postponed to __mpage_writepage() */ static void mpage_da_map_blocks(struct mpage_da_data *mpd) { + int err = 0; struct buffer_head *lbh = &mpd->lbh; - int err = 0, remain = lbh->b_size; sector_t next = lbh->b_blocknr; struct buffer_head new; @@ -1792,38 +1805,36 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd) if (buffer_mapped(lbh) && !buffer_delay(lbh)) return; - while (remain) { - new.b_state = lbh->b_state; - new.b_blocknr = 0; - new.b_size = remain; - err = mpd->get_block(mpd->inode, next, &new, 1); - if (err) { - /* - * Rather than implement own error handling - * here, we just leave remaining blocks - * unallocated and try again with ->writepage() - */ - break; - } - BUG_ON(new.b_size == 0); + new.b_state = lbh->b_state; + new.b_blocknr = 0; + new.b_size = lbh->b_size; - if (buffer_new(&new)) - __unmap_underlying_blocks(mpd->inode, &new); + /* + * If we didn't accumulate anything + * to write simply return + */ + if (!new.b_size) + return; + err = mpd->get_block(mpd->inode, next, &new, 1); + if (err) + return; + BUG_ON(new.b_size == 0); - /* - * If blocks are delayed marked, we need to - * put actual blocknr and drop delayed bit - */ - if (buffer_delay(lbh)) - mpage_put_bnr_to_bhs(mpd, next, &new); + if (buffer_new(&new)) + __unmap_underlying_blocks(mpd->inode, &new); - /* go for the remaining blocks */ - next += new.b_size >> mpd->inode->i_blkbits; - remain -= new.b_size; - } + /* + * If blocks are delayed marked, we need to + * put actual blocknr and drop delayed bit + */ + if (buffer_delay(lbh) || buffer_unwritten(lbh)) + mpage_put_bnr_to_bhs(mpd, next, &new); + + return; } -#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ + (1 << BH_Delay) | (1 << BH_Unwritten)) /* * mpage_add_bh_to_extent - try to add one more block to extent of blocks @@ -1837,41 +1848,61 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd) static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical, struct buffer_head *bh) { - struct buffer_head *lbh = &mpd->lbh; sector_t next; + size_t b_size = bh->b_size; + struct buffer_head *lbh = &mpd->lbh; + int nrblocks = lbh->b_size >> mpd->inode->i_blkbits; - next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits); - + /* check if thereserved journal credits might overflow */ + if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { + if (nrblocks >= EXT4_MAX_TRANS_DATA) { + /* + * With non-extent format we are limited by the journal + * credit available. Total credit needed to insert + * nrblocks contiguous blocks is dependent on the + * nrblocks. So limit nrblocks. + */ + goto flush_it; + } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > + EXT4_MAX_TRANS_DATA) { + /* + * Adding the new buffer_head would make it cross the + * allowed limit for which we have journal credit + * reserved. So limit the new bh->b_size + */ + b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << + mpd->inode->i_blkbits; + /* we will do mpage_da_submit_io in the next loop */ + } + } /* * First block in the extent */ if (lbh->b_size == 0) { lbh->b_blocknr = logical; - lbh->b_size = bh->b_size; + lbh->b_size = b_size; lbh->b_state = bh->b_state & BH_FLAGS; return; } + next = lbh->b_blocknr + nrblocks; /* * Can we merge the block to our big extent? */ if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { - lbh->b_size += bh->b_size; + lbh->b_size += b_size; return; } +flush_it: /* * We couldn't merge the block to our extent, so we * need to flush current extent and start new one */ mpage_da_map_blocks(mpd); - - /* - * Now start a new extent - */ - lbh->b_size = bh->b_size; - lbh->b_state = bh->b_state & BH_FLAGS; - lbh->b_blocknr = logical; + mpage_da_submit_io(mpd); + mpd->io_done = 1; + return; } /* @@ -1891,17 +1922,35 @@ static int __mpage_da_writepage(struct page *page, struct buffer_head *bh, *head, fake; sector_t logical; + if (mpd->io_done) { + /* + * Rest of the page in the page_vec + * redirty then and skip then. We will + * try to to write them again after + * starting a new transaction + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return MPAGE_DA_EXTENT_TAIL; + } /* * Can we merge this page to current extent? */ if (mpd->next_page != page->index) { /* * Nope, we can't. So, we map non-allocated blocks - * and start IO on them using __mpage_writepage() + * and start IO on them using writepage() */ if (mpd->next_page != mpd->first_page) { mpage_da_map_blocks(mpd); mpage_da_submit_io(mpd); + /* + * skip rest of the page in the page_vec + */ + mpd->io_done = 1; + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return MPAGE_DA_EXTENT_TAIL; } /* @@ -1932,6 +1981,8 @@ static int __mpage_da_writepage(struct page *page, set_buffer_dirty(bh); set_buffer_uptodate(bh); mpage_add_bh_to_extent(mpd, logical, bh); + if (mpd->io_done) + return MPAGE_DA_EXTENT_TAIL; } else { /* * Page with regular buffer heads, just add all dirty ones @@ -1940,8 +1991,12 @@ static int __mpage_da_writepage(struct page *page, bh = head; do { BUG_ON(buffer_locked(bh)); - if (buffer_dirty(bh)) + if (buffer_dirty(bh) && + (!buffer_mapped(bh) || buffer_delay(bh))) { mpage_add_bh_to_extent(mpd, logical, bh); + if (mpd->io_done) + return MPAGE_DA_EXTENT_TAIL; + } logical++; } while ((bh = bh->b_this_page) != head); } @@ -1960,22 +2015,13 @@ static int __mpage_da_writepage(struct page *page, * * This is a library function, which implements the writepages() * address_space_operation. - * - * In order to avoid duplication of logic that deals with partial pages, - * multiple bio per page, etc, we find non-allocated blocks, allocate - * them with minimal calls to ->get_block() and re-use __mpage_writepage() - * - * It's important that we call __mpage_writepage() only once for each - * involved page, otherwise we'd have to implement more complicated logic - * to deal with pages w/o PG_lock or w/ PG_writeback and so on. - * - * See comments to mpage_writepages() */ static int mpage_da_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block) { struct mpage_da_data mpd; + long to_write; int ret; if (!get_block) @@ -1989,17 +2035,22 @@ static int mpage_da_writepages(struct address_space *mapping, mpd.first_page = 0; mpd.next_page = 0; mpd.get_block = get_block; + mpd.io_done = 0; + mpd.pages_written = 0; + + to_write = wbc->nr_to_write; ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); /* * Handle last extent of pages */ - if (mpd.next_page != mpd.first_page) { + if (!mpd.io_done && mpd.next_page != mpd.first_page) { mpage_da_map_blocks(&mpd); mpage_da_submit_io(&mpd); } + wbc->nr_to_write = to_write - mpd.pages_written; return ret; } @@ -2204,63 +2255,95 @@ static int ext4_da_writepage(struct page *page, } /* - * For now just follow the DIO way to estimate the max credits - * needed to write out EXT4_MAX_WRITEBACK_PAGES. - * todo: need to calculate the max credits need for - * extent based files, currently the DIO credits is based on - * indirect-blocks mapping way. - * - * Probably should have a generic way to calculate credits - * for DIO, writepages, and truncate + * This is called via ext4_da_writepages() to + * calulate the total number of credits to reserve to fit + * a single extent allocation into a single transaction, + * ext4_da_writpeages() will loop calling this before + * the block allocation. */ -#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS -#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS + +static int ext4_da_writepages_trans_blocks(struct inode *inode) +{ + int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; + + /* + * With non-extent format the journal credit needed to + * insert nrblocks contiguous block is dependent on + * number of contiguous block. So we will limit + * number of contiguous block to a sane value + */ + if (!(inode->i_flags & EXT4_EXTENTS_FL) && + (max_blocks > EXT4_MAX_TRANS_DATA)) + max_blocks = EXT4_MAX_TRANS_DATA; + + return ext4_chunk_trans_blocks(inode, max_blocks); +} static int ext4_da_writepages(struct address_space *mapping, - struct writeback_control *wbc) + struct writeback_control *wbc) { - struct inode *inode = mapping->host; handle_t *handle = NULL; - int needed_blocks; - int ret = 0; - long to_write; loff_t range_start = 0; + struct inode *inode = mapping->host; + int needed_blocks, ret = 0, nr_to_writebump = 0; + long to_write, pages_skipped = 0; + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); /* * No pages to write? This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() * because that could violate lock ordering on umount */ - if (!mapping->nrpages) + if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; - /* - * Estimate the worse case needed credits to write out - * EXT4_MAX_BUF_BLOCKS pages + * Make sure nr_to_write is >= sbi->s_mb_stream_request + * This make sure small files blocks are allocated in + * single attempt. This ensure that small files + * get less fragmented. */ - needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; + if (wbc->nr_to_write < sbi->s_mb_stream_request) { + nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; + wbc->nr_to_write = sbi->s_mb_stream_request; + } - to_write = wbc->nr_to_write; - if (!wbc->range_cyclic) { + if (!wbc->range_cyclic) /* * If range_cyclic is not set force range_cont * and save the old writeback_index */ wbc->range_cont = 1; - range_start = wbc->range_start; - } - while (!ret && to_write) { + range_start = wbc->range_start; + pages_skipped = wbc->pages_skipped; + +restart_loop: + to_write = wbc->nr_to_write; + while (!ret && to_write > 0) { + + /* + * we insert one extent at a time. So we need + * credit needed for single extent allocation. + * journalled mode is currently not supported + * by delalloc + */ + BUG_ON(ext4_should_journal_data(inode)); + needed_blocks = ext4_da_writepages_trans_blocks(inode); + /* start a new transaction*/ handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); + printk(KERN_EMERG "%s: jbd2_start: " + "%ld pages, ino %lu; err %d\n", __func__, + wbc->nr_to_write, inode->i_ino, ret); + dump_stack(); goto out_writepages; } if (ext4_should_order_data(inode)) { /* * With ordered mode we need to add - * the inode to the journal handle + * the inode to the journal handl * when we do block allocation. */ ret = ext4_jbd2_file_inode(handle, inode); @@ -2268,20 +2351,20 @@ static int ext4_da_writepages(struct address_space *mapping, ext4_journal_stop(handle); goto out_writepages; } - } - /* - * set the max dirty pages could be write at a time - * to fit into the reserved transaction credits - */ - if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES) - wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES; to_write -= wbc->nr_to_write; ret = mpage_da_writepages(mapping, wbc, - ext4_da_get_block_write); + ext4_da_get_block_write); ext4_journal_stop(handle); - if (wbc->nr_to_write) { + if (ret == MPAGE_DA_EXTENT_TAIL) { + /* + * got one extent now try with + * rest of the pages + */ + to_write += wbc->nr_to_write; + ret = 0; + } else if (wbc->nr_to_write) { /* * There is no more writeout needed * or we requested for a noblocking writeout @@ -2293,10 +2376,18 @@ static int ext4_da_writepages(struct address_space *mapping, wbc->nr_to_write = to_write; } -out_writepages: - wbc->nr_to_write = to_write; - if (range_start) + if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) { + /* We skipped pages in this loop */ wbc->range_start = range_start; + wbc->nr_to_write = to_write + + wbc->pages_skipped - pages_skipped; + wbc->pages_skipped = pages_skipped; + goto restart_loop; + } + +out_writepages: + wbc->nr_to_write = to_write - nr_to_writebump; + wbc->range_start = range_start; return ret; } @@ -3486,6 +3577,9 @@ void ext4_truncate(struct inode *inode) * modify the block allocation tree. */ down_write(&ei->i_data_sem); + + ext4_discard_reservation(inode); + /* * The orphan list entry will now protect us from any crash which * occurs before the truncate completes, so it is now safe to propagate @@ -3555,8 +3649,6 @@ do_indirects: ; } - ext4_discard_reservation(inode); - up_write(&ei->i_data_sem); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); @@ -4324,57 +4416,129 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } +static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, + int chunk) +{ + int indirects; + + /* if nrblocks are contiguous */ + if (chunk) { + /* + * With N contiguous data blocks, it need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks + * 2 dindirect blocks + * 1 tindirect block + */ + indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); + return indirects + 3; + } + /* + * if nrblocks are not contiguous, worse case, each block touch + * a indirect block, and each indirect block touch a double indirect + * block, plus a triple indirect block + */ + indirects = nrblocks * 2 + 1; + return indirects; +} + +static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return ext4_indirect_trans_blocks(inode, nrblocks, 0); + return ext4_ext_index_trans_blocks(inode, nrblocks, 0); +} /* - * How many blocks doth make a writepage()? - * - * With N blocks per page, it may be: - * N data blocks - * 2 indirect block - * 2 dindirect - * 1 tindirect - * N+5 bitmap blocks (from the above) - * N+5 group descriptor summary blocks - * 1 inode block - * 1 superblock. - * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files + * Account for index blocks, block groups bitmaps and block group + * descriptor blocks if modify datablocks and index blocks + * worse case, the indexs blocks spread over different block groups * - * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS + * If datablocks are discontiguous, they are possible to spread over + * different block groups too. If they are contiugous, with flexbg, + * they could still across block group boundary. * - * With ordered or writeback data it's the same, less the N data blocks. + * Also account for superblock, inode, quota and xattr blocks + */ +int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int groups, gdpblocks; + int idxblocks; + int ret = 0; + + /* + * How many index blocks need to touch to modify nrblocks? + * The "Chunk" flag indicating whether the nrblocks is + * physically contiguous on disk + * + * For Direct IO and fallocate, they calls get_block to allocate + * one single extent at a time, so they could set the "Chunk" flag + */ + idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); + + ret = idxblocks; + + /* + * Now let's see how many group bitmaps and group descriptors need + * to account + */ + groups = idxblocks; + if (chunk) + groups += 1; + else + groups += nrblocks; + + gdpblocks = groups; + if (groups > EXT4_SB(inode->i_sb)->s_groups_count) + groups = EXT4_SB(inode->i_sb)->s_groups_count; + if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) + gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; + + /* bitmaps and block group descriptor blocks */ + ret += groups + gdpblocks; + + /* Blocks for super block, inode, quota and xattr blocks */ + ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); + + return ret; +} + +/* + * Calulate the total number of credits to reserve to fit + * the modification of a single pages into a single transaction, + * which may include multiple chunks of block allocations. * - * If the inode's direct blocks can hold an integral number of pages then a - * page cannot straddle two indirect blocks, and we can only touch one indirect - * and dindirect block, and the "5" above becomes "3". + * This could be called via ext4_write_begin() * - * This still overestimates under most circumstances. If we were to pass the - * start and end offsets in here as well we could do block_to_path() on each - * block and work out the exact number of indirects which are touched. Pah. + * We need to consider the worse case, when + * one new block per extent. */ - int ext4_writepage_trans_blocks(struct inode *inode) { int bpp = ext4_journal_blocks_per_page(inode); - int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3; int ret; - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) - return ext4_ext_writepage_trans_blocks(inode, bpp); + ret = ext4_meta_trans_blocks(inode, bpp, 0); + /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) - ret = 3 * (bpp + indirects) + 2; - else - ret = 2 * (bpp + indirects) + 2; - -#ifdef CONFIG_QUOTA - /* We know that structure was already allocated during DQUOT_INIT so - * we will be updating only the data blocks + inodes */ - ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif - + ret += bpp; return ret; } /* + * Calculate the journal credits for a chunk of data modification. + * + * This is called from DIO, fallocate or whoever calling + * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks. + * + * journal buffers for data blocks are not included here, as DIO + * and fallocate do no need to journal data buffers. + */ +int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) +{ + return ext4_meta_trans_blocks(inode, nrblocks, 1); +} + +/* * The caller must have previously called ext4_reserve_inode_write(). * Give this, we know that the caller already has write access to iloc->bh. */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 865e9dd..e0e3a5e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3282,6 +3282,35 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, } /* + * Return the prealloc space that have minimal distance + * from the goal block. @cpa is the prealloc + * space that is having currently known minimal distance + * from the goal block. + */ +static struct ext4_prealloc_space * +ext4_mb_check_group_pa(ext4_fsblk_t goal_block, + struct ext4_prealloc_space *pa, + struct ext4_prealloc_space *cpa) +{ + ext4_fsblk_t cur_distance, new_distance; + + if (cpa == NULL) { + atomic_inc(&pa->pa_count); + return pa; + } + cur_distance = abs(goal_block - cpa->pa_pstart); + new_distance = abs(goal_block - pa->pa_pstart); + + if (cur_distance < new_distance) + return cpa; + + /* drop the previous reference */ + atomic_dec(&cpa->pa_count); + atomic_inc(&pa->pa_count); + return pa; +} + +/* * search goal blocks in preallocated space */ static noinline_for_stack int @@ -3290,7 +3319,8 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; - struct ext4_prealloc_space *pa; + struct ext4_prealloc_space *pa, *cpa = NULL; + ext4_fsblk_t goal_block; /* only data can be preallocated */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) @@ -3333,6 +3363,13 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; + goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + + ac->ac_g_ex.fe_start + + le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block); + /* + * search for the prealloc space that is having + * minimal distance from the goal block. + */ for (i = order; i < PREALLOC_TB_SIZE; i++) { rcu_read_lock(); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], @@ -3340,17 +3377,19 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) { - atomic_inc(&pa->pa_count); - ext4_mb_use_group_pa(ac, pa); - spin_unlock(&pa->pa_lock); - ac->ac_criteria = 20; - rcu_read_unlock(); - return 1; + + cpa = ext4_mb_check_group_pa(goal_block, + pa, cpa); } spin_unlock(&pa->pa_lock); } rcu_read_unlock(); } + if (cpa) { + ext4_mb_use_group_pa(ac, cpa); + ac->ac_criteria = 20; + return 1; + } return 0; } diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b9e077b..46fc0b5 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -53,7 +53,8 @@ static int finish_range(handle_t *handle, struct inode *inode, * credit. But below we try to not accumalate too much * of them by restarting the journal. */ - needed = ext4_ext_calc_credits_for_insert(inode, path); + needed = ext4_ext_calc_credits_for_single_extent(inode, + lb->last_block - lb->first_block + 1, path); /* * Make sure the credit we accumalated is not really high diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 0a92651..b3d3560 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -773,7 +773,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (reserved_gdb || gdb_off == 0) { if (!EXT4_HAS_COMPAT_FEATURE(sb, - EXT4_FEATURE_COMPAT_RESIZE_INODE)){ + EXT4_FEATURE_COMPAT_RESIZE_INODE) + || !le16_to_cpu(es->s_reserved_gdt_blocks)) { ext4_warning(sb, __func__, "No reserved GDT blocks, can't resize"); return -EPERM; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d5d7795..566344b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -568,6 +568,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) #endif ei->i_block_alloc_info = NULL; ei->vfs_inode.i_version = 1; + ei->vfs_inode.i_data.writeback_index = 0; memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 6d266d7..80ff338 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -562,26 +562,23 @@ static int fat_write_inode(struct inode *inode, int wait) struct buffer_head *bh; struct msdos_dir_entry *raw_entry; loff_t i_pos; - int err = 0; + int err; retry: i_pos = MSDOS_I(inode)->i_pos; if (inode->i_ino == MSDOS_ROOT_INO || !i_pos) return 0; - lock_super(sb); bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); if (!bh) { printk(KERN_ERR "FAT: unable to read inode block " "for updating (i_pos %lld)\n", i_pos); - err = -EIO; - goto out; + return -EIO; } spin_lock(&sbi->inode_hash_lock); if (i_pos != MSDOS_I(inode)->i_pos) { spin_unlock(&sbi->inode_hash_lock); brelse(bh); - unlock_super(sb); goto retry; } @@ -607,11 +604,10 @@ retry: } spin_unlock(&sbi->inode_hash_lock); mark_buffer_dirty(bh); + err = 0; if (wait) err = sync_dirty_buffer(bh); brelse(bh); -out: - unlock_super(sb); return err; } @@ -166,6 +166,7 @@ static struct inode *alloc_inode(struct super_block *sb) mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); mapping->assoc_mapping = NULL; mapping->backing_dev_info = &default_backing_dev_info; + mapping->writeback_index = 0; /* * If the block_device provides a backing_dev_info for client diff --git a/fs/inotify_user.c b/fs/inotify_user.c index 6024942..d85c7d9 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -323,7 +323,7 @@ out: } /* - * remove_kevent - cleans up and ultimately frees the given kevent + * remove_kevent - cleans up the given kevent * * Caller must hold dev->ev_mutex. */ @@ -334,7 +334,13 @@ static void remove_kevent(struct inotify_device *dev, dev->event_count--; dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len; +} +/* + * free_kevent - frees the given kevent. + */ +static void free_kevent(struct inotify_kernel_event *kevent) +{ kfree(kevent->name); kmem_cache_free(event_cachep, kevent); } @@ -350,6 +356,7 @@ static void inotify_dev_event_dequeue(struct inotify_device *dev) struct inotify_kernel_event *kevent; kevent = inotify_dev_get_event(dev); remove_kevent(dev, kevent); + free_kevent(kevent); } } @@ -433,17 +440,15 @@ static ssize_t inotify_read(struct file *file, char __user *buf, dev = file->private_data; while (1) { - int events; prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); mutex_lock(&dev->ev_mutex); - events = !list_empty(&dev->events); - mutex_unlock(&dev->ev_mutex); - if (events) { + if (!list_empty(&dev->events)) { ret = 0; break; } + mutex_unlock(&dev->ev_mutex); if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; @@ -462,7 +467,6 @@ static ssize_t inotify_read(struct file *file, char __user *buf, if (ret) return ret; - mutex_lock(&dev->ev_mutex); while (1) { struct inotify_kernel_event *kevent; @@ -481,6 +485,13 @@ static ssize_t inotify_read(struct file *file, char __user *buf, } break; } + remove_kevent(dev, kevent); + + /* + * Must perform the copy_to_user outside the mutex in order + * to avoid a lock order reversal with mmap_sem. + */ + mutex_unlock(&dev->ev_mutex); if (copy_to_user(buf, &kevent->event, event_size)) { ret = -EFAULT; @@ -498,7 +509,9 @@ static ssize_t inotify_read(struct file *file, char __user *buf, count -= kevent->event.len; } - remove_kevent(dev, kevent); + free_kevent(kevent); + + mutex_lock(&dev->ev_mutex); } mutex_unlock(&dev->ev_mutex); diff --git a/fs/ioprio.c b/fs/ioprio.c index c4a1c3c..da3cc46 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -115,11 +115,11 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) pgrp = task_pgrp(current); else pgrp = find_vpid(who); - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { ret = set_task_ioprio(p, ioprio); if (ret) break; - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case IOPRIO_WHO_USER: if (!who) @@ -204,7 +204,7 @@ asmlinkage long sys_ioprio_get(int which, int who) pgrp = task_pgrp(current); else pgrp = find_vpid(who); - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { tmpio = get_task_ioprio(p); if (tmpio < 0) continue; @@ -212,7 +212,7 @@ asmlinkage long sys_ioprio_get(int which, int who) ret = tmpio; else ret = ioprio_best(ret, tmpio); - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case IOPRIO_WHO_USER: if (!who) diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h index 31559f4..4c41db9 100644 --- a/fs/jffs2/jffs2_fs_i.h +++ b/fs/jffs2/jffs2_fs_i.h @@ -12,7 +12,6 @@ #ifndef _JFFS2_FS_I #define _JFFS2_FS_I -#include <linux/version.h> #include <linux/rbtree.h> #include <linux/posix_acl.h> #include <linux/mutex.h> diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 9abcd2b..e9b2017 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1279,6 +1279,12 @@ static int nfs_parse_mount_options(char *raw, } } + if (errors > 0) { + dfprintk(MOUNT, "NFS: parsing encountered %d error%s\n", + errors, (errors == 1 ? "" : "s")); + if (!sloppy) + return 0; + } return 1; out_nomem: diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index b6ed383..54b8b41 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -443,7 +443,7 @@ init_state(struct posix_acl_state *state, int cnt) * enough space for either: */ alloc = sizeof(struct posix_ace_state_array) - + cnt*sizeof(struct posix_ace_state); + + cnt*sizeof(struct posix_user_ace_state); state->users = kzalloc(alloc, GFP_KERNEL); if (!state->users) return -ENOMEM; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 2e51ada..e5b51ff 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -867,11 +867,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, int slack_bytes; __be32 status; - status = nfserr_resource; - cstate = cstate_alloc(); - if (cstate == NULL) - goto out; - resp->xbuf = &rqstp->rq_res; resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len; resp->tagp = resp->p; @@ -890,6 +885,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION) goto out; + status = nfserr_resource; + cstate = cstate_alloc(); + if (cstate == NULL) + goto out; + status = nfs_ok; while (!status && resp->opcnt < args->opcnt) { op = &args->ops[resp->opcnt++]; @@ -957,9 +957,9 @@ encode_op: nfsd4_increment_op_stats(op->opnum); } + cstate_free(cstate); out: nfsd4_release_compoundargs(args); - cstate_free(cstate); dprintk("nfsv4 compound returned %d\n", ntohl(status)); return status; } diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index e1781c8..9e8a95b 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -174,7 +174,6 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, // TODO: Consider moving this lot to a separate function! (AIA) handle_name: { - struct dentry *real_dent, *new_dent; MFT_RECORD *m; ntfs_attr_search_ctx *ctx; ntfs_inode *ni = NTFS_I(dent_inode); @@ -255,93 +254,9 @@ handle_name: } nls_name.hash = full_name_hash(nls_name.name, nls_name.len); - /* - * Note: No need for dent->d_lock lock as i_mutex is held on the - * parent inode. - */ - - /* Does a dentry matching the nls_name exist already? */ - real_dent = d_lookup(dent->d_parent, &nls_name); - /* If not, create it now. */ - if (!real_dent) { - real_dent = d_alloc(dent->d_parent, &nls_name); - kfree(nls_name.name); - if (!real_dent) { - err = -ENOMEM; - goto err_out; - } - new_dent = d_splice_alias(dent_inode, real_dent); - if (new_dent) - dput(real_dent); - else - new_dent = real_dent; - ntfs_debug("Done. (Created new dentry.)"); - return new_dent; - } + dent = d_add_ci(dent, dent_inode, &nls_name); kfree(nls_name.name); - /* Matching dentry exists, check if it is negative. */ - if (real_dent->d_inode) { - if (unlikely(real_dent->d_inode != dent_inode)) { - /* This can happen because bad inodes are unhashed. */ - BUG_ON(!is_bad_inode(dent_inode)); - BUG_ON(!is_bad_inode(real_dent->d_inode)); - } - /* - * Already have the inode and the dentry attached, decrement - * the reference count to balance the ntfs_iget() we did - * earlier on. We found the dentry using d_lookup() so it - * cannot be disconnected and thus we do not need to worry - * about any NFS/disconnectedness issues here. - */ - iput(dent_inode); - ntfs_debug("Done. (Already had inode and dentry.)"); - return real_dent; - } - /* - * Negative dentry: instantiate it unless the inode is a directory and - * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED), - * in which case d_move() that in place of the found dentry. - */ - if (!S_ISDIR(dent_inode->i_mode)) { - /* Not a directory; everything is easy. */ - d_instantiate(real_dent, dent_inode); - ntfs_debug("Done. (Already had negative file dentry.)"); - return real_dent; - } - spin_lock(&dcache_lock); - if (list_empty(&dent_inode->i_dentry)) { - /* - * Directory without a 'disconnected' dentry; we need to do - * d_instantiate() by hand because it takes dcache_lock which - * we already hold. - */ - list_add(&real_dent->d_alias, &dent_inode->i_dentry); - real_dent->d_inode = dent_inode; - spin_unlock(&dcache_lock); - security_d_instantiate(real_dent, dent_inode); - ntfs_debug("Done. (Already had negative directory dentry.)"); - return real_dent; - } - /* - * Directory with a 'disconnected' dentry; get a reference to the - * 'disconnected' dentry. - */ - new_dent = list_entry(dent_inode->i_dentry.next, struct dentry, - d_alias); - dget_locked(new_dent); - spin_unlock(&dcache_lock); - /* Do security vodoo. */ - security_d_instantiate(real_dent, dent_inode); - /* Move new_dent in place of real_dent. */ - d_move(new_dent, real_dent); - /* Balance the ntfs_iget() we did above. */ - iput(dent_inode); - /* Throw away real_dent. */ - dput(real_dent); - /* Use new_dent as the actual dentry. */ - ntfs_debug("Done. (Already had negative, disconnected directory " - "dentry.)"); - return new_dent; + return dent; eio_err_out: ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk."); diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h index 3a8af75..4087fbd 100644 --- a/fs/ntfs/usnjrnl.h +++ b/fs/ntfs/usnjrnl.h @@ -113,7 +113,7 @@ typedef struct { * Reason flags (32-bit). Cumulative flags describing the change(s) to the * file since it was last opened. I think the names speak for themselves but * if you disagree check out the descriptions in the Linux NTFS project NTFS - * documentation: http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html + * documentation: http://www.linux-ntfs.org/ */ enum { USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001), @@ -145,7 +145,7 @@ typedef le32 USN_REASON_FLAGS; * Source info flags (32-bit). Information about the source of the change(s) * to the file. For detailed descriptions of what these mean, see the Linux * NTFS project NTFS documentation: - * http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html + * http://www.linux-ntfs.org/ */ enum { USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001), diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 506c24f..a53da14 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -594,7 +594,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, goto bail; } - if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { + if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) { ocfs2_error(inode->i_sb, "Inode %llu has a hole at block %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index d8bfa0e..52276c0 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -138,20 +138,20 @@ static int nst_seq_show(struct seq_file *seq, void *v) " message id: %d\n" " message type: %u\n" " message key: 0x%08x\n" - " sock acquiry: %lu.%lu\n" - " send start: %lu.%lu\n" - " wait start: %lu.%lu\n", + " sock acquiry: %lu.%ld\n" + " send start: %lu.%ld\n" + " wait start: %lu.%ld\n", nst, (unsigned long)nst->st_task->pid, (unsigned long)nst->st_task->tgid, nst->st_task->comm, nst->st_node, nst->st_sc, nst->st_id, nst->st_msg_type, nst->st_msg_key, nst->st_sock_time.tv_sec, - (unsigned long)nst->st_sock_time.tv_usec, + (long)nst->st_sock_time.tv_usec, nst->st_send_time.tv_sec, - (unsigned long)nst->st_send_time.tv_usec, + (long)nst->st_send_time.tv_usec, nst->st_status_time.tv_sec, - nst->st_status_time.tv_usec); + (long)nst->st_status_time.tv_usec); } spin_unlock(&o2net_debug_lock); @@ -276,7 +276,7 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) return sc; /* unused, just needs to be null when done */ } -#define TV_SEC_USEC(TV) TV.tv_sec, (unsigned long)TV.tv_usec +#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec static int sc_seq_show(struct seq_file *seq, void *v) { @@ -309,12 +309,12 @@ static int sc_seq_show(struct seq_file *seq, void *v) " remote node: %s\n" " page off: %zu\n" " handshake ok: %u\n" - " timer: %lu.%lu\n" - " data ready: %lu.%lu\n" - " advance start: %lu.%lu\n" - " advance stop: %lu.%lu\n" - " func start: %lu.%lu\n" - " func stop: %lu.%lu\n" + " timer: %lu.%ld\n" + " data ready: %lu.%ld\n" + " advance start: %lu.%ld\n" + " advance stop: %lu.%ld\n" + " func start: %lu.%ld\n" + " func stop: %lu.%ld\n" " func key: %u\n" " func type: %u\n", sc, diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index a27d615..2bcf706 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -143,8 +143,8 @@ static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); #ifdef CONFIG_DEBUG_FS -void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, - u32 msgkey, struct task_struct *task, u8 node) +static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, + u32 msgkey, struct task_struct *task, u8 node) { INIT_LIST_HEAD(&nst->st_net_debug_item); nst->st_task = task; @@ -153,31 +153,61 @@ void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, nst->st_node = node; } -void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) { do_gettimeofday(&nst->st_sock_time); } -void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +static void o2net_set_nst_send_time(struct o2net_send_tracking *nst) { do_gettimeofday(&nst->st_send_time); } -void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +static void o2net_set_nst_status_time(struct o2net_send_tracking *nst) { do_gettimeofday(&nst->st_status_time); } -void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, +static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, struct o2net_sock_container *sc) { nst->st_sc = sc; } -void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) +static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) { nst->st_id = msg_id; } + +#else /* CONFIG_DEBUG_FS */ + +static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, + u32 msgkey, struct task_struct *task, u8 node) +{ +} + +static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +{ +} + +static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +{ +} + +static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +{ +} + +static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc) +{ +} + +static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, + u32 msg_id) +{ +} + #endif /* CONFIG_DEBUG_FS */ static inline int o2net_reconnect_delay(void) diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 18307ff..8d58cfe 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -224,42 +224,10 @@ struct o2net_send_tracking { struct timeval st_send_time; struct timeval st_status_time; }; - -void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, - u32 msgkey, struct task_struct *task, u8 node); -void o2net_set_nst_sock_time(struct o2net_send_tracking *nst); -void o2net_set_nst_send_time(struct o2net_send_tracking *nst); -void o2net_set_nst_status_time(struct o2net_send_tracking *nst); -void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, - struct o2net_sock_container *sc); -void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id); - #else struct o2net_send_tracking { u32 dummy; }; - -static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, - u32 msgkey, struct task_struct *task, u8 node) -{ -} -static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) -{ -} -static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) -{ -} -static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) -{ -} -static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, - struct o2net_sock_container *sc) -{ -} -static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, - u32 msg_id) -{ -} #endif /* CONFIG_DEBUG_FS */ #endif /* O2CLUSTER_TCP_INTERNAL_H */ diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 8a18758..9cce563 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1300,7 +1300,6 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, di->i_size = cpu_to_le64(sb->s_blocksize); di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); - dir->i_blocks = ocfs2_inode_sector_count(dir); /* * This should never fail as our extent list is empty and all @@ -1310,9 +1309,15 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, NULL); if (ret) { mlog_errno(ret); - goto out; + goto out_commit; } + /* + * Set i_blocks after the extent insert for the most up to + * date ip_clusters value. + */ + dir->i_blocks = ocfs2_inode_sector_count(dir); + ret = ocfs2_journal_dirty(handle, di_bh); if (ret) { mlog_errno(ret); @@ -1336,7 +1341,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, len, 0, NULL); if (ret) { mlog_errno(ret); - goto out; + goto out_commit; } } diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 7a37240..c47bc2a 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1418,13 +1418,13 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) { unsigned int node_num; int status, i; + u32 gen; struct buffer_head *bh = NULL; struct ocfs2_dinode *di; /* This is called with the super block cluster lock, so we * know that the slot map can't change underneath us. */ - spin_lock(&osb->osb_lock); for (i = 0; i < osb->max_slots; i++) { /* Read journal inode to get the recovery generation */ status = ocfs2_read_journal_inode(osb, i, &bh, NULL); @@ -1433,23 +1433,31 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) goto bail; } di = (struct ocfs2_dinode *)bh->b_data; - osb->slot_recovery_generations[i] = - ocfs2_get_recovery_generation(di); + gen = ocfs2_get_recovery_generation(di); brelse(bh); bh = NULL; + spin_lock(&osb->osb_lock); + osb->slot_recovery_generations[i] = gen; + mlog(0, "Slot %u recovery generation is %u\n", i, osb->slot_recovery_generations[i]); - if (i == osb->slot_num) + if (i == osb->slot_num) { + spin_unlock(&osb->osb_lock); continue; + } status = ocfs2_slot_to_node_num_locked(osb, i, &node_num); - if (status == -ENOENT) + if (status == -ENOENT) { + spin_unlock(&osb->osb_lock); continue; + } - if (__ocfs2_recovery_map_test(osb, node_num)) + if (__ocfs2_recovery_map_test(osb, node_num)) { + spin_unlock(&osb->osb_lock); continue; + } spin_unlock(&osb->osb_lock); /* Ok, we have a slot occupied by another node which @@ -1465,10 +1473,7 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) mlog_errno(status); goto bail; } - - spin_lock(&osb->osb_lock); } - spin_unlock(&osb->osb_lock); status = 0; bail: diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 10e149a..07f348b 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -97,13 +97,14 @@ static int ocfs2_stack_driver_request(const char *stack_name, goto out; } - /* Ok, the stack is pinned */ - p->sp_count++; active_stack = p; - rc = 0; out: + /* If we found it, pin it */ + if (!rc) + active_stack->sp_count++; + spin_unlock(&ocfs2_stack_lock); return rc; } diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c index 697663b..e1c0ec0 100644 --- a/fs/omfs/bitmap.c +++ b/fs/omfs/bitmap.c @@ -92,7 +92,7 @@ int omfs_allocate_block(struct super_block *sb, u64 block) struct buffer_head *bh; struct omfs_sb_info *sbi = OMFS_SB(sb); int bits_per_entry = 8 * sb->s_blocksize; - int map, bit; + unsigned int map, bit; int ret = 0; u64 tmp; @@ -176,7 +176,8 @@ int omfs_clear_range(struct super_block *sb, u64 block, int count) struct omfs_sb_info *sbi = OMFS_SB(sb); int bits_per_entry = 8 * sb->s_blocksize; u64 tmp; - int map, bit, ret; + unsigned int map, bit; + int ret; tmp = block; bit = do_div(tmp, bits_per_entry); diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 7e24990..834b233 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -26,6 +26,13 @@ static int omfs_sync_file(struct file *file, struct dentry *dentry, return err ? -EIO : 0; } +static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset) +{ + return (sbi->s_sys_blocksize - offset - + sizeof(struct omfs_extent)) / + sizeof(struct omfs_extent_entry) + 1; +} + void omfs_make_empty_table(struct buffer_head *bh, int offset) { struct omfs_extent *oe = (struct omfs_extent *) &bh->b_data[offset]; @@ -45,6 +52,7 @@ int omfs_shrink_inode(struct inode *inode) struct buffer_head *bh; u64 next, last; u32 extent_count; + u32 max_extents; int ret; /* traverse extent table, freeing each entry that is greater @@ -62,15 +70,18 @@ int omfs_shrink_inode(struct inode *inode) goto out; oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START); for (;;) { - if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) { - brelse(bh); - goto out; - } + if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) + goto out_brelse; extent_count = be32_to_cpu(oe->e_extent_count); + + if (extent_count > max_extents) + goto out_brelse; + last = next; next = be64_to_cpu(oe->e_next); entry = &oe->e_entry; @@ -98,10 +109,14 @@ int omfs_shrink_inode(struct inode *inode) if (!bh) goto out; oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT); } ret = 0; out: return ret; +out_brelse: + brelse(bh); + return ret; } static void omfs_truncate(struct inode *inode) @@ -154,9 +169,7 @@ static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe, goto out; } } - max_count = (sbi->s_sys_blocksize - OMFS_EXTENT_START - - sizeof(struct omfs_extent)) / - sizeof(struct omfs_extent_entry) + 1; + max_count = omfs_max_extents(sbi, OMFS_EXTENT_START); /* TODO: add a continuation block here */ if (be32_to_cpu(oe->e_extent_count) > max_count-1) @@ -225,6 +238,7 @@ static int omfs_get_block(struct inode *inode, sector_t block, sector_t next, offset; int ret; u64 new_block; + u32 max_extents; int extent_count; struct omfs_extent *oe; struct omfs_extent_entry *entry; @@ -238,6 +252,7 @@ static int omfs_get_block(struct inode *inode, sector_t block, goto out; oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START); next = inode->i_ino; for (;;) { @@ -249,6 +264,9 @@ static int omfs_get_block(struct inode *inode, sector_t block, next = be64_to_cpu(oe->e_next); entry = &oe->e_entry; + if (extent_count > max_extents) + goto out_brelse; + offset = find_block(inode, entry, block, extent_count, &remain); if (offset > 0) { ret = 0; @@ -266,6 +284,7 @@ static int omfs_get_block(struct inode *inode, sector_t block, if (!bh) goto out; oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); + max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT); } if (create) { ret = omfs_grow_extent(inode, oe, &new_block); diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index a95fe59..d29047b 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -232,8 +232,7 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino) inode->i_mode = S_IFDIR | (S_IRWXUGO & ~sbi->s_dmask); inode->i_op = &omfs_dir_inops; inode->i_fop = &omfs_dir_operations; - inode->i_size = be32_to_cpu(oi->i_head.h_body_size) + - sizeof(struct omfs_header); + inode->i_size = sbi->s_sys_blocksize; inc_nlink(inode); break; case OMFS_FILE: diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 7d6b34e..ecc3330 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -499,9 +499,9 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) if (!size) continue; if (from + size > get_capacity(disk)) { - printk(KERN_ERR " %s: p%d exceeds device capacity\n", + printk(KERN_WARNING + "%s: p%d exceeds device capacity\n", disk->disk_name, p); - continue; } res = add_partition(disk, p, from, size, state->parts[p].flags); if (res) { diff --git a/fs/proc/array.c b/fs/proc/array.c index 0d6eb33..71c9be5 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -337,65 +337,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, return 0; } -/* - * Use precise platform statistics if available: - */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -static cputime_t task_utime(struct task_struct *p) -{ - return p->utime; -} - -static cputime_t task_stime(struct task_struct *p) -{ - return p->stime; -} -#else -static cputime_t task_utime(struct task_struct *p) -{ - clock_t utime = cputime_to_clock_t(p->utime), - total = utime + cputime_to_clock_t(p->stime); - u64 temp; - - /* - * Use CFS's precise accounting: - */ - temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); - - if (total) { - temp *= utime; - do_div(temp, total); - } - utime = (clock_t)temp; - - p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); - return p->prev_utime; -} - -static cputime_t task_stime(struct task_struct *p) -{ - clock_t stime; - - /* - * Use CFS's precise accounting. (we subtract utime from - * the total, to make sure the total observed by userspace - * grows monotonically - apps rely on that): - */ - stime = nsec_to_clock_t(p->se.sum_exec_runtime) - - cputime_to_clock_t(task_utime(p)); - - if (stime >= 0) - p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); - - return p->prev_stime; -} -#endif - -static cputime_t task_gtime(struct task_struct *p) -{ - return p->gtime; -} - static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task, int whole) { diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 4fb81e9..7821589 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -330,6 +330,7 @@ retry: spin_lock(&proc_inum_lock); ida_remove(&proc_inum_ida, i); spin_unlock(&proc_inum_lock); + return 0; } return PROC_DYNAMIC_FIRST + i; } @@ -546,8 +547,8 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp for (tmp = dir->subdir; tmp; tmp = tmp->next) if (strcmp(tmp->name, dp->name) == 0) { - printk(KERN_WARNING "proc_dir_entry '%s' already " - "registered\n", dp->name); + printk(KERN_WARNING "proc_dir_entry '%s/%s' already registered\n", + dir->name, dp->name); dump_stack(); break; } diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 79ecd28..3f87d26 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -52,14 +52,14 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) } seq_printf(m, - "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", vma->vm_start, vma->vm_end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', - vma->vm_pgoff << PAGE_SHIFT, + ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, MAJOR(dev), MINOR(dev), ino, &len); if (file) { diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index ded9698..29e20c6 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -24,6 +24,7 @@ #include <linux/tty.h> #include <linux/string.h> #include <linux/mman.h> +#include <linux/quicklist.h> #include <linux/proc_fs.h> #include <linux/ioport.h> #include <linux/mm.h> @@ -182,6 +183,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off, "SReclaimable: %8lu kB\n" "SUnreclaim: %8lu kB\n" "PageTables: %8lu kB\n" +#ifdef CONFIG_QUICKLIST + "Quicklists: %8lu kB\n" +#endif "NFS_Unstable: %8lu kB\n" "Bounce: %8lu kB\n" "WritebackTmp: %8lu kB\n" @@ -214,6 +218,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off, K(global_page_state(NR_SLAB_RECLAIMABLE)), K(global_page_state(NR_SLAB_UNRECLAIMABLE)), K(global_page_state(NR_PAGETABLE)), +#ifdef CONFIG_QUICKLIST + K(quicklist_total_size()), +#endif K(global_page_state(NR_UNSTABLE_NFS)), K(global_page_state(NR_BOUNCE)), K(global_page_state(NR_WRITEBACK_TEMP)), diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7546a91..73d1891 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -219,14 +219,14 @@ static int show_map(struct seq_file *m, void *v) ino = inode->i_ino; } - seq_printf(m, "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", + seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", vma->vm_start, vma->vm_end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? 's' : 'p', - vma->vm_pgoff << PAGE_SHIFT, + ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, MAJOR(dev), MINOR(dev), ino, &len); /* diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 52312ec..5145cb9 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -58,7 +58,7 @@ const struct inode_operations ramfs_file_inode_operations = { * size 0 on the assumption that it's going to be used for an mmap of shared * memory */ -static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) +int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) { struct pagevec lru_pvec; unsigned long npages, xpages, loop, limit; diff --git a/fs/readdir.c b/fs/readdir.c index 4e026e5..93a7559 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -80,8 +80,10 @@ static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset if (buf->result) return -EINVAL; d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->result = -EOVERFLOW; return -EOVERFLOW; + } buf->result++; dirent = buf->dirent; if (!access_ok(VERIFY_WRITE, dirent, @@ -155,8 +157,10 @@ static int filldir(void * __buf, const char * name, int namlen, loff_t offset, if (reclen > buf->count) return -EINVAL; d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->error = -EOVERFLOW; return -EOVERFLOW; + } dirent = buf->previous; if (dirent) { if (__put_user(offset, &dirent->d_off)) diff --git a/fs/seq_file.c b/fs/seq_file.c index 5d54205..bd20f7f 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -108,9 +108,9 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) goto Done; } /* we need at least one record in buffer */ + pos = m->index; + p = m->op->start(m, &pos); while (1) { - pos = m->index; - p = m->op->start(m, &pos); err = PTR_ERR(p); if (!p || IS_ERR(p)) break; @@ -119,6 +119,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) break; if (unlikely(err)) m->count = 0; + if (unlikely(!m->count)) { + p = m->op->next(m, p, &pos); + m->index = pos; + continue; + } if (m->count < m->size) goto Fill; m->op->stop(m, p); @@ -128,6 +133,8 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) goto Enomem; m->count = 0; m->version = 0; + pos = m->index; + p = m->op->start(m, &pos); } m->op->stop(m, p); m->count = 0; diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index d81fb9e..73db464 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -263,8 +263,8 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c) idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; - /* And make sure we have twice the index size of space reserved */ - idx_size <<= 1; + /* And make sure we have thrice the index size of space reserved */ + idx_size = idx_size + (idx_size << 1); /* * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' @@ -302,18 +302,6 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs) int subtract_lebs; long long available; - /* - * Force the amount available to the total size reported if the used - * space is zero. - */ - if (c->lst.total_used <= UBIFS_INO_NODE_SZ && - c->budg_data_growth + c->budg_dd_growth == 0) { - /* Do the same calculation as for c->block_cnt */ - available = c->main_lebs - 2; - available *= c->leb_size - c->dark_wm; - return available; - } - available = c->main_bytes - c->lst.total_used; /* @@ -388,11 +376,11 @@ static int can_use_rp(struct ubifs_info *c) * This function makes sure UBIFS has enough free eraseblocks for index growth * and data. * - * When budgeting index space, UBIFS reserves twice as more LEBs as the index + * When budgeting index space, UBIFS reserves thrice as many LEBs as the index * would take if it was consolidated and written to the flash. This guarantees * that the "in-the-gaps" commit method always succeeds and UBIFS will always * be able to commit dirty index. So this function basically adds amount of - * budgeted index space to the size of the current index, multiplies this by 2, + * budgeted index space to the size of the current index, multiplies this by 3, * and makes sure this does not exceed the amount of free eraseblocks. * * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: @@ -543,8 +531,16 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) int err, idx_growth, data_growth, dd_growth; struct retries_info ri; + ubifs_assert(req->new_page <= 1); + ubifs_assert(req->dirtied_page <= 1); + ubifs_assert(req->new_dent <= 1); + ubifs_assert(req->mod_dent <= 1); + ubifs_assert(req->new_ino <= 1); + ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA); ubifs_assert(req->dirtied_ino <= 4); ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); + ubifs_assert(!(req->new_ino_d & 7)); + ubifs_assert(!(req->dirtied_ino_d & 7)); data_growth = calc_data_growth(c, req); dd_growth = calc_dd_growth(c, req); @@ -618,8 +614,16 @@ again: */ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) { + ubifs_assert(req->new_page <= 1); + ubifs_assert(req->dirtied_page <= 1); + ubifs_assert(req->new_dent <= 1); + ubifs_assert(req->mod_dent <= 1); + ubifs_assert(req->new_ino <= 1); + ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA); ubifs_assert(req->dirtied_ino <= 4); ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); + ubifs_assert(!(req->new_ino_d & 7)); + ubifs_assert(!(req->dirtied_ino_d & 7)); if (!req->recalculate) { ubifs_assert(req->idx_growth >= 0); ubifs_assert(req->data_growth >= 0); @@ -647,7 +651,11 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) ubifs_assert(c->budg_idx_growth >= 0); ubifs_assert(c->budg_data_growth >= 0); + ubifs_assert(c->budg_dd_growth >= 0); ubifs_assert(c->min_idx_lebs < c->main_lebs); + ubifs_assert(!(c->budg_idx_growth & 7)); + ubifs_assert(!(c->budg_data_growth & 7)); + ubifs_assert(!(c->budg_dd_growth & 7)); spin_unlock(&c->space_lock); } @@ -686,41 +694,114 @@ void ubifs_convert_page_budget(struct ubifs_info *c) void ubifs_release_dirty_inode_budget(struct ubifs_info *c, struct ubifs_inode *ui) { - struct ubifs_budget_req req = {.dd_growth = c->inode_budget, - .dirtied_ino_d = ui->data_len}; + struct ubifs_budget_req req; + memset(&req, 0, sizeof(struct ubifs_budget_req)); + req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8); ubifs_release_budget(c, &req); } /** - * ubifs_budg_get_free_space - return amount of free space. + * ubifs_reported_space - calculate reported free space. + * @c: the UBIFS file-system description object + * @free: amount of free space + * + * This function calculates amount of free space which will be reported to + * user-space. User-space application tend to expect that if the file-system + * (e.g., via the 'statfs()' call) reports that it has N bytes available, they + * are able to write a file of size N. UBIFS attaches node headers to each data + * node and it has to write indexind nodes as well. This introduces additional + * overhead, and UBIFS it has to report sligtly less free space to meet the + * above expectetion. + * + * This function assumes free space is made up of uncompressed data nodes and + * full index nodes (one per data node, tripled because we always allow enough + * space to write the index thrice). + * + * Note, the calculation is pessimistic, which means that most of the time + * UBIFS reports less space than it actually has. + */ +long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free) +{ + int divisor, factor, f; + + /* + * Reported space size is @free * X, where X is UBIFS block size + * divided by UBIFS block size + all overhead one data block + * introduces. The overhead is the node header + indexing overhead. + * + * Indexing overhead calculations are based on the following formula: + * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number + * of data nodes, f - fanout. Because effective UBIFS fanout is twice + * as less than maximum fanout, we assume that each data node + * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes. + * Note, the multiplier 3 is because UBIFS reseves thrice as more space + * for the index. + */ + f = c->fanout > 3 ? c->fanout >> 1 : 2; + factor = UBIFS_BLOCK_SIZE; + divisor = UBIFS_MAX_DATA_NODE_SZ; + divisor += (c->max_idx_node_sz * 3) / (f - 1); + free *= factor; + do_div(free, divisor); + return free; +} + +/** + * ubifs_get_free_space - return amount of free space. * @c: UBIFS file-system description object * - * This function returns amount of free space on the file-system. + * This function calculates amount of free space to report to user-space. + * + * Because UBIFS may introduce substantial overhead (the index, node headers, + * alighment, wastage at the end of eraseblocks, etc), it cannot report real + * amount of free flash space it has (well, because not all dirty space is + * reclamable, UBIFS does not actually know the real amount). If UBIFS did so, + * it would bread user expectetion about what free space is. Users seem to + * accustomed to assume that if the file-system reports N bytes of free space, + * they would be able to fit a file of N bytes to the FS. This almost works for + * traditional file-systems, because they have way less overhead than UBIFS. + * So, to keep users happy, UBIFS tries to take the overhead into account. */ -long long ubifs_budg_get_free_space(struct ubifs_info *c) +long long ubifs_get_free_space(struct ubifs_info *c) { - int min_idx_lebs, rsvd_idx_lebs; + int min_idx_lebs, rsvd_idx_lebs, lebs; long long available, outstanding, free; - /* Do exactly the same calculations as in 'do_budget_space()' */ spin_lock(&c->space_lock); min_idx_lebs = ubifs_calc_min_idx_lebs(c); + outstanding = c->budg_data_growth + c->budg_dd_growth; - if (min_idx_lebs > c->lst.idx_lebs) - rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; - else - rsvd_idx_lebs = 0; - - if (rsvd_idx_lebs > c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - - c->lst.taken_empty_lebs) { + /* + * Force the amount available to the total size reported if the used + * space is zero. + */ + if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) { spin_unlock(&c->space_lock); - return 0; + return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT; } available = ubifs_calc_available(c, min_idx_lebs); - outstanding = c->budg_data_growth + c->budg_dd_growth; - c->min_idx_lebs = min_idx_lebs; + + /* + * When reporting free space to user-space, UBIFS guarantees that it is + * possible to write a file of free space size. This means that for + * empty LEBs we may use more precise calculations than + * 'ubifs_calc_available()' is using. Namely, we know that in empty + * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm. + * Thus, amend the available space. + * + * Note, the calculations below are similar to what we have in + * 'do_budget_space()', so refer there for comments. + */ + if (min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + lebs -= rsvd_idx_lebs; + available += lebs * (c->dark_wm - c->leb_overhead); spin_unlock(&c->space_lock); if (available > outstanding) diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index 3b51631..0a6aa2c 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -74,6 +74,7 @@ static int do_commit(struct ubifs_info *c) goto out_up; } + c->cmt_no += 1; err = ubifs_gc_start_commit(c); if (err) goto out_up; @@ -115,7 +116,7 @@ static int do_commit(struct ubifs_info *c) goto out; mutex_lock(&c->mst_mutex); - c->mst_node->cmt_no = cpu_to_le64(++c->cmt_no); + c->mst_node->cmt_no = cpu_to_le64(c->cmt_no); c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum); c->mst_node->root_lnum = cpu_to_le32(zroot.lnum); c->mst_node->root_offs = cpu_to_le32(zroot.offs); diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 4e3aaeb..d7f7645 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -538,7 +538,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n); for (i = 0; i < n; i++) printk(KERN_DEBUG "\t ino %llu\n", - le64_to_cpu(orph->inos[i])); + (unsigned long long)le64_to_cpu(orph->inos[i])); break; } default: @@ -568,8 +568,8 @@ void dbg_dump_budget_req(const struct ubifs_budget_req *req) void dbg_dump_lstats(const struct ubifs_lp_stats *lst) { spin_lock(&dbg_lock); - printk(KERN_DEBUG "Lprops statistics: empty_lebs %d, idx_lebs %d\n", - lst->empty_lebs, lst->idx_lebs); + printk(KERN_DEBUG "(pid %d) Lprops statistics: empty_lebs %d, " + "idx_lebs %d\n", current->pid, lst->empty_lebs, lst->idx_lebs); printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, " "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free, lst->total_dirty); @@ -587,8 +587,8 @@ void dbg_dump_budg(struct ubifs_info *c) struct ubifs_gced_idx_leb *idx_gc; spin_lock(&dbg_lock); - printk(KERN_DEBUG "Budgeting info: budg_data_growth %lld, " - "budg_dd_growth %lld, budg_idx_growth %lld\n", + printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, " + "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid, c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth); printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, " "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth, @@ -634,7 +634,7 @@ void dbg_dump_lprops(struct ubifs_info *c) struct ubifs_lprops lp; struct ubifs_lp_stats lst; - printk(KERN_DEBUG "Dumping LEB properties\n"); + printk(KERN_DEBUG "(pid %d) Dumping LEB properties\n", current->pid); ubifs_get_lp_stats(c, &lst); dbg_dump_lstats(&lst); @@ -655,7 +655,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum) if (dbg_failure_mode) return; - printk(KERN_DEBUG "Dumping LEB %d\n", lnum); + printk(KERN_DEBUG "(pid %d) Dumping LEB %d\n", current->pid, lnum); sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); if (IS_ERR(sleb)) { @@ -720,8 +720,8 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) { int i; - printk(KERN_DEBUG "Dumping heap cat %d (%d elements)\n", - cat, heap->cnt); + printk(KERN_DEBUG "(pid %d) Dumping heap cat %d (%d elements)\n", + current->pid, cat, heap->cnt); for (i = 0; i < heap->cnt; i++) { struct ubifs_lprops *lprops = heap->arr[i]; @@ -736,7 +736,7 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, { int i; - printk(KERN_DEBUG "Dumping pnode:\n"); + printk(KERN_DEBUG "(pid %d) Dumping pnode:\n", current->pid); printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", @@ -755,7 +755,7 @@ void dbg_dump_tnc(struct ubifs_info *c) int level; printk(KERN_DEBUG "\n"); - printk(KERN_DEBUG "Dumping the TNC tree\n"); + printk(KERN_DEBUG "(pid %d) Dumping the TNC tree\n", current->pid); znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); level = znode->level; printk(KERN_DEBUG "== Level %d ==\n", level); @@ -2208,16 +2208,17 @@ int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, int offset, int len, int dtype) { - int err; + int err, failing; if (in_failure_mode(desc)) return -EIO; - if (do_fail(desc, lnum, 1)) + failing = do_fail(desc, lnum, 1); + if (failing) cut_data(buf, len); err = ubi_leb_write(desc, lnum, buf, offset, len, dtype); if (err) return err; - if (in_failure_mode(desc)) + if (failing) return -EIO; return 0; } diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 3c4f1e9..50315fc 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -27,7 +27,7 @@ #define UBIFS_DBG(op) op -#define ubifs_assert(expr) do { \ +#define ubifs_assert(expr) do { \ if (unlikely(!(expr))) { \ printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ __func__, __LINE__, current->pid); \ @@ -73,50 +73,50 @@ const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key); /* - * DBGKEY macros require dbg_lock to be held, which it is in the dbg message + * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message * macros. */ #define DBGKEY(key) dbg_key_str0(c, (key)) #define DBGKEY1(key) dbg_key_str1(c, (key)) /* General messages */ -#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__) +#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__) /* Additional journal messages */ -#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__) +#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__) /* Additional TNC messages */ -#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__) +#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__) /* Additional lprops messages */ -#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__) +#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__) /* Additional LEB find messages */ -#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__) +#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__) /* Additional mount messages */ -#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__) +#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__) /* Additional I/O messages */ -#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__) +#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__) /* Additional commit messages */ -#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__) +#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__) /* Additional budgeting messages */ -#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__) +#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__) /* Additional log messages */ -#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__) +#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__) /* Additional gc messages */ -#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__) +#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__) /* Additional scan messages */ -#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__) +#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__) /* Additional recovery messages */ -#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) +#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) /* * Debugging message type flags (must match msg_type_names in debug.c). @@ -239,34 +239,23 @@ typedef int (*dbg_leaf_callback)(struct ubifs_info *c, struct ubifs_zbranch *zbr, void *priv); typedef int (*dbg_znode_callback)(struct ubifs_info *c, struct ubifs_znode *znode, void *priv); - int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, dbg_znode_callback znode_cb, void *priv); /* Checking functions */ int dbg_check_lprops(struct ubifs_info *c); - int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); - int dbg_check_cats(struct ubifs_info *c); - int dbg_check_ltab(struct ubifs_info *c); - int dbg_check_synced_i_size(struct inode *inode); - int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir); - int dbg_check_tnc(struct ubifs_info *c, int extra); - int dbg_check_idx_size(struct ubifs_info *c, long long idx_size); - int dbg_check_filesystem(struct ubifs_info *c); - void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, int add_pos); - int dbg_check_lprops(struct ubifs_info *c); int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, int row, int col); @@ -329,71 +318,77 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum, #else /* !CONFIG_UBIFS_FS_DEBUG */ #define UBIFS_DBG(op) -#define ubifs_assert(expr) ({}) -#define ubifs_assert_cmt_locked(c) + +/* Use "if (0)" to make compiler check arguments even if debugging is off */ +#define ubifs_assert(expr) do { \ + if (0 && (expr)) \ + printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ + __func__, __LINE__, current->pid); \ +} while (0) + +#define dbg_err(fmt, ...) do { \ + if (0) \ + ubifs_err(fmt, ##__VA_ARGS__); \ +} while (0) + +#define dbg_msg(fmt, ...) do { \ + if (0) \ + printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", \ + current->pid, __func__, ##__VA_ARGS__); \ +} while (0) + #define dbg_dump_stack() -#define dbg_err(fmt, ...) ({}) -#define dbg_msg(fmt, ...) ({}) -#define dbg_key(c, key, fmt, ...) ({}) - -#define dbg_gen(fmt, ...) ({}) -#define dbg_jnl(fmt, ...) ({}) -#define dbg_tnc(fmt, ...) ({}) -#define dbg_lp(fmt, ...) ({}) -#define dbg_find(fmt, ...) ({}) -#define dbg_mnt(fmt, ...) ({}) -#define dbg_io(fmt, ...) ({}) -#define dbg_cmt(fmt, ...) ({}) -#define dbg_budg(fmt, ...) ({}) -#define dbg_log(fmt, ...) ({}) -#define dbg_gc(fmt, ...) ({}) -#define dbg_scan(fmt, ...) ({}) -#define dbg_rcvry(fmt, ...) ({}) - -#define dbg_ntype(type) "" -#define dbg_cstate(cmt_state) "" -#define dbg_get_key_dump(c, key) ({}) -#define dbg_dump_inode(c, inode) ({}) -#define dbg_dump_node(c, node) ({}) -#define dbg_dump_budget_req(req) ({}) -#define dbg_dump_lstats(lst) ({}) -#define dbg_dump_budg(c) ({}) -#define dbg_dump_lprop(c, lp) ({}) -#define dbg_dump_lprops(c) ({}) -#define dbg_dump_leb(c, lnum) ({}) -#define dbg_dump_znode(c, znode) ({}) -#define dbg_dump_heap(c, heap, cat) ({}) -#define dbg_dump_pnode(c, pnode, parent, iip) ({}) -#define dbg_dump_tnc(c) ({}) -#define dbg_dump_index(c) ({}) +#define ubifs_assert_cmt_locked(c) -#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 +#define dbg_gen(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_jnl(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_tnc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_lp(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_find(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_mnt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_io(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_cmt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_budg(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_log(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_gc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_scan(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_rcvry(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) + +#define DBGKEY(key) ((char *)(key)) +#define DBGKEY1(key) ((char *)(key)) + +#define dbg_ntype(type) "" +#define dbg_cstate(cmt_state) "" +#define dbg_get_key_dump(c, key) ({}) +#define dbg_dump_inode(c, inode) ({}) +#define dbg_dump_node(c, node) ({}) +#define dbg_dump_budget_req(req) ({}) +#define dbg_dump_lstats(lst) ({}) +#define dbg_dump_budg(c) ({}) +#define dbg_dump_lprop(c, lp) ({}) +#define dbg_dump_lprops(c) ({}) +#define dbg_dump_leb(c, lnum) ({}) +#define dbg_dump_znode(c, znode) ({}) +#define dbg_dump_heap(c, heap, cat) ({}) +#define dbg_dump_pnode(c, pnode, parent, iip) ({}) +#define dbg_dump_tnc(c) ({}) +#define dbg_dump_index(c) ({}) +#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 #define dbg_old_index_check_init(c, zroot) 0 #define dbg_check_old_index(c, zroot) 0 - #define dbg_check_cats(c) 0 - #define dbg_check_ltab(c) 0 - #define dbg_check_synced_i_size(inode) 0 - #define dbg_check_dir_size(c, dir) 0 - #define dbg_check_tnc(c, x) 0 - #define dbg_check_idx_size(c, idx_size) 0 - #define dbg_check_filesystem(c) 0 - #define dbg_check_heap(c, heap, cat, add_pos) ({}) - #define dbg_check_lprops(c) 0 #define dbg_check_lpt_nodes(c, cnode, row, col) 0 - #define dbg_force_in_the_gaps_enabled 0 #define dbg_force_in_the_gaps() 0 - #define dbg_failure_mode 0 #define dbg_failure_mode_registration(c) ({}) #define dbg_failure_mode_deregistration(c) ({}) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index e90374b..526c01e 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -165,7 +165,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, } inode->i_ino = ++c->highest_inum; - inode->i_generation = ++c->vfs_gen; /* * The creation sequence number remains with this inode for its * lifetime. All nodes for this inode have a greater sequence number, @@ -220,15 +219,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name); if (err) { - /* - * Do not hash the direntry if parent 'i_nlink' is zero, because - * this has side-effects - '->delete_inode()' call will not be - * called for the parent orphan inode, because 'd_count' of its - * direntry will stay 1 (it'll be negative direntry I guess) - * and prevent 'iput_final()' until the dentry is destroyed due - * to unmount or memory pressure. - */ - if (err == -ENOENT && dir->i_nlink != 0) { + if (err == -ENOENT) { dbg_gen("not found"); goto done; } @@ -435,7 +426,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) while (1) { dbg_gen("feed '%s', ino %llu, new f_pos %#x", - dent->name, le64_to_cpu(dent->inum), + dent->name, (unsigned long long)le64_to_cpu(dent->inum), key_hash_flash(c, &dent->key)); ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum); @@ -525,7 +516,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, struct ubifs_inode *dir_ui = ubifs_inode(dir); int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2, - .dirtied_ino_d = ui->data_len }; + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; /* * Budget request settings: new direntry, changing the target inode, @@ -596,7 +587,6 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) if (err) { if (err != -ENOSPC) return err; - err = 0; budgeted = 0; } @@ -727,8 +717,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_info *c = dir->i_sb->s_fs_info; int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); - struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, - .dirtied_ino_d = 1 }; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; /* * Budget request settings: new inode, new direntry and changing parent @@ -789,7 +778,8 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, int sz_change = CALC_DENT_SIZE(dentry->d_name.len); int err, devlen = 0; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, - .new_ino_d = devlen, .dirtied_ino = 1 }; + .new_ino_d = ALIGN(devlen, 8), + .dirtied_ino = 1 }; /* * Budget request settings: new inode, new direntry and changing parent @@ -863,7 +853,8 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, int err, len = strlen(symname); int sz_change = CALC_DENT_SIZE(dentry->d_name.len); struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, - .new_ino_d = len, .dirtied_ino = 1 }; + .new_ino_d = ALIGN(len, 8), + .dirtied_ino = 1 }; /* * Budget request settings: new inode, new direntry and changing parent @@ -1012,7 +1003,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1, .dirtied_ino = 3 }; struct ubifs_budget_req ino_req = { .dirtied_ino = 1, - .dirtied_ino_d = old_inode_ui->data_len }; + .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; struct timespec time; /* diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 8565e58..3d698e2 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -793,7 +793,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, int err; struct ubifs_budget_req req; loff_t old_size = inode->i_size, new_size = attr->ia_size; - int offset = new_size & (UBIFS_BLOCK_SIZE - 1); + int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1; struct ubifs_inode *ui = ubifs_inode(inode); dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); @@ -811,8 +811,15 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, /* A funny way to budget for truncation node */ req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ; err = ubifs_budget_space(c, &req); - if (err) - return err; + if (err) { + /* + * Treat truncations to zero as deletion and always allow them, + * just like we do for '->unlink()'. + */ + if (new_size || err != -ENOSPC) + return err; + budgeted = 0; + } err = vmtruncate(inode, new_size); if (err) @@ -869,7 +876,12 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, err = ubifs_jnl_truncate(c, inode, old_size, new_size); mutex_unlock(&ui->ui_mutex); out_budg: - ubifs_release_budget(c, &req); + if (budgeted) + ubifs_release_budget(c, &req); + else { + c->nospace = c->nospace_rp = 0; + smp_wmb(); + } return err; } @@ -890,7 +902,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, loff_t new_size = attr->ia_size; struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_budget_req req = { .dirtied_ino = 1, - .dirtied_ino_d = ui->data_len }; + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; err = ubifs_budget_space(c, &req); if (err) @@ -941,7 +953,8 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = dentry->d_inode; struct ubifs_info *c = inode->i_sb->s_fs_info; - dbg_gen("ino %lu, ia_valid %#x", inode->i_ino, attr->ia_valid); + dbg_gen("ino %lu, mode %#x, ia_valid %#x", + inode->i_ino, inode->i_mode, attr->ia_valid); err = inode_change_ok(inode, attr); if (err) return err; @@ -1051,7 +1064,7 @@ static int update_mctime(struct ubifs_info *c, struct inode *inode) if (mctime_update_needed(inode, &now)) { int err, release; struct ubifs_budget_req req = { .dirtied_ino = 1, - .dirtied_ino_d = ui->data_len }; + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; err = ubifs_budget_space(c, &req); if (err) @@ -1270,6 +1283,7 @@ struct file_operations ubifs_file_operations = { .fsync = ubifs_fsync, .unlocked_ioctl = ubifs_ioctl, .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, #ifdef CONFIG_COMPAT .compat_ioctl = ubifs_compat_ioctl, #endif diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 10394c5..47814cd 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -211,14 +211,8 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, * dirty index heap, and it falls-back to LPT scanning if the heaps are empty * or do not have an LEB which satisfies the @min_space criteria. * - * Note: - * o LEBs which have less than dead watermark of dirty space are never picked - * by this function; - * - * Returns zero and the LEB properties of - * found dirty LEB in case of success, %-ENOSPC if no dirty LEB was found and a - * negative error code in case of other failures. The returned LEB is marked as - * "taken". + * Note, LEBs which have less than dead watermark of free + dirty space are + * never picked by this function. * * The additional @pick_free argument controls if this function has to return a * free or freeable LEB if one is present. For example, GC must to set it to %1, @@ -231,6 +225,10 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, * * In addition @pick_free is set to %2 by the recovery process in order to * recover gc_lnum in which case an index LEB must not be returned. + * + * This function returns zero and the LEB properties of found dirty LEB in case + * of success, %-ENOSPC if no dirty LEB was found and a negative error code in + * case of other failures. The returned LEB is marked as "taken". */ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, int min_space, int pick_free) @@ -245,7 +243,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, int lebs, rsvd_idx_lebs = 0; spin_lock(&c->space_lock); - lebs = c->lst.empty_lebs; + lebs = c->lst.empty_lebs + c->idx_gc_cnt; lebs += c->freeable_cnt - c->lst.taken_empty_lebs; /* @@ -290,9 +288,14 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, idx_lp = idx_heap->arr[0]; sum = idx_lp->free + idx_lp->dirty; /* - * Since we reserve twice as more space for the index than it + * Since we reserve thrice as much space for the index than it * actually takes, it does not make sense to pick indexing LEBs - * with less than half LEB of dirty space. + * with less than, say, half LEB of dirty space. May be half is + * not the optimal boundary - this should be tested and + * checked. This boundary should determine how much we use + * in-the-gaps to consolidate the index comparing to how much + * we use garbage collector to consolidate it. The "half" + * criteria just feels to be fine. */ if (sum < min_space || sum < c->half_leb_size) idx_lp = NULL; @@ -312,7 +315,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, lp = idx_lp; if (lp) { - ubifs_assert(lp->dirty >= c->dead_wm); + ubifs_assert(lp->free + lp->dirty >= c->dead_wm); goto found; } @@ -504,7 +507,6 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, rsvd_idx_lebs = 0; lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - c->lst.taken_empty_lebs; - ubifs_assert(lebs + c->lst.idx_lebs >= c->min_idx_lebs); if (rsvd_idx_lebs < lebs) /* * OK to allocate an empty LEB, but we still don't want to go diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index d0f3dac..02aba36 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -334,15 +334,21 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) err = move_nodes(c, sleb); if (err) - goto out; + goto out_inc_seq; err = gc_sync_wbufs(c); if (err) - goto out; + goto out_inc_seq; err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0); if (err) - goto out; + goto out_inc_seq; + + /* Allow for races with TNC */ + c->gced_lnum = lnum; + smp_wmb(); + c->gc_seq += 1; + smp_wmb(); if (c->gc_lnum == -1) { c->gc_lnum = lnum; @@ -363,6 +369,14 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) out: ubifs_scan_destroy(sleb); return err; + +out_inc_seq: + /* We may have moved at least some nodes so allow for races with TNC */ + c->gced_lnum = lnum; + smp_wmb(); + c->gc_seq += 1; + smp_wmb(); + goto out; } /** diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 3374f91..054363f 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -54,6 +54,20 @@ #include "ubifs.h" /** + * ubifs_ro_mode - switch UBIFS to read read-only mode. + * @c: UBIFS file-system description object + * @err: error code which is the reason of switching to R/O mode + */ +void ubifs_ro_mode(struct ubifs_info *c, int err) +{ + if (!c->ro_media) { + c->ro_media = 1; + ubifs_warn("switched to read-only mode, error %d", err); + dbg_dump_stack(); + } +} + +/** * ubifs_check_node - check node. * @c: UBIFS file-system description object * @buf: node to check diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 283155a..22993f8 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -447,13 +447,11 @@ static int get_dent_type(int mode) * @ino: buffer in which to pack inode node * @inode: inode to pack * @last: indicates the last node of the group - * @last_reference: non-zero if this is a deletion inode */ static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino, - const struct inode *inode, int last, - int last_reference) + const struct inode *inode, int last) { - int data_len = 0; + int data_len = 0, last_reference = !inode->i_nlink; struct ubifs_inode *ui = ubifs_inode(inode); ino->ch.node_type = UBIFS_INO_NODE; @@ -596,9 +594,9 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, ubifs_prep_grp_node(c, dent, dlen, 0); ino = (void *)dent + aligned_dlen; - pack_inode(c, ino, inode, 0, last_reference); + pack_inode(c, ino, inode, 0); ino = (void *)ino + aligned_ilen; - pack_inode(c, ino, dir, 1, 0); + pack_inode(c, ino, dir, 1); if (last_reference) { err = ubifs_add_orphan(c, inode->i_ino); @@ -606,6 +604,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, release_head(c, BASEHD); goto out_finish; } + ui->del_cmtno = c->cmt_no; } err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync); @@ -750,30 +749,25 @@ out_free: * ubifs_jnl_write_inode - flush inode to the journal. * @c: UBIFS file-system description object * @inode: inode to flush - * @deletion: inode has been deleted * * This function writes inode @inode to the journal. If the inode is * synchronous, it also synchronizes the write-buffer. Returns zero in case of * success and a negative error code in case of failure. */ -int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, - int deletion) +int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) { - int err, len, lnum, offs, sync = 0; + int err, lnum, offs; struct ubifs_ino_node *ino; struct ubifs_inode *ui = ubifs_inode(inode); + int sync = 0, len = UBIFS_INO_NODE_SZ, last_reference = !inode->i_nlink; - dbg_jnl("ino %lu%s", inode->i_ino, - deletion ? " (last reference)" : ""); - if (deletion) - ubifs_assert(inode->i_nlink == 0); + dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink); - len = UBIFS_INO_NODE_SZ; /* * If the inode is being deleted, do not write the attached data. No * need to synchronize the write-buffer either. */ - if (!deletion) { + if (!last_reference) { len += ui->data_len; sync = IS_SYNC(inode); } @@ -786,7 +780,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, if (err) goto out_free; - pack_inode(c, ino, inode, 1, deletion); + pack_inode(c, ino, inode, 1); err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); if (err) goto out_release; @@ -795,7 +789,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, inode->i_ino); release_head(c, BASEHD); - if (deletion) { + if (last_reference) { err = ubifs_tnc_remove_ino(c, inode->i_ino); if (err) goto out_ro; @@ -828,6 +822,65 @@ out_free: } /** + * ubifs_jnl_delete_inode - delete an inode. + * @c: UBIFS file-system description object + * @inode: inode to delete + * + * This function deletes inode @inode which includes removing it from orphans, + * deleting it from TNC and, in some cases, writing a deletion inode to the + * journal. + * + * When regular file inodes are unlinked or a directory inode is removed, the + * 'ubifs_jnl_update()' function writes a corresponding deletion inode and + * direntry to the media, and adds the inode to orphans. After this, when the + * last reference to this inode has been dropped, this function is called. In + * general, it has to write one more deletion inode to the media, because if + * a commit happened between 'ubifs_jnl_update()' and + * 'ubifs_jnl_delete_inode()', the deletion inode is not in the journal + * anymore, and in fact it might not be on the flash anymore, because it might + * have been garbage-collected already. And for optimization reasons UBIFS does + * not read the orphan area if it has been unmounted cleanly, so it would have + * no indication in the journal that there is a deleted inode which has to be + * removed from TNC. + * + * However, if there was no commit between 'ubifs_jnl_update()' and + * 'ubifs_jnl_delete_inode()', then there is no need to write the deletion + * inode to the media for the second time. And this is quite a typical case. + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode) +{ + int err; + struct ubifs_inode *ui = ubifs_inode(inode); + + ubifs_assert(inode->i_nlink == 0); + + if (ui->del_cmtno != c->cmt_no) + /* A commit happened for sure */ + return ubifs_jnl_write_inode(c, inode); + + down_read(&c->commit_sem); + /* + * Check commit number again, because the first test has been done + * without @c->commit_sem, so a commit might have happened. + */ + if (ui->del_cmtno != c->cmt_no) { + up_read(&c->commit_sem); + return ubifs_jnl_write_inode(c, inode); + } + + err = ubifs_tnc_remove_ino(c, inode->i_ino); + if (err) + ubifs_ro_mode(c, err); + else + ubifs_delete_orphan(c, inode->i_ino); + up_read(&c->commit_sem); + return err; +} + +/** * ubifs_jnl_rename - rename a directory entry. * @c: UBIFS file-system description object * @old_dir: parent inode of directory entry to rename @@ -917,16 +970,16 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, p = (void *)dent2 + aligned_dlen2; if (new_inode) { - pack_inode(c, p, new_inode, 0, last_reference); + pack_inode(c, p, new_inode, 0); p += ALIGN(ilen, 8); } if (!move) - pack_inode(c, p, old_dir, 1, 0); + pack_inode(c, p, old_dir, 1); else { - pack_inode(c, p, old_dir, 0, 0); + pack_inode(c, p, old_dir, 0); p += ALIGN(plen, 8); - pack_inode(c, p, new_dir, 1, 0); + pack_inode(c, p, new_dir, 1); } if (last_reference) { @@ -935,6 +988,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, release_head(c, BASEHD); goto out_finish; } + new_ui->del_cmtno = c->cmt_no; } err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync); @@ -1131,7 +1185,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, if (err) goto out_free; - pack_inode(c, ino, inode, 0, 0); + pack_inode(c, ino, inode, 0); ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1); if (dlen) ubifs_prep_grp_node(c, dn, dlen, 1); @@ -1251,9 +1305,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, ubifs_prep_grp_node(c, xent, xlen, 0); ino = (void *)xent + aligned_xlen; - pack_inode(c, ino, inode, 0, 1); + pack_inode(c, ino, inode, 0); ino = (void *)ino + UBIFS_INO_NODE_SZ; - pack_inode(c, ino, host, 1, 0); + pack_inode(c, ino, host, 1); err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync); if (!sync && !err) @@ -1320,7 +1374,7 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, const struct inode *host) { int err, len1, len2, aligned_len, aligned_len1, lnum, offs; - struct ubifs_inode *host_ui = ubifs_inode(inode); + struct ubifs_inode *host_ui = ubifs_inode(host); struct ubifs_ino_node *ino; union ubifs_key key; int sync = IS_DIRSYNC(host); @@ -1344,8 +1398,8 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, if (err) goto out_free; - pack_inode(c, ino, host, 0, 0); - pack_inode(c, (void *)ino + aligned_len1, inode, 1, 0); + pack_inode(c, ino, host, 0); + pack_inode(c, (void *)ino + aligned_len1, inode, 1); err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0); if (!sync && !err) { diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 36857b9..3e0aa73 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -317,6 +317,8 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) return 0; out_unlock: + if (err != -EAGAIN) + ubifs_ro_mode(c, err); mutex_unlock(&c->log_mutex); kfree(ref); kfree(bud); @@ -410,7 +412,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) return -ENOMEM; cs->ch.node_type = UBIFS_CS_NODE; - cs->cmt_no = cpu_to_le64(c->cmt_no + 1); + cs->cmt_no = cpu_to_le64(c->cmt_no); ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0); /* diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index 4beccfc..4c12a92 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -80,20 +80,6 @@ static inline struct ubifs_inode *ubifs_inode(const struct inode *inode) } /** - * ubifs_ro_mode - switch UBIFS to read read-only mode. - * @c: UBIFS file-system description object - * @err: error code which is the reason of switching to R/O mode - */ -static inline void ubifs_ro_mode(struct ubifs_info *c, int err) -{ - if (!c->ro_media) { - c->ro_media = 1; - ubifs_warn("switched to read-only mode, error %d", err); - dbg_dump_stack(); - } -} - -/** * ubifs_compr_present - check if compressor was compiled in. * @compr_type: compressor type to check * @@ -298,38 +284,6 @@ static inline void *ubifs_idx_key(const struct ubifs_info *c, } /** - * ubifs_reported_space - calculate reported free space. - * @c: the UBIFS file-system description object - * @free: amount of free space - * - * This function calculates amount of free space which will be reported to - * user-space. User-space application tend to expect that if the file-system - * (e.g., via the 'statfs()' call) reports that it has N bytes available, they - * are able to write a file of size N. UBIFS attaches node headers to each data - * node and it has to write indexind nodes as well. This introduces additional - * overhead, and UBIFS it has to report sligtly less free space to meet the - * above expectetion. - * - * This function assumes free space is made up of uncompressed data nodes and - * full index nodes (one per data node, doubled because we always allow enough - * space to write the index twice). - * - * Note, the calculation is pessimistic, which means that most of the time - * UBIFS reports less space than it actually has. - */ -static inline long long ubifs_reported_space(const struct ubifs_info *c, - uint64_t free) -{ - int divisor, factor; - - divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz << 1); - factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ; - do_div(free, divisor); - - return free * factor; -} - -/** * ubifs_current_time - round current time to time granularity. * @inode: inode */ @@ -339,4 +293,21 @@ static inline struct timespec ubifs_current_time(struct inode *inode) current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; } +/** + * ubifs_tnc_lookup - look up a file-system node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * + * This function look up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. + */ +static inline int ubifs_tnc_lookup(struct ubifs_info *c, + const union ubifs_key *key, void *node) +{ + return ubifs_tnc_locate(c, key, node, NULL, NULL); +} + #endif /* __UBIFS_MISC_H__ */ diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 3afeb92..02d3462 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -310,10 +310,10 @@ static int write_orph_node(struct ubifs_info *c, int atomic) c->cmt_orphans -= cnt; spin_unlock(&c->orphan_lock); if (c->cmt_orphans) - orph->cmt_no = cpu_to_le64(c->cmt_no + 1); + orph->cmt_no = cpu_to_le64(c->cmt_no); else /* Mark the last node of the commit */ - orph->cmt_no = cpu_to_le64((c->cmt_no + 1) | (1ULL << 63)); + orph->cmt_no = cpu_to_le64((c->cmt_no) | (1ULL << 63)); ubifs_assert(c->ohead_offs + len <= c->leb_size); ubifs_assert(c->ohead_lnum >= c->orph_first); ubifs_assert(c->ohead_lnum <= c->orph_last); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index ca1e2d4..3f49020 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -30,7 +30,6 @@ #include <linux/slab.h> #include <linux/module.h> #include <linux/ctype.h> -#include <linux/random.h> #include <linux/kthread.h> #include <linux/parser.h> #include <linux/seq_file.h> @@ -149,7 +148,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) if (err) goto out_invalid; - /* Disable readahead */ + /* Disable read-ahead */ inode->i_mapping->backing_dev_info = &c->bdi; switch (inode->i_mode & S_IFMT) { @@ -278,7 +277,7 @@ static void ubifs_destroy_inode(struct inode *inode) */ static int ubifs_write_inode(struct inode *inode, int wait) { - int err; + int err = 0; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); @@ -299,10 +298,18 @@ static int ubifs_write_inode(struct inode *inode, int wait) return 0; } - dbg_gen("inode %lu", inode->i_ino); - err = ubifs_jnl_write_inode(c, inode, 0); - if (err) - ubifs_err("can't write inode %lu, error %d", inode->i_ino, err); + /* + * As an optimization, do not write orphan inodes to the media just + * because this is not needed. + */ + dbg_gen("inode %lu, mode %#x, nlink %u", + inode->i_ino, (int)inode->i_mode, inode->i_nlink); + if (inode->i_nlink) { + err = ubifs_jnl_write_inode(c, inode); + if (err) + ubifs_err("can't write inode %lu, error %d", + inode->i_ino, err); + } ui->dirty = 0; mutex_unlock(&ui->ui_mutex); @@ -314,8 +321,9 @@ static void ubifs_delete_inode(struct inode *inode) { int err; struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); - if (ubifs_inode(inode)->xattr) + if (ui->xattr) /* * Extended attribute inode deletions are fully handled in * 'ubifs_removexattr()'. These inodes are special and have @@ -323,7 +331,7 @@ static void ubifs_delete_inode(struct inode *inode) */ goto out; - dbg_gen("inode %lu", inode->i_ino); + dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); ubifs_assert(!atomic_read(&inode->i_count)); ubifs_assert(inode->i_nlink == 0); @@ -331,15 +339,19 @@ static void ubifs_delete_inode(struct inode *inode) if (is_bad_inode(inode)) goto out; - ubifs_inode(inode)->ui_size = inode->i_size = 0; - err = ubifs_jnl_write_inode(c, inode, 1); + ui->ui_size = inode->i_size = 0; + err = ubifs_jnl_delete_inode(c, inode); if (err) /* * Worst case we have a lost orphan inode wasting space, so a - * simple error message is ok here. + * simple error message is OK here. */ - ubifs_err("can't write inode %lu, error %d", inode->i_ino, err); + ubifs_err("can't delete inode %lu, error %d", + inode->i_ino, err); + out: + if (ui->dirty) + ubifs_release_dirty_inode_budget(c, ui); clear_inode(inode); } @@ -358,8 +370,9 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct ubifs_info *c = dentry->d_sb->s_fs_info; unsigned long long free; + __le32 *uuid = (__le32 *)c->uuid; - free = ubifs_budg_get_free_space(c); + free = ubifs_get_free_space(c); dbg_gen("free space %lld bytes (%lld blocks)", free, free >> UBIFS_BLOCK_SHIFT); @@ -374,7 +387,8 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = 0; buf->f_ffree = 0; buf->f_namelen = UBIFS_MAX_NLEN; - + buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]); + buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]); return 0; } @@ -518,6 +532,12 @@ static int init_constants_early(struct ubifs_info *c) c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); + /* + * Calculate how many bytes would be wasted at the end of LEB if it was + * fully filled with data nodes of maximum size. This is used in + * calculations when reporting free space. + */ + c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ; return 0; } @@ -635,13 +655,11 @@ static int init_constants_late(struct ubifs_info *c) * internally because it does not make much sense for UBIFS, but it is * necessary to report something for the 'statfs()' call. * - * Subtract the LEB reserved for GC and the LEB which is reserved for - * deletions. - * - * Review 'ubifs_calc_available()' if changing this calculation. + * Subtract the LEB reserved for GC, the LEB which is reserved for + * deletions, and assume only one journal head is available. */ - tmp64 = c->main_lebs - 2; - tmp64 *= (uint64_t)c->leb_size - c->dark_wm; + tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1; + tmp64 *= (uint64_t)c->leb_size - c->leb_overhead; tmp64 = ubifs_reported_space(c, tmp64); c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; @@ -1006,14 +1024,13 @@ static int mount_ubifs(struct ubifs_info *c) goto out_dereg; } + sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); if (!mounted_read_only) { err = alloc_wbufs(c); if (err) goto out_cbuf; /* Create background thread */ - sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, - c->vi.vol_id); c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); if (!c->bgt) c->bgt = ERR_PTR(-EINVAL); @@ -1122,8 +1139,8 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_infos; - ubifs_msg("mounted UBI device %d, volume %d", c->vi.ubi_num, - c->vi.vol_id); + ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", + c->vi.ubi_num, c->vi.vol_id, c->vi.name); if (mounted_read_only) ubifs_msg("mounted read-only"); x = (long long)c->main_lebs * c->leb_size; @@ -1469,6 +1486,7 @@ static void ubifs_put_super(struct super_block *sb) */ ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0); ubifs_assert(c->budg_idx_growth == 0); + ubifs_assert(c->budg_dd_growth == 0); ubifs_assert(c->budg_data_growth == 0); /* @@ -1657,7 +1675,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&c->orph_new); c->highest_inum = UBIFS_FIRST_INO; - get_random_bytes(&c->vfs_gen, sizeof(int)); c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; ubi_get_volume_info(ubi, &c->vi); @@ -1671,10 +1688,10 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) } /* - * UBIFS provids 'backing_dev_info' in order to disable readahead. For + * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For * UBIFS, I/O is not deferred, it is done immediately in readpage, * which means the user would have to wait not just for their own I/O - * but the readahead I/O as well i.e. completely pointless. + * but the read-ahead I/O as well i.e. completely pointless. * * Read-ahead will be disabled because @c->bdi.ra_pages is 0. */ diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index e909f4a..7634c59 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -506,7 +506,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, if (keys_cmp(c, key, &node_key) != 0) ret = 0; } - if (ret == 0) + if (ret == 0 && c->replaying) dbg_mnt("dangling branch LEB %d:%d len %d, key %s", zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); return ret; @@ -1382,50 +1382,39 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key, } /** - * ubifs_tnc_lookup - look up a file-system node. + * maybe_leb_gced - determine if a LEB may have been garbage collected. * @c: UBIFS file-system description object - * @key: node key to lookup - * @node: the node is returned here + * @lnum: LEB number + * @gc_seq1: garbage collection sequence number * - * This function look up and reads node with key @key. The caller has to make - * sure the @node buffer is large enough to fit the node. Returns zero in case - * of success, %-ENOENT if the node was not found, and a negative error code in - * case of failure. + * This function determines if @lnum may have been garbage collected since + * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise + * %0 is returned. */ -int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, - void *node) +static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1) { - int found, n, err; - struct ubifs_znode *znode; - struct ubifs_zbranch zbr, *zt; + int gc_seq2, gced_lnum; - mutex_lock(&c->tnc_mutex); - found = ubifs_lookup_level0(c, key, &znode, &n); - if (!found) { - err = -ENOENT; - goto out; - } else if (found < 0) { - err = found; - goto out; - } - zt = &znode->zbranch[n]; - if (is_hash_key(c, key)) { - /* - * In this case the leaf node cache gets used, so we pass the - * address of the zbranch and keep the mutex locked - */ - err = tnc_read_node_nm(c, zt, node); - goto out; - } - zbr = znode->zbranch[n]; - mutex_unlock(&c->tnc_mutex); - - err = ubifs_tnc_read_node(c, &zbr, node); - return err; - -out: - mutex_unlock(&c->tnc_mutex); - return err; + gced_lnum = c->gced_lnum; + smp_rmb(); + gc_seq2 = c->gc_seq; + /* Same seq means no GC */ + if (gc_seq1 == gc_seq2) + return 0; + /* Different by more than 1 means we don't know */ + if (gc_seq1 + 1 != gc_seq2) + return 1; + /* + * We have seen the sequence number has increased by 1. Now we need to + * be sure we read the right LEB number, so read it again. + */ + smp_rmb(); + if (gced_lnum != c->gced_lnum) + return 1; + /* Finally we can check lnum */ + if (gced_lnum == lnum) + return 1; + return 0; } /** @@ -1436,16 +1425,19 @@ out: * @lnum: LEB number is returned here * @offs: offset is returned here * - * This function is the same as 'ubifs_tnc_lookup()' but it returns the node - * location also. See 'ubifs_tnc_lookup()'. + * This function look up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. The node location can be returned in @lnum and @offs. */ int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, void *node, int *lnum, int *offs) { - int found, n, err; + int found, n, err, safely = 0, gc_seq1; struct ubifs_znode *znode; struct ubifs_zbranch zbr, *zt; +again: mutex_lock(&c->tnc_mutex); found = ubifs_lookup_level0(c, key, &znode, &n); if (!found) { @@ -1456,24 +1448,43 @@ int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, goto out; } zt = &znode->zbranch[n]; + if (lnum) { + *lnum = zt->lnum; + *offs = zt->offs; + } if (is_hash_key(c, key)) { /* * In this case the leaf node cache gets used, so we pass the * address of the zbranch and keep the mutex locked */ - *lnum = zt->lnum; - *offs = zt->offs; err = tnc_read_node_nm(c, zt, node); goto out; } + if (safely) { + err = ubifs_tnc_read_node(c, zt, node); + goto out; + } + /* Drop the TNC mutex prematurely and race with garbage collection */ zbr = znode->zbranch[n]; + gc_seq1 = c->gc_seq; mutex_unlock(&c->tnc_mutex); - *lnum = zbr.lnum; - *offs = zbr.offs; + if (ubifs_get_wbuf(c, zbr.lnum)) { + /* We do not GC journal heads */ + err = ubifs_tnc_read_node(c, &zbr, node); + return err; + } - err = ubifs_tnc_read_node(c, &zbr, node); - return err; + err = fallible_read_node(c, key, &zbr, node); + if (err <= 0 || maybe_leb_gced(c, zbr.lnum, gc_seq1)) { + /* + * The node may have been GC'ed out from under us so try again + * while keeping the TNC mutex locked. + */ + safely = 1; + goto again; + } + return 0; out: mutex_unlock(&c->tnc_mutex); @@ -1498,7 +1509,6 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, { int found, n, err; struct ubifs_znode *znode; - struct ubifs_zbranch zbr; dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); mutex_lock(&c->tnc_mutex); @@ -1522,11 +1532,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, goto out_unlock; } - zbr = znode->zbranch[n]; - mutex_unlock(&c->tnc_mutex); - - err = tnc_read_node_nm(c, &zbr, node); - return err; + err = tnc_read_node_nm(c, &znode->zbranch[n], node); out_unlock: mutex_unlock(&c->tnc_mutex); diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 8117e65..8ac76b1 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -372,26 +372,25 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) written = layout_leb_in_gaps(c, p); if (written < 0) { err = written; - if (err == -ENOSPC) { - if (!dbg_force_in_the_gaps_enabled) { - /* - * Do not print scary warnings if the - * debugging option which forces - * in-the-gaps is enabled. - */ - ubifs_err("out of space"); - spin_lock(&c->space_lock); - dbg_dump_budg(c); - spin_unlock(&c->space_lock); - dbg_dump_lprops(c); - } - /* Try to commit anyway */ - err = 0; - break; + if (err != -ENOSPC) { + kfree(c->gap_lebs); + c->gap_lebs = NULL; + return err; } - kfree(c->gap_lebs); - c->gap_lebs = NULL; - return err; + if (!dbg_force_in_the_gaps_enabled) { + /* + * Do not print scary warnings if the debugging + * option which forces in-the-gaps is enabled. + */ + ubifs_err("out of space"); + spin_lock(&c->space_lock); + dbg_dump_budg(c); + spin_unlock(&c->space_lock); + dbg_dump_lprops(c); + } + /* Try to commit anyway */ + err = 0; + break; } p++; cnt -= written; diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index 0cc7da9..a9ecbd9 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -87,7 +87,7 @@ #define UBIFS_SK_LEN 8 /* Minimum index tree fanout */ -#define UBIFS_MIN_FANOUT 2 +#define UBIFS_MIN_FANOUT 3 /* Maximum number of levels in UBIFS indexing B-tree */ #define UBIFS_MAX_LEVELS 512 @@ -228,10 +228,10 @@ enum { /* Minimum number of orphan area logical eraseblocks */ #define UBIFS_MIN_ORPH_LEBS 1 /* - * Minimum number of main area logical eraseblocks (buds, 2 for the index, 1 + * Minimum number of main area logical eraseblocks (buds, 3 for the index, 1 * for GC, 1 for deletions, and at least 1 for committed data). */ -#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 5) +#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 6) /* Minimum number of logical eraseblocks */ #define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \ diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index e4f89f2..17c620b 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -20,8 +20,6 @@ * Adrian Hunter */ -/* Implementation version 0.7 */ - #ifndef __UBIFS_H__ #define __UBIFS_H__ @@ -322,6 +320,8 @@ struct ubifs_gced_idx_leb { * struct ubifs_inode - UBIFS in-memory inode description. * @vfs_inode: VFS inode description object * @creat_sqnum: sequence number at time of creation + * @del_cmtno: commit number corresponding to the time the inode was deleted, + * protected by @c->commit_sem; * @xattr_size: summarized size of all extended attributes in bytes * @xattr_cnt: count of extended attributes this inode has * @xattr_names: sum of lengths of all extended attribute names belonging to @@ -373,6 +373,7 @@ struct ubifs_gced_idx_leb { struct ubifs_inode { struct inode vfs_inode; unsigned long long creat_sqnum; + unsigned long long del_cmtno; unsigned int xattr_size; unsigned int xattr_cnt; unsigned int xattr_names; @@ -779,7 +780,7 @@ struct ubifs_compressor { /** * struct ubifs_budget_req - budget requirements of an operation. * - * @fast: non-zero if the budgeting should try to aquire budget quickly and + * @fast: non-zero if the budgeting should try to acquire budget quickly and * should not try to call write-back * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields * have to be re-calculated @@ -805,21 +806,31 @@ struct ubifs_compressor { * An inode may contain 4KiB of data at max., thus the widths of @new_ino_d * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made * dirty by the re-name operation. + * + * Note, UBIFS aligns node lengths to 8-bytes boundary, so the requester has to + * make sure the amount of inode data which contribute to @new_ino_d and + * @dirtied_ino_d fields are aligned. */ struct ubifs_budget_req { unsigned int fast:1; unsigned int recalculate:1; +#ifndef UBIFS_DEBUG unsigned int new_page:1; unsigned int dirtied_page:1; unsigned int new_dent:1; unsigned int mod_dent:1; unsigned int new_ino:1; unsigned int new_ino_d:13; -#ifndef UBIFS_DEBUG unsigned int dirtied_ino:4; unsigned int dirtied_ino_d:15; #else /* Not bit-fields to check for overflows */ + unsigned int new_page; + unsigned int dirtied_page; + unsigned int new_dent; + unsigned int mod_dent; + unsigned int new_ino; + unsigned int new_ino_d; unsigned int dirtied_ino; unsigned int dirtied_ino_d; #endif @@ -860,13 +871,13 @@ struct ubifs_mount_opts { * struct ubifs_info - UBIFS file-system description data structure * (per-superblock). * @vfs_sb: VFS @struct super_block object - * @bdi: backing device info object to make VFS happy and disable readahead + * @bdi: backing device info object to make VFS happy and disable read-ahead * * @highest_inum: highest used inode number - * @vfs_gen: VFS inode generation counter * @max_sqnum: current global sequence number - * @cmt_no: commit number (last successfully completed commit) - * @cnt_lock: protects @highest_inum, @vfs_gen, and @max_sqnum counters + * @cmt_no: commit number of the last successfully completed commit, protected + * by @commit_sem + * @cnt_lock: protects @highest_inum and @max_sqnum counters * @fmt_version: UBIFS on-flash format version * @uuid: UUID from super block * @@ -984,6 +995,9 @@ struct ubifs_mount_opts { * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary * @max_inode_sz: maximum possible inode size in bytes * @max_znode_sz: size of znode in bytes + * + * @leb_overhead: how many bytes are wasted in an LEB when it is filled with + * data nodes of maximum size - used in free space reporting * @dead_wm: LEB dead space watermark * @dark_wm: LEB dark space watermark * @block_cnt: count of 4KiB blocks on the FS @@ -1017,6 +1031,8 @@ struct ubifs_mount_opts { * @sbuf: a buffer of LEB size used by GC and replay for scanning * @idx_gc: list of index LEBs that have been garbage collected * @idx_gc_cnt: number of elements on the idx_gc list + * @gc_seq: incremented for every non-index LEB garbage collected + * @gced_lnum: last non-index LEB that was garbage collected * * @infos_list: links all 'ubifs_info' objects * @umount_mutex: serializes shrinker and un-mount @@ -1103,7 +1119,6 @@ struct ubifs_info { struct backing_dev_info bdi; ino_t highest_inum; - unsigned int vfs_gen; unsigned long long max_sqnum; unsigned long long cmt_no; spinlock_t cnt_lock; @@ -1214,6 +1229,8 @@ struct ubifs_info { int max_idx_node_sz; long long max_inode_sz; int max_znode_sz; + + int leb_overhead; int dead_wm; int dark_wm; int block_cnt; @@ -1247,6 +1264,8 @@ struct ubifs_info { void *sbuf; struct list_head idx_gc; int idx_gc_cnt; + volatile int gc_seq; + volatile int gced_lnum; struct list_head infos_list; struct mutex umount_mutex; @@ -1346,6 +1365,7 @@ extern struct backing_dev_info ubifs_backing_dev_info; extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; /* io.c */ +void ubifs_ro_mode(struct ubifs_info *c, int err); int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len); int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, int dtype); @@ -1399,8 +1419,8 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, int deletion, int xent); int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, const union ubifs_key *key, const void *buf, int len); -int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, - int last_reference); +int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); +int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode); int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, const struct dentry *old_dentry, const struct inode *new_dir, @@ -1423,9 +1443,10 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode, struct ubifs_budget_req *req); void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, struct ubifs_budget_req *req); -long long ubifs_budg_get_free_space(struct ubifs_info *c); +long long ubifs_get_free_space(struct ubifs_info *c); int ubifs_calc_min_idx_lebs(struct ubifs_info *c); void ubifs_convert_page_budget(struct ubifs_info *c); +long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free); long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); /* find.c */ @@ -1440,8 +1461,6 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c); /* tnc.c */ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode **zn, int *n); -int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, - void *node); int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, void *node, const struct qstr *nm); int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 1388a07..649bec7 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -61,7 +61,7 @@ /* * Limit the number of extended attributes per inode so that the total size - * (xattr_size) is guaranteeded to fit in an 'unsigned int'. + * (@xattr_size) is guaranteeded to fit in an 'unsigned int'. */ #define MAX_XATTRS_PER_INODE 65535 @@ -103,14 +103,14 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, struct inode *inode; struct ubifs_inode *ui, *host_ui = ubifs_inode(host); struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, - .new_ino_d = size, .dirtied_ino = 1, - .dirtied_ino_d = host_ui->data_len}; + .new_ino_d = ALIGN(size, 8), .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(host_ui->data_len, 8) }; if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) return -ENOSPC; /* * Linux limits the maximum size of the extended attribute names list - * to %XATTR_LIST_MAX. This means we should not allow creating more* + * to %XATTR_LIST_MAX. This means we should not allow creating more * extended attributes if the name list becomes larger. This limitation * is artificial for UBIFS, though. */ @@ -128,7 +128,6 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, goto out_budg; } - mutex_lock(&host_ui->ui_mutex); /* Re-define all operations to be "nothing" */ inode->i_mapping->a_ops = &none_address_operations; inode->i_op = &none_inode_operations; @@ -141,23 +140,19 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, ui->data = kmalloc(size, GFP_NOFS); if (!ui->data) { err = -ENOMEM; - goto out_unlock; + goto out_free; } - memcpy(ui->data, value, size); + inode->i_size = ui->ui_size = size; + ui->data_len = size; + + mutex_lock(&host_ui->ui_mutex); host->i_ctime = ubifs_current_time(host); host_ui->xattr_cnt += 1; host_ui->xattr_size += CALC_DENT_SIZE(nm->len); host_ui->xattr_size += CALC_XATTR_BYTES(size); host_ui->xattr_names += nm->len; - /* - * We do not use i_size_write() because nobody can race with us as we - * are holding host @host->i_mutex - every xattr operation for this - * inode is serialized by it. - */ - inode->i_size = ui->ui_size = size; - ui->data_len = size; err = ubifs_jnl_update(c, host, nm, inode, 0, 1); if (err) goto out_cancel; @@ -172,8 +167,8 @@ out_cancel: host_ui->xattr_cnt -= 1; host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); host_ui->xattr_size -= CALC_XATTR_BYTES(size); -out_unlock: mutex_unlock(&host_ui->ui_mutex); +out_free: make_bad_inode(inode); iput(inode); out_budg: @@ -200,29 +195,28 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, struct ubifs_inode *host_ui = ubifs_inode(host); struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_budget_req req = { .dirtied_ino = 2, - .dirtied_ino_d = size + host_ui->data_len }; + .dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) }; ubifs_assert(ui->data_len == inode->i_size); err = ubifs_budget_space(c, &req); if (err) return err; - mutex_lock(&host_ui->ui_mutex); - host->i_ctime = ubifs_current_time(host); - host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); - host_ui->xattr_size += CALC_XATTR_BYTES(size); - kfree(ui->data); ui->data = kmalloc(size, GFP_NOFS); if (!ui->data) { err = -ENOMEM; - goto out_unlock; + goto out_free; } - memcpy(ui->data, value, size); inode->i_size = ui->ui_size = size; ui->data_len = size; + mutex_lock(&host_ui->ui_mutex); + host->i_ctime = ubifs_current_time(host); + host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); + host_ui->xattr_size += CALC_XATTR_BYTES(size); + /* * It is important to write the host inode after the xattr inode * because if the host inode gets synchronized (via 'fsync()'), then @@ -240,9 +234,9 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, out_cancel: host_ui->xattr_size -= CALC_XATTR_BYTES(size); host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); - make_bad_inode(inode); -out_unlock: mutex_unlock(&host_ui->ui_mutex); + make_bad_inode(inode); +out_free: ubifs_release_budget(c, &req); return err; } @@ -312,6 +306,7 @@ int ubifs_setxattr(struct dentry *dentry, const char *name, dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name, host->i_ino, dentry->d_name.len, dentry->d_name.name, size); + ubifs_assert(mutex_is_locked(&host->i_mutex)); if (size > UBIFS_MAX_INO_DATA) return -ERANGE; @@ -384,7 +379,6 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, if (!xent) return -ENOMEM; - mutex_lock(&host->i_mutex); xent_key_init(c, &key, host->i_ino, &nm); err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); if (err) { @@ -419,7 +413,6 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, out_iput: iput(inode); out_unlock: - mutex_unlock(&host->i_mutex); kfree(xent); return err; } @@ -449,8 +442,6 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) return -ERANGE; lowest_xent_key(c, &key, host->i_ino); - - mutex_lock(&host->i_mutex); while (1) { int type; @@ -479,7 +470,6 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) pxent = xent; key_read(c, &xent->key, &key); } - mutex_unlock(&host->i_mutex); kfree(pxent); if (err != -ENOENT) { @@ -497,8 +487,8 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host, int err; struct ubifs_inode *host_ui = ubifs_inode(host); struct ubifs_inode *ui = ubifs_inode(inode); - struct ubifs_budget_req req = { .dirtied_ino = 1, .mod_dent = 1, - .dirtied_ino_d = host_ui->data_len }; + struct ubifs_budget_req req = { .dirtied_ino = 2, .mod_dent = 1, + .dirtied_ino_d = ALIGN(host_ui->data_len, 8) }; ubifs_assert(ui->data_len == inode->i_size); diff --git a/fs/udf/file.c b/fs/udf/file.c index 0ed6e14..eb91f3b 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -211,6 +211,7 @@ const struct file_operations udf_file_operations = { .release = udf_release_file, .fsync = udf_fsync_file, .splice_read = generic_file_splice_read, + .llseek = generic_file_llseek, }; const struct inode_operations udf_file_inode_operations = { diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index eb9cfa2..a4f2b3c 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -76,11 +76,24 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) *err = -ENOSPC; iinfo = UDF_I(inode); - iinfo->i_unique = 0; - iinfo->i_lenExtents = 0; - iinfo->i_next_alloc_block = 0; - iinfo->i_next_alloc_goal = 0; - iinfo->i_strat4096 = 0; + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) { + iinfo->i_efe = 1; + if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev) + sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE; + iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - + sizeof(struct extendedFileEntry), + GFP_KERNEL); + } else { + iinfo->i_efe = 0; + iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - + sizeof(struct fileEntry), + GFP_KERNEL); + } + if (!iinfo->i_ext.i_data) { + iput(inode); + *err = -ENOMEM; + return NULL; + } block = udf_new_block(dir->i_sb, NULL, dinfo->i_location.partitionReferenceNum, @@ -111,6 +124,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) lvhd->uniqueID = cpu_to_le64(uniqueID); mark_buffer_dirty(sbi->s_lvid_bh); } + mutex_unlock(&sbi->s_alloc_mutex); inode->i_mode = mode; inode->i_uid = current->fsuid; if (dir->i_mode & S_ISGID) { @@ -129,25 +143,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) iinfo->i_lenEAttr = 0; iinfo->i_lenAlloc = 0; iinfo->i_use = 0; - if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) { - iinfo->i_efe = 1; - if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev) - sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE; - iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - - sizeof(struct extendedFileEntry), - GFP_KERNEL); - } else { - iinfo->i_efe = 0; - iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - - sizeof(struct fileEntry), - GFP_KERNEL); - } - if (!iinfo->i_ext.i_data) { - iput(inode); - *err = -ENOMEM; - mutex_unlock(&sbi->s_alloc_mutex); - return NULL; - } if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) @@ -158,7 +153,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) iinfo->i_crtime = current_fs_time(inode->i_sb); insert_inode_hash(inode); mark_inode_dirty(inode); - mutex_unlock(&sbi->s_alloc_mutex); if (DQUOT_ALLOC_INODE(inode)) { DQUOT_DROP(inode); diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index f42f80a..a44d68e 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1338,6 +1338,10 @@ __xfs_get_blocks( offset = (xfs_off_t)iblock << inode->i_blkbits; ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); size = bh_result->b_size; + + if (!create && direct && offset >= i_size_read(inode)) + return 0; + error = xfs_iomap(XFS_I(inode), offset, size, create ? flags : BMAPI_READ, &iomap, &niomap); if (error) diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 5f60363..5311c1a 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -475,6 +475,7 @@ const struct file_operations xfs_invis_file_operations = { const struct file_operations xfs_dir_file_operations = { .read = generic_read_dir, .readdir = xfs_file_readdir, + .llseek = generic_file_llseek, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 91bcd979..095d271 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -355,7 +355,7 @@ xfs_vn_ci_lookup( /* else case-insensitive match... */ dname.name = ci_name.name; dname.len = ci_name.len; - dentry = d_add_ci(VFS_I(ip), dentry, &dname); + dentry = d_add_ci(dentry, VFS_I(ip), &dname); kmem_free(ci_name.name); return dentry; } diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 73c65f1..18d3c84 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1302,9 +1302,29 @@ xfs_fs_remount( mp->m_flags &= ~XFS_MOUNT_BARRIER; break; default: + /* + * Logically we would return an error here to prevent + * users from believing they might have changed + * mount options using remount which can't be changed. + * + * But unfortunately mount(8) adds all options from + * mtab and fstab to the mount arguments in some cases + * so we can't blindly reject options, but have to + * check for each specified option if it actually + * differs from the currently set option and only + * reject it if that's the case. + * + * Until that is implemented we return success for + * every remount request, and silently ignore all + * options that we can't actually change. + */ +#if 0 printk(KERN_INFO "XFS: mount option \"%s\" not supported for remount\n", p); return -EINVAL; +#else + return 0; +#endif } } diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 608c30c..002fc26 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -732,6 +732,7 @@ xfs_buf_item_init( bip->bli_item.li_ops = &xfs_buf_item_ops; bip->bli_item.li_mountp = mp; bip->bli_buf = bp; + xfs_buf_hold(bp); bip->bli_format.blf_type = XFS_LI_BUF; bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); @@ -867,6 +868,21 @@ xfs_buf_item_dirty( return (bip->bli_flags & XFS_BLI_DIRTY); } +STATIC void +xfs_buf_item_free( + xfs_buf_log_item_t *bip) +{ +#ifdef XFS_TRANS_DEBUG + kmem_free(bip->bli_orig); + kmem_free(bip->bli_logged); +#endif /* XFS_TRANS_DEBUG */ + +#ifdef XFS_BLI_TRACE + ktrace_free(bip->bli_trace); +#endif + kmem_zone_free(xfs_buf_item_zone, bip); +} + /* * This is called when the buf log item is no longer needed. It should * free the buf log item associated with the given buffer and clear @@ -887,18 +903,8 @@ xfs_buf_item_relse( (XFS_BUF_IODONE_FUNC(bp) != NULL)) { XFS_BUF_CLR_IODONE_FUNC(bp); } - -#ifdef XFS_TRANS_DEBUG - kmem_free(bip->bli_orig); - bip->bli_orig = NULL; - kmem_free(bip->bli_logged); - bip->bli_logged = NULL; -#endif /* XFS_TRANS_DEBUG */ - -#ifdef XFS_BLI_TRACE - ktrace_free(bip->bli_trace); -#endif - kmem_zone_free(xfs_buf_item_zone, bip); + xfs_buf_rele(bp); + xfs_buf_item_free(bip); } @@ -1120,6 +1126,7 @@ xfs_buf_iodone( ASSERT(bip->bli_buf == bp); + xfs_buf_rele(bp); mp = bip->bli_item.li_mountp; /* @@ -1136,18 +1143,7 @@ xfs_buf_iodone( * xfs_trans_delete_ail() drops the AIL lock. */ xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); - -#ifdef XFS_TRANS_DEBUG - kmem_free(bip->bli_orig); - bip->bli_orig = NULL; - kmem_free(bip->bli_logged); - bip->bli_logged = NULL; -#endif /* XFS_TRANS_DEBUG */ - -#ifdef XFS_BLI_TRACE - ktrace_free(bip->bli_trace); -#endif - kmem_zone_free(xfs_buf_item_zone, bip); + xfs_buf_item_free(bip); } #if defined(XFS_BLI_TRACE) diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 760f4c5..75b0cd4 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -149,7 +149,14 @@ xfs_swap_extents( sbp = &sxp->sx_stat; - xfs_lock_two_inodes(ip, tip, lock_flags); + /* + * we have to do two separate lock calls here to keep lockdep + * happy. If we try to get all the locks in one call, lock will + * report false positives when we drop the ILOCK and regain them + * below. + */ + xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); locked = 1; /* Verify that both files have the same format */ diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h index cdc2d34..2813cdd 100644 --- a/fs/xfs/xfs_dmapi.h +++ b/fs/xfs/xfs_dmapi.h @@ -18,7 +18,6 @@ #ifndef __XFS_DMAPI_H__ #define __XFS_DMAPI_H__ -#include <linux/version.h> /* Values used to define the on-disk version of dm_attrname_t. All * on-disk attribute names start with the 8-byte string "SGI_DMI_". * diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 00e80df..dbd9cef 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -4118,7 +4118,7 @@ xfs_iext_indirect_to_direct( ASSERT(nextents <= XFS_LINEAR_EXTS); size = nextents * sizeof(xfs_bmbt_rec_t); - xfs_iext_irec_compact_full(ifp); + xfs_iext_irec_compact_pages(ifp); ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); ep = ifp->if_u1.if_ext_irec->er_extbuf; @@ -4449,8 +4449,7 @@ xfs_iext_irec_remove( * compaction policy is as follows: * * Full Compaction: Extents fit into a single page (or inline buffer) - * Full Compaction: Extents occupy less than 10% of allocated space - * Partial Compaction: Extents occupy > 10% and < 50% of allocated space + * Partial Compaction: Extents occupy less than 50% of allocated space * No Compaction: Extents occupy at least 50% of allocated space */ void @@ -4471,8 +4470,6 @@ xfs_iext_irec_compact( xfs_iext_direct_to_inline(ifp, nextents); } else if (nextents <= XFS_LINEAR_EXTS) { xfs_iext_indirect_to_direct(ifp); - } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 3) { - xfs_iext_irec_compact_full(ifp); } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { xfs_iext_irec_compact_pages(ifp); } @@ -4496,7 +4493,7 @@ xfs_iext_irec_compact_pages( erp_next = erp + 1; if (erp_next->er_extcount <= (XFS_LINEAR_EXTS - erp->er_extcount)) { - memmove(&erp->er_extbuf[erp->er_extcount], + memcpy(&erp->er_extbuf[erp->er_extcount], erp_next->er_extbuf, erp_next->er_extcount * sizeof(xfs_bmbt_rec_t)); erp->er_extcount += erp_next->er_extcount; @@ -4516,91 +4513,6 @@ xfs_iext_irec_compact_pages( } /* - * Fully compact the extent records managed by the indirection array. - */ -void -xfs_iext_irec_compact_full( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_bmbt_rec_host_t *ep, *ep_next; /* extent record pointers */ - xfs_ext_irec_t *erp, *erp_next; /* extent irec pointers */ - int erp_idx = 0; /* extent irec index */ - int ext_avail; /* empty entries in ex list */ - int ext_diff; /* number of exts to add */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp = ifp->if_u1.if_ext_irec; - ep = &erp->er_extbuf[erp->er_extcount]; - erp_next = erp + 1; - ep_next = erp_next->er_extbuf; - - while (erp_idx < nlists - 1) { - /* - * Check how many extent records are available in this irec. - * If there is none skip the whole exercise. - */ - ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; - if (ext_avail) { - - /* - * Copy over as many as possible extent records into - * the previous page. - */ - ext_diff = MIN(ext_avail, erp_next->er_extcount); - memcpy(ep, ep_next, ext_diff * sizeof(xfs_bmbt_rec_t)); - erp->er_extcount += ext_diff; - erp_next->er_extcount -= ext_diff; - - /* - * If the next irec is empty now we can simply - * remove it. - */ - if (erp_next->er_extcount == 0) { - /* - * Free page before removing extent record - * so er_extoffs don't get modified in - * xfs_iext_irec_remove. - */ - kmem_free(erp_next->er_extbuf); - erp_next->er_extbuf = NULL; - xfs_iext_irec_remove(ifp, erp_idx + 1); - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - - /* - * If the next irec is not empty move up the content - * that has not been copied to the previous page to - * the beggining of this one. - */ - } else { - memmove(erp_next->er_extbuf, &ep_next[ext_diff], - erp_next->er_extcount * - sizeof(xfs_bmbt_rec_t)); - ep_next = erp_next->er_extbuf; - memset(&ep_next[erp_next->er_extcount], 0, - (XFS_LINEAR_EXTS - - erp_next->er_extcount) * - sizeof(xfs_bmbt_rec_t)); - } - } - - if (erp->er_extcount == XFS_LINEAR_EXTS) { - erp_idx++; - if (erp_idx < nlists) - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - else - break; - } - ep = &erp->er_extbuf[erp->er_extcount]; - erp_next = erp + 1; - ep_next = erp_next->er_extbuf; - } -} - -/* * This is called to update the er_extoff field in the indirection * array when extents have been added or removed from one of the * extent lists. erp_idx contains the irec index to begin updating diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ccba14e..503ea89 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -124,16 +124,27 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, STATIC int xlog_iclogs_empty(xlog_t *log); #if defined(XFS_LOG_TRACE) + +#define XLOG_TRACE_LOGGRANT_SIZE 2048 +#define XLOG_TRACE_ICLOG_SIZE 256 + +void +xlog_trace_loggrant_alloc(xlog_t *log) +{ + log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS); +} + +void +xlog_trace_loggrant_dealloc(xlog_t *log) +{ + ktrace_free(log->l_grant_trace); +} + void xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) { unsigned long cnts; - if (!log->l_grant_trace) { - log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP); - if (!log->l_grant_trace) - return; - } /* ticket counts are 1 byte each */ cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8; @@ -157,10 +168,20 @@ xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) } void +xlog_trace_iclog_alloc(xlog_in_core_t *iclog) +{ + iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS); +} + +void +xlog_trace_iclog_dealloc(xlog_in_core_t *iclog) +{ + ktrace_free(iclog->ic_trace); +} + +void xlog_trace_iclog(xlog_in_core_t *iclog, uint state) { - if (!iclog->ic_trace) - iclog->ic_trace = ktrace_alloc(256, KM_NOFS); ktrace_enter(iclog->ic_trace, (void *)((unsigned long)state), (void *)((unsigned long)current_pid()), @@ -170,8 +191,15 @@ xlog_trace_iclog(xlog_in_core_t *iclog, uint state) (void *)NULL, (void *)NULL); } #else + +#define xlog_trace_loggrant_alloc(log) +#define xlog_trace_loggrant_dealloc(log) #define xlog_trace_loggrant(log,tic,string) + +#define xlog_trace_iclog_alloc(iclog) +#define xlog_trace_iclog_dealloc(iclog) #define xlog_trace_iclog(iclog,state) + #endif /* XFS_LOG_TRACE */ @@ -1009,7 +1037,7 @@ xlog_iodone(xfs_buf_t *bp) * layer, it means the underlyin device no longer supports * barrier I/O. Warn loudly and turn off barriers. */ - if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ORDERED(bp)) { + if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ISORDERED(bp)) { l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER; xfs_fs_cmn_err(CE_WARN, l->l_mp, "xlog_iodone: Barriers are no longer supported" @@ -1231,6 +1259,7 @@ xlog_alloc_log(xfs_mount_t *mp, spin_lock_init(&log->l_grant_lock); sv_init(&log->l_flush_wait, 0, "flush_wait"); + xlog_trace_loggrant_alloc(log); /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); @@ -1285,6 +1314,8 @@ xlog_alloc_log(xfs_mount_t *mp, sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); + xlog_trace_iclog_alloc(iclog); + iclogp = &iclog->ic_next; } *iclogp = log->l_iclog; /* complete ring */ @@ -1565,11 +1596,7 @@ xlog_dealloc_log(xlog_t *log) sv_destroy(&iclog->ic_force_wait); sv_destroy(&iclog->ic_write_wait); xfs_buf_free(iclog->ic_bp); -#ifdef XFS_LOG_TRACE - if (iclog->ic_trace != NULL) { - ktrace_free(iclog->ic_trace); - } -#endif + xlog_trace_iclog_dealloc(iclog); next_iclog = iclog->ic_next; kmem_free(iclog); iclog = next_iclog; @@ -1578,14 +1605,7 @@ xlog_dealloc_log(xlog_t *log) spinlock_destroy(&log->l_grant_lock); xfs_buf_free(log->l_xbuf); -#ifdef XFS_LOG_TRACE - if (log->l_trace != NULL) { - ktrace_free(log->l_trace); - } - if (log->l_grant_trace != NULL) { - ktrace_free(log->l_grant_trace); - } -#endif + xlog_trace_loggrant_dealloc(log); log->l_mp->m_log = NULL; kmem_free(log); } /* xlog_dealloc_log */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index c8a5b22..e7d8f84 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -448,7 +448,6 @@ typedef struct log { int l_grant_write_bytes; #ifdef XFS_LOG_TRACE - struct ktrace *l_trace; struct ktrace *l_grant_trace; #endif diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index aa238c8..8b6812f 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1838,6 +1838,12 @@ again: #endif } +/* + * xfs_lock_two_inodes() can only be used to lock one type of lock + * at a time - the iolock or the ilock, but not both at once. If + * we lock both at once, lockdep will report false positives saying + * we have violated locking orders. + */ void xfs_lock_two_inodes( xfs_inode_t *ip0, @@ -1848,6 +1854,8 @@ xfs_lock_two_inodes( int attempts = 0; xfs_log_item_t *lp; + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); ASSERT(ip0->i_ino != ip1->i_ino); if (ip0->i_ino > ip1->i_ino) { @@ -3152,6 +3160,13 @@ error1: /* Just cancel transaction */ /* * Zero file bytes between startoff and endoff inclusive. * The iolock is held exclusive and no blocks are buffered. + * + * This function is used by xfs_free_file_space() to zero + * partial blocks when the range to free is not block aligned. + * When unreserving space with boundaries that are not block + * aligned we round up the start and round down the end + * boundaries and then use this function to zero the parts of + * the blocks that got dropped during the rounding. */ STATIC int xfs_zero_remaining_bytes( @@ -3168,6 +3183,17 @@ xfs_zero_remaining_bytes( int nimap; int error = 0; + /* + * Avoid doing I/O beyond eof - it's not necessary + * since nothing can read beyond eof. The space will + * be zeroed when the file is extended anyway. + */ + if (startoff >= ip->i_size) + return 0; + + if (endoff > ip->i_size) + endoff = ip->i_size; + bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp); |