From d5e84fd8d0634d056248b67463b42f6c85896a19 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 19 Sep 2016 10:57:40 +0100 Subject: Btrfs: fix incremental send failure caused by balance Commit 951555856b88 ("Btrfs: send, don't bug on inconsistent snapshots") removed some BUG_ON() statements (replacing them with returning errors to user space and logging error messages) when a snapshot is in an inconsistent state due to failures to update a delayed inode item (ENOMEM or ENOSPC) after adding/updating/deleting references, xattrs or file extent items. However there is a case, when no errors happen, where a file extent item can be modified without having the corresponding inode item updated. This case happens during balance under very specific timings, when relocation is in the stage where it updates data pointers and a leaf that contains file extent items is COWed. When that happens file extent items get their disk_bytenr field updated to a new value that reflects the post relocation logical address of the extent, without updating their respective inode items (as there is nothing that needs to be updated on them). This is performed at relocation.c:replace_file_extents() through relocation.c:btrfs_reloc_cow_block(). So make an incremental send deal with this case and don't do any processing for a file extent item that got its disk_bytenr field updated by relocation, since the extent's data is the same as the one pointed by the file extent item in the parent snapshot. After the recent commit mentioned above this case resulted in EIO errors returned to user space (and an error message logged to dmesg/syslog) when doing an incremental send, while before it, it resulted in hitting a BUG_ON leading to the following trace: [ 952.206705] ------------[ cut here ]------------ [ 952.206714] kernel BUG at ../fs/btrfs/send.c:5653! [ 952.206719] Internal error: Oops - BUG: 0 [#1] SMP [ 952.209854] Modules linked in: st dm_mod nls_utf8 isofs fuse nf_log_ipv6 xt_pkttype xt_physdev br_netfilter nf_log_ipv4 nf_log_common xt_LOG xt_limit ebtable_filter ebtables af_packet bridge stp llc ip6t_REJECT xt_tcpudp nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_raw ipt_REJECT iptable_raw xt_CT iptable_filter ip6table_mangle nf_conntrack_netbios_ns nf_conntrack_broadcast nf_conntrack_ipv4 nf_defrag_ipv4 ip_tables xt_conntrack nf_conntrack ip6table_filter ip6_tables x_tables xfs libcrc32c nls_iso8859_1 nls_cp437 vfat fat joydev aes_ce_blk ablk_helper cryptd snd_intel8x0 aes_ce_cipher snd_ac97_codec ac97_bus snd_pcm ghash_ce sha2_ce sha1_ce snd_timer snd virtio_net soundcore btrfs xor sr_mod cdrom hid_generic usbhid raid6_pq virtio_blk virtio_scsi bochs_drm drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm virtio_mmio xhci_pci xhci_hcd usbcore usb_common virtio_pci virtio_ring virtio drm sg efivarfs [ 952.228333] Supported: Yes [ 952.228908] CPU: 0 PID: 12779 Comm: snapperd Not tainted 4.4.14-50-default #1 [ 952.230329] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 [ 952.231683] task: ffff800058e94100 ti: ffff8000d866c000 task.ti: ffff8000d866c000 [ 952.233279] PC is at changed_cb+0x9f4/0xa48 [btrfs] [ 952.234375] LR is at changed_cb+0x58/0xa48 [btrfs] [ 952.236552] pc : [] lr : [] pstate: 80000145 [ 952.238049] sp : ffff8000d866fa20 [ 952.238732] x29: ffff8000d866fa20 x28: 0000000000000019 [ 952.239840] x27: 00000000000028d5 x26: 00000000000024a2 [ 952.241008] x25: 0000000000000002 x24: ffff8000e66e92f0 [ 952.242131] x23: ffff8000b8c76800 x22: ffff800092879140 [ 952.243238] x21: 0000000000000002 x20: ffff8000d866fb78 [ 952.244348] x19: ffff8000b8f8c200 x18: 0000000000002710 [ 952.245607] x17: 0000ffff90d42480 x16: ffff800000237dc0 [ 952.246719] x15: 0000ffff90de7510 x14: ab000c000a2faf08 [ 952.247835] x13: 0000000000577c2b x12: ab000c000b696665 [ 952.248981] x11: 2e65726f632f6966 x10: 652d34366d72612f [ 952.250101] x9 : 32627572672f746f x8 : ab000c00092f1671 [ 952.251352] x7 : 8000000000577c2b x6 : ffff800053eadf45 [ 952.252468] x5 : 0000000000000000 x4 : ffff80005e169494 [ 952.253582] x3 : 0000000000000004 x2 : ffff8000d866fb78 [ 952.254695] x1 : 000000000003e2a3 x0 : 000000000003e2a4 [ 952.255803] [ 952.256150] Process snapperd (pid: 12779, stack limit = 0xffff8000d866c020) [ 952.257516] Stack: (0xffff8000d866fa20 to 0xffff8000d8670000) [ 952.258654] fa20: ffff8000d866fae0 ffff7ffffc308fc0 ffff800092879140 ffff8000e66e92f0 [ 952.260219] fa40: 0000000000000035 ffff800055de6000 ffff8000b8c76800 ffff8000d866fb78 [ 952.261745] fa60: 0000000000000002 00000000000024a2 00000000000028d5 0000000000000019 [ 952.263269] fa80: ffff8000d866fae0 ffff7ffffc3090f0 ffff8000d866fae0 ffff7ffffc309128 [ 952.264797] faa0: ffff800092879140 ffff8000e66e92f0 0000000000000035 ffff800055de6000 [ 952.268261] fac0: ffff8000b8c76800 ffff8000d866fb78 0000000000000002 0000000000001000 [ 952.269822] fae0: ffff8000d866fbc0 ffff7ffffc39ecfc ffff8000b8f8c200 ffff8000b8f8c368 [ 952.271368] fb00: ffff8000b8f8c378 ffff800055de6000 0000000000000001 ffff8000ecb17500 [ 952.272893] fb20: ffff8000b8c76800 ffff800092879140 ffff800062b6d000 ffff80007a9e2470 [ 952.274420] fb40: ffff8000b8f8c208 0000000005784000 ffff8000580a8000 ffff8000b8f8c200 [ 952.276088] fb60: ffff7ffffc39d488 00000002b8f8c368 0000000000000000 000000000003e2a4 [ 952.280275] fb80: 000000000000006c ffff7ffffc39ec00 000000000003e2a4 000000000000006c [ 952.283219] fba0: ffff8000b8f8c300 0000000000000100 0000000000000001 ffff8000ecb17500 [ 952.286166] fbc0: ffff8000d866fcd0 ffff7ffffc3643c0 ffff8000f8842700 0000ffff8ffe9278 [ 952.289136] fbe0: 0000000040489426 ffff800055de6000 0000ffff8ffe9278 0000000040489426 [ 952.292083] fc00: 000000000000011d 000000000000001d ffff80007a9e4598 ffff80007a9e43e8 [ 952.294959] fc20: ffff8000b8c7693f 0000000000003b24 0000000000000019 ffff8000b8f8c218 [ 952.301161] fc40: 00000001d866fc70 ffff8000b8c76800 0000000000000128 ffffffffffffff84 [ 952.305749] fc60: ffff800058e941ff 0000000000003a58 ffff8000d866fcb0 ffff8000000f7390 [ 952.308875] fc80: 000000000000012a 0000000000010290 ffff8000d866fc00 000000000000007b [ 952.311915] fca0: 0000000000010290 ffff800046c1b100 74732d7366727462 000001006d616572 [ 952.314937] fcc0: ffff8000fffc4100 cb88537fdc8ba60e ffff8000d866fe10 ffff8000002499e8 [ 952.318008] fce0: 0000000040489426 ffff8000f8842700 0000ffff8ffe9278 ffff80007a9e4598 [ 952.321321] fd00: 0000ffff8ffe9278 0000000040489426 000000000000011d 000000000000001d [ 952.324280] fd20: ffff80000072c000 ffff8000d866c000 ffff8000d866fda0 ffff8000000e997c [ 952.327156] fd40: ffff8000fffc4180 00000000000031ed ffff8000fffc4180 ffff800046c1b7d4 [ 952.329895] fd60: 0000000000000140 0000ffff907ea170 000000000000011d 00000000000000dc [ 952.334641] fd80: ffff80000072c000 ffff8000d866c000 0000000000000000 0000000000000002 [ 952.338002] fda0: ffff8000d866fdd0 ffff8000000ebacc ffff800046c1b080 ffff800046c1b7d4 [ 952.340724] fdc0: ffff8000d866fdf0 ffff8000000db67c 0000000000000040 ffff800000e69198 [ 952.343415] fde0: 0000ffff8ffea790 00000000000031ed ffff8000d866fe20 ffff800000254000 [ 952.346101] fe00: 000000000000001d 0000000000000004 ffff8000d866fe90 ffff800000249d3c [ 952.348980] fe20: ffff8000f8842700 0000000000000000 ffff8000f8842701 0000000000000008 [ 952.351696] fe40: ffff8000d866fe70 0000000000000008 ffff8000d866fe90 ffff800000249cf8 [ 952.354387] fe60: ffff8000f8842700 0000ffff8ffe9170 ffff8000f8842701 0000000000000008 [ 952.357083] fe80: 0000ffff8ffe9278 ffff80008ff85500 0000ffff8ffe90c0 ffff800000085c84 [ 952.359800] fea0: 0000000000000000 0000ffff8ffe9170 ffffffffffffffff 0000ffff90d473bc [ 952.365351] fec0: 0000000000000000 0000000000000015 0000000000000008 0000000040489426 [ 952.369550] fee0: 0000ffff8ffe9278 0000ffff907ea790 0000ffff907ea170 0000ffff907ea790 [ 952.372416] ff00: 0000ffff907ea170 0000000000000000 000000000000001d 0000000000000004 [ 952.375223] ff20: 0000ffff90a32220 00000000003d0f00 0000ffff907ea0a0 0000ffff8ffe8f30 [ 952.378099] ff40: 0000ffff9100f554 0000ffff91147000 0000ffff91117bc0 0000ffff90d473b0 [ 952.381115] ff60: 0000ffff9100f620 0000ffff880069b0 0000ffff8ffe9170 0000ffff8ffe91a0 [ 952.384003] ff80: 0000ffff8ffe9160 0000ffff8ffe9140 0000ffff88006990 0000ffff8ffe9278 [ 952.386860] ffa0: 0000ffff88008a60 0000ffff8ffe9480 0000ffff88014ca0 0000ffff8ffe90c0 [ 952.389654] ffc0: 0000ffff910be8e8 0000ffff8ffe90c0 0000ffff90d473bc 0000000000000000 [ 952.410986] ffe0: 0000000000000008 000000000000001d 6e2079747265706f 72616d223d656d61 [ 952.415497] Call trace: [ 952.417403] [] changed_cb+0x9f4/0xa48 [btrfs] [ 952.420023] [] btrfs_compare_trees+0x500/0x6b0 [btrfs] [ 952.422759] [] btrfs_ioctl_send+0xb4c/0xe10 [btrfs] [ 952.425601] [] btrfs_ioctl+0x374/0x29a4 [btrfs] [ 952.428031] [] do_vfs_ioctl+0x33c/0x600 [ 952.430360] [] SyS_ioctl+0x90/0xa4 [ 952.432552] [] el0_svc_naked+0x38/0x3c [ 952.434803] Code: 2a1503e0 17fffdac b9404282 17ffff28 (d4210000) [ 952.437457] ---[ end trace 9afd7090c466cf15 ]--- Signed-off-by: Filipe Manana --- fs/btrfs/send.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 9787968..0b46289 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5805,6 +5805,64 @@ static int changed_extent(struct send_ctx *sctx, int ret = 0; if (sctx->cur_ino != sctx->cmp_key->objectid) { + + if (result == BTRFS_COMPARE_TREE_CHANGED) { + struct extent_buffer *leaf_l; + struct extent_buffer *leaf_r; + struct btrfs_file_extent_item *ei_l; + struct btrfs_file_extent_item *ei_r; + + leaf_l = sctx->left_path->nodes[0]; + leaf_r = sctx->right_path->nodes[0]; + ei_l = btrfs_item_ptr(leaf_l, + sctx->left_path->slots[0], + struct btrfs_file_extent_item); + ei_r = btrfs_item_ptr(leaf_r, + sctx->right_path->slots[0], + struct btrfs_file_extent_item); + + /* + * We may have found an extent item that has changed + * only its disk_bytenr field and the corresponding + * inode item was not updated. This case happens due to + * very specific timings during relocation when a leaf + * that contains file extent items is COWed while + * relocation is ongoing and its in the stage where it + * updates data pointers. So when this happens we can + * safely ignore it since we know it's the same extent, + * but just at different logical and physical locations + * (when an extent is fully replaced with a new one, we + * know the generation number must have changed too, + * since snapshot creation implies committing the current + * transaction, and the inode item must have been updated + * as well). + * This replacement of the disk_bytenr happens at + * relocation.c:replace_file_extents() through + * relocation.c:btrfs_reloc_cow_block(). + */ + if (btrfs_file_extent_generation(leaf_l, ei_l) == + btrfs_file_extent_generation(leaf_r, ei_r) && + btrfs_file_extent_ram_bytes(leaf_l, ei_l) == + btrfs_file_extent_ram_bytes(leaf_r, ei_r) && + btrfs_file_extent_compression(leaf_l, ei_l) == + btrfs_file_extent_compression(leaf_r, ei_r) && + btrfs_file_extent_encryption(leaf_l, ei_l) == + btrfs_file_extent_encryption(leaf_r, ei_r) && + btrfs_file_extent_other_encoding(leaf_l, ei_l) == + btrfs_file_extent_other_encoding(leaf_r, ei_r) && + btrfs_file_extent_type(leaf_l, ei_l) == + btrfs_file_extent_type(leaf_r, ei_r) && + btrfs_file_extent_disk_bytenr(leaf_l, ei_l) != + btrfs_file_extent_disk_bytenr(leaf_r, ei_r) && + btrfs_file_extent_disk_num_bytes(leaf_l, ei_l) == + btrfs_file_extent_disk_num_bytes(leaf_r, ei_r) && + btrfs_file_extent_offset(leaf_l, ei_l) == + btrfs_file_extent_offset(leaf_r, ei_r) && + btrfs_file_extent_num_bytes(leaf_l, ei_l) == + btrfs_file_extent_num_bytes(leaf_r, ei_r)) + return 0; + } + inconsistent_snapshot_error(sctx, result, "extent"); return -EIO; } -- cgit v1.1 From de0dcc40f6e24d6bac6b60e36eac4659bbbd3f00 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Oct 2016 13:38:41 -0700 Subject: f2fs: fix wrong sum_page pointer in f2fs_gc This patch fixes using a wrong pointer for sum_page in f2fs_gc. Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 93985c6..6f14ee9 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -852,16 +852,16 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, for (segno = start_segno; segno < end_segno; segno++) { - if (get_valid_blocks(sbi, segno, 1) == 0 || - unlikely(f2fs_cp_error(sbi))) - goto next; - /* find segment summary of victim */ sum_page = find_get_page(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); - f2fs_bug_on(sbi, !PageUptodate(sum_page)); f2fs_put_page(sum_page, 0); + if (get_valid_blocks(sbi, segno, 1) == 0 || + !PageUptodate(sum_page) || + unlikely(f2fs_cp_error(sbi))) + goto next; + sum = page_address(sum_page); f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer))); -- cgit v1.1 From 651e1c3b1576c5ffda6df01db1ef535eeb8b1a37 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Oct 2016 23:12:53 -0400 Subject: ext4: super.c: Update logging style using KERN_CONT Recent commit require line continuing printks to use PR_CONT. Update super.c to use KERN_CONT and use vsprintf extension %pV to avoid a printk/vprintk/printk("\n") sequence as well. Signed-off-by: Joe Perches Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/super.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6db81fb..20da99d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -597,14 +597,15 @@ void __ext4_std_error(struct super_block *sb, const char *function, void __ext4_abort(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { + struct va_format vaf; va_list args; save_error_info(sb, function, line); va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id, - function, line); - vprintk(fmt, args); - printk("\n"); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: %pV\n", + sb->s_id, function, line, &vaf); va_end(args); if ((sb->s_flags & MS_RDONLY) == 0) { @@ -2715,12 +2716,12 @@ static void print_daily_error_info(unsigned long arg) es->s_first_error_func, le32_to_cpu(es->s_first_error_line)); if (es->s_first_error_ino) - printk(": inode %u", + printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_first_error_ino)); if (es->s_first_error_block) - printk(": block %llu", (unsigned long long) + printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_first_error_block)); - printk("\n"); + printk(KERN_CONT "\n"); } if (es->s_last_error_time) { printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d", @@ -2729,12 +2730,12 @@ static void print_daily_error_info(unsigned long arg) es->s_last_error_func, le32_to_cpu(es->s_last_error_line)); if (es->s_last_error_ino) - printk(": inode %u", + printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_last_error_ino)); if (es->s_last_error_block) - printk(": block %llu", (unsigned long long) + printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_last_error_block)); - printk("\n"); + printk(KERN_CONT "\n"); } mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ } -- cgit v1.1 From 559cce698eaf4ccecb2213b2519ea3a0413e5155 Mon Sep 17 00:00:00 2001 From: Taesoo Kim Date: Wed, 12 Oct 2016 23:19:18 -0400 Subject: jbd2: fix incorrect unlock on j_list_lock When 'jh->b_transaction == transaction' (asserted by below) J_ASSERT_JH(jh, (jh->b_transaction == transaction || ... 'journal->j_list_lock' will be incorrectly unlocked, since the the lock is aquired only at the end of if / else-if statements (missing the else case). Signed-off-by: Taesoo Kim Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Fixes: 6e4862a5bb9d12be87e4ea5d9a60836ebed71d28 Cc: stable@vger.kernel.org # 3.14+ --- fs/jbd2/transaction.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 3d8246a..e165266 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1149,6 +1149,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "file as BJ_Reserved"); spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); + spin_unlock(&journal->j_list_lock); } else if (jh->b_transaction == journal->j_committing_transaction) { /* first access by this transaction */ jh->b_modified = 0; @@ -1156,8 +1157,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "set next transaction"); spin_lock(&journal->j_list_lock); jh->b_next_transaction = transaction; + spin_unlock(&journal->j_list_lock); } - spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); /* -- cgit v1.1 From c4704a4fbe834eee4109ca064131d440941f6235 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 12 Oct 2016 23:24:51 -0400 Subject: ext4: do not advertise encryption support when disabled The sysfs file /sys/fs/ext4/features/encryption was present on kernels compiled with CONFIG_EXT4_FS_ENCRYPTION=n. This was misleading because such kernels do not actually support ext4 encryption. Therefore, only provide this file on kernels compiled with CONFIG_EXT4_FS_ENCRYPTION=y. Note: since the ext4 feature files are all hardcoded to have a contents of "supported", it really is the presence or absence of the file that is significant, not the contents (and this change reflects that). Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/sysfs.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 73bcfd4..42145be 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -223,14 +223,18 @@ static struct attribute *ext4_attrs[] = { EXT4_ATTR_FEATURE(lazy_itable_init); EXT4_ATTR_FEATURE(batched_discard); EXT4_ATTR_FEATURE(meta_bg_resize); +#ifdef CONFIG_EXT4_FS_ENCRYPTION EXT4_ATTR_FEATURE(encryption); +#endif EXT4_ATTR_FEATURE(metadata_csum_seed); static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), ATTR_LIST(batched_discard), ATTR_LIST(meta_bg_resize), +#ifdef CONFIG_EXT4_FS_ENCRYPTION ATTR_LIST(encryption), +#endif ATTR_LIST(metadata_csum_seed), NULL, }; -- cgit v1.1 From fb4454376df9d820d95452d71dd83da6971f9338 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 12 Oct 2016 23:30:16 -0400 Subject: fscrypto: make XTS tweak initialization endian-independent The XTS tweak (or IV) was initialized differently on little endian and big endian systems. Because the ciphertext depends on the XTS tweak, it was not possible to use an encrypted filesystem created by a little endian system on a big endian system and vice versa, even if they shared the same PAGE_SIZE. Fix this by always using little endian. This will break hypothetical big endian users of ext4 or f2fs encryption. However, all users we are aware of are little endian, and it's believed that "real" big endian users are unlikely to exist yet. So this might as well be fixed now before it's too late. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/crypto/crypto.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 61057b7d..98f87fe 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -151,7 +151,10 @@ static int do_page_crypto(struct inode *inode, struct page *src_page, struct page *dest_page, gfp_t gfp_flags) { - u8 xts_tweak[FS_XTS_TWEAK_SIZE]; + struct { + __le64 index; + u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)]; + } xts_tweak; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist dst, src; @@ -171,17 +174,15 @@ static int do_page_crypto(struct inode *inode, req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, page_crypt_complete, &ecr); - BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index)); - memcpy(xts_tweak, &index, sizeof(index)); - memset(&xts_tweak[sizeof(index)], 0, - FS_XTS_TWEAK_SIZE - sizeof(index)); + BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); + xts_tweak.index = cpu_to_le64(index); + memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); sg_init_table(&dst, 1); sg_set_page(&dst, dest_page, PAGE_SIZE, 0); sg_init_table(&src, 1); sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, - xts_tweak); + skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak); if (rw == FS_DECRYPT) res = crypto_skcipher_decrypt(req); else -- cgit v1.1 From 233c9edcca0136f2b9b304fd4f32e9bb6ce88ea9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 12 Oct 2016 09:09:44 +0300 Subject: afs: unmapping the wrong buffer We switched from kmap_atomic() to kmap() so the kunmap() calls need to be updated to match. Fixes: d001648ec7cf ('rxrpc: Don't expose skbs to in-kernel users [ver #2]') Signed-off-by: Dan Carpenter Signed-off-by: David Howells --- fs/afs/fsclient.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 96f4d76..31c616a 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -364,7 +364,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) buffer = kmap(page); ret = afs_extract_data(call, buffer, call->count, true); - kunmap(buffer); + kunmap(page); if (ret < 0) return ret; } @@ -397,7 +397,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) page = call->reply3; buffer = kmap(page); memset(buffer + call->count, 0, PAGE_SIZE - call->count); - kunmap(buffer); + kunmap(page); } _leave(" = 0 [done]"); -- cgit v1.1 From 50a2c95381a7d0e453d7bdfde81d0c5f8351ba54 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 13 Oct 2016 08:27:10 +0100 Subject: afs: call->operation_ID sometimes used as __be32 sometimes as u32 call->operation_ID is sometimes being used as __be32 sometimes is being used as u32. Be consistent and settle on using as u32. Signed-off-by: David Howells operation_ID); + _enter("{CB.OP %u}", call->operation_ID); - _enter("{CB.OP %u}", operation_id); - - switch (operation_id) { + switch (call->operation_ID) { case CBCallBack: call->type = &afs_SRXCBCallBack; return true; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 5497c84..535a38d 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -112,7 +112,7 @@ struct afs_call { bool need_attention; /* T if RxRPC poked us */ u16 service_id; /* RxRPC service ID to call */ __be16 port; /* target UDP port */ - __be32 operation_ID; /* operation ID for an incoming call */ + u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ __be32 tmp; /* place to extract temporary data */ afs_dataversion_t store_version; /* updated version expected from store */ diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 477928b..25f05a8 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -676,10 +676,11 @@ static int afs_deliver_cm_op_id(struct afs_call *call) ASSERTCMP(call->offset, <, 4); /* the operation ID forms the first four bytes of the request data */ - ret = afs_extract_data(call, &call->operation_ID, 4, true); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; + call->operation_ID = ntohl(call->tmp); call->state = AFS_CALL_AWAIT_REQUEST; call->offset = 0; -- cgit v1.1 From a3f9d1b58a9ffce011ef4f074bfa36ae30eade28 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Tue, 11 Oct 2016 15:53:21 -0400 Subject: pnfs/blocklayout: fix last_write_offset incorrectly set to page boundary Commit 41963c10c47a35185e68cb9049f7a3493c94d2d7 sets the block layout's last written byte to the offset of the end of the extent rather than the end of the write which incorrectly updates the inode's size for partial-page writes. Fixes: 41963c10c47a ("pnfs/blocklayout: update last_write_offset atomically with extents") Signed-off-by: Benjamin Coddington Reviewed-by: Christoph Hellwig Tested-by: Christoph Hellwig Cc: stable@vger.kernel.org # 4.8+ Signed-off-by: Anna Schumaker --- fs/nfs/blocklayout/blocklayout.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 2178476..2905479 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -344,9 +344,10 @@ static void bl_write_cleanup(struct work_struct *work) u64 start = hdr->args.offset & (loff_t)PAGE_MASK; u64 end = (hdr->args.offset + hdr->args.count + PAGE_SIZE - 1) & (loff_t)PAGE_MASK; + u64 lwb = hdr->args.offset + hdr->args.count; ext_tree_mark_written(bl, start >> SECTOR_SHIFT, - (end - start) >> SECTOR_SHIFT, end); + (end - start) >> SECTOR_SHIFT, lwb); } pnfs_ld_write_done(hdr); -- cgit v1.1 From 199625098a18a5522b424dea9b122b254c022fc5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 15 Oct 2016 09:39:31 -0400 Subject: ext4: correct endianness conversion in __xattr_check_inode() It should be cpu_to_le32(), not le32_to_cpu(). No change in behavior. Found with sparse, and this was the only endianness warning in fs/ext4/. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c15d633..e90c5cd 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -241,7 +241,7 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, int error = -EFSCORRUPTED; if (((void *) header >= end) || - (header->h_magic != le32_to_cpu(EXT4_XATTR_MAGIC))) + (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) goto errout; error = ext4_xattr_check_names(entry, end, entry); errout: -- cgit v1.1 From 8906a8223ad4909b391c5628f7991ebceda30e52 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 15 Oct 2016 09:48:50 -0400 Subject: fscrypto: lock inode while setting encryption policy i_rwsem needs to be acquired while setting an encryption policy so that concurrent calls to FS_IOC_SET_ENCRYPTION_POLICY are correctly serialized (especially the ->get_context() + ->set_context() pair), and so that new files cannot be created in the directory during or after the ->empty_dir() check. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Reviewed-by: Richard Weinberger Cc: stable@vger.kernel.org --- fs/crypto/policy.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index ed115ac..6865663 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -109,6 +109,8 @@ int fscrypt_process_policy(struct file *filp, if (ret) return ret; + inode_lock(inode); + if (!inode_has_encryption_context(inode)) { if (!S_ISDIR(inode->i_mode)) ret = -EINVAL; @@ -127,6 +129,8 @@ int fscrypt_process_policy(struct file *filp, ret = -EINVAL; } + inode_unlock(inode); + mnt_drop_write_file(filp); return ret; } -- cgit v1.1 From d74f3d25289aa9722cf777a7482eeee2eacdf46e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 15 Oct 2016 09:57:31 -0400 Subject: ext4: add missing KERN_CONT to a few more debugging uses Recent commits require line continuing printks to always use pr_cont or KERN_CONT. Add these markings to a few more printks. Miscellaneaous: o Integrate the ea_idebug and ea_bdebug macros to use a single call to printk(KERN_DEBUG instead of 3 separate printks o Use the more common varargs macro style Signed-off-by: Joe Perches Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger --- fs/ext4/block_validity.c | 4 ++-- fs/ext4/mballoc.h | 17 ++++++++--------- fs/ext4/namei.c | 18 ++++++++++-------- fs/ext4/xattr.c | 18 ++++++------------ 4 files changed, 26 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 02ddec6..fdb1954 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -128,12 +128,12 @@ static void debug_print_tree(struct ext4_sb_info *sbi) node = rb_first(&sbi->system_blks); while (node) { entry = rb_entry(node, struct ext4_system_zone, node); - printk("%s%llu-%llu", first ? "" : ", ", + printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ", entry->start_blk, entry->start_blk + entry->count - 1); first = 0; node = rb_next(node); } - printk("\n"); + printk(KERN_CONT "\n"); } int ext4_setup_system_zone(struct super_block *sb) diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 3ef1df6..1aba469 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -27,16 +27,15 @@ #ifdef CONFIG_EXT4_DEBUG extern ushort ext4_mballoc_debug; -#define mb_debug(n, fmt, a...) \ - do { \ - if ((n) <= ext4_mballoc_debug) { \ - printk(KERN_DEBUG "(%s, %d): %s: ", \ - __FILE__, __LINE__, __func__); \ - printk(fmt, ## a); \ - } \ - } while (0) +#define mb_debug(n, fmt, ...) \ +do { \ + if ((n) <= ext4_mballoc_debug) { \ + printk(KERN_DEBUG "(%s, %d): %s: " fmt, \ + __FILE__, __LINE__, __func__, ##__VA_ARGS__); \ + } \ +} while (0) #else -#define mb_debug(n, fmt, a...) no_printk(fmt, ## a) +#define mb_debug(n, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f92f10d..104f8bf 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -577,12 +577,13 @@ static inline unsigned dx_node_limit(struct inode *dir) static void dx_show_index(char * label, struct dx_entry *entries) { int i, n = dx_get_count (entries); - printk(KERN_DEBUG "%s index ", label); + printk(KERN_DEBUG "%s index", label); for (i = 0; i < n; i++) { - printk("%x->%lu ", i ? dx_get_hash(entries + i) : - 0, (unsigned long)dx_get_block(entries + i)); + printk(KERN_CONT " %x->%lu", + i ? dx_get_hash(entries + i) : 0, + (unsigned long)dx_get_block(entries + i)); } - printk("\n"); + printk(KERN_CONT "\n"); } struct stats @@ -679,7 +680,7 @@ static struct stats dx_show_leaf(struct inode *dir, } de = ext4_next_entry(de, size); } - printk("(%i)\n", names); + printk(KERN_CONT "(%i)\n", names); return (struct stats) { names, space, 1 }; } @@ -798,7 +799,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, q = entries + count - 1; while (p <= q) { m = p + (q - p) / 2; - dxtrace(printk(".")); + dxtrace(printk(KERN_CONT ".")); if (dx_get_hash(m) > hash) q = m - 1; else @@ -810,7 +811,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, at = entries; while (n--) { - dxtrace(printk(",")); + dxtrace(printk(KERN_CONT ",")); if (dx_get_hash(++at) > hash) { at--; @@ -821,7 +822,8 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, } at = p - 1; - dxtrace(printk(" %x->%u\n", at == entries ? 0 : dx_get_hash(at), + dxtrace(printk(KERN_CONT " %x->%u\n", + at == entries ? 0 : dx_get_hash(at), dx_get_block(at))); frame->entries = entries; frame->at = at; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index e90c5cd..d77be9e 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -61,18 +61,12 @@ #include "acl.h" #ifdef EXT4_XATTR_DEBUG -# define ea_idebug(inode, f...) do { \ - printk(KERN_DEBUG "inode %s:%lu: ", \ - inode->i_sb->s_id, inode->i_ino); \ - printk(f); \ - printk("\n"); \ - } while (0) -# define ea_bdebug(bh, f...) do { \ - printk(KERN_DEBUG "block %pg:%lu: ", \ - bh->b_bdev, (unsigned long) bh->b_blocknr); \ - printk(f); \ - printk("\n"); \ - } while (0) +# define ea_idebug(inode, fmt, ...) \ + printk(KERN_DEBUG "inode %s:%lu: " fmt "\n", \ + inode->i_sb->s_id, inode->i_ino, ##__VA_ARGS__) +# define ea_bdebug(bh, fmt, ...) \ + printk(KERN_DEBUG "block %pg:%lu: " fmt "\n", \ + bh->b_bdev, (unsigned long)bh->b_blocknr, ##__VA_ARGS__) #else # define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__) # define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) -- cgit v1.1 From 0d7718f666be181fda1ba2d08f137d87c1419347 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 10 Oct 2016 15:38:18 +0300 Subject: ceph: fix error handling in ceph_read_iter In case __ceph_do_getattr returns an error and the retry_op in ceph_read_iter is not READ_INLINE, then it's possible to invoke __free_page on a page which is NULL, this naturally leads to a crash. This can happen when, for example, a process waiting on a MDS reply receives sigterm. Fix this by explicitly checking whether the page is set or not. Cc: stable@vger.kernel.org # 3.19+ Signed-off-by: Nikolay Borisov Reviewed-by: Yan, Zheng Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 7bf0882..18630e8 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1272,7 +1272,8 @@ again: statret = __ceph_do_getattr(inode, page, CEPH_STAT_CAP_INLINE_DATA, !!page); if (statret < 0) { - __free_page(page); + if (page) + __free_page(page); if (statret == -ENODATA) { BUG_ON(retry_op != READ_INLINE); goto again; -- cgit v1.1 From 4547f4d8ffd63ba4ac129f9136027bd14b729101 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 23 Sep 2016 14:05:04 -0700 Subject: Btrfs: kill BUG_ON in do_relocation While updating btree, we try to push items between sibling nodes/leaves in order to keep height as low as possible. But we don't memset the original places with zero when pushing items so that we could end up leaving stale content in nodes/leaves. One may read the above stale content by increasing btree blocks' @nritems. One case I've come across is that in fs tree, a leaf has two parent nodes, hence running balance ends up with processing this leaf with two parent nodes, but it can only reach the valid parent node through btrfs_search_slot, so it'd be like, do_relocation for P in all parent nodes of block A: if !P->eb: btrfs_search_slot(key); --> get path from P to A. if lowest: BUG_ON(A->bytenr != bytenr of A recorded in P); btrfs_cow_block(P, A); --> change A's bytenr in P. After btrfs_cow_block, P has the new bytenr of A, but with the same @key, we get the same path again, and get panic by BUG_ON. Note that this is only happening in a corrupted fs, for a regular fs in which we have correct @nritems so that we won't read stale content in any case. Reviewed-by: Josef Bacik Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0ec8ffa..c4af0cd 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2728,7 +2728,14 @@ static int do_relocation(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(upper->eb, slot); if (lowest) { - BUG_ON(bytenr != node->bytenr); + if (bytenr != node->bytenr) { + btrfs_err(root->fs_info, + "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", + bytenr, node->bytenr, slot, + upper->eb->start); + err = -EIO; + goto next; + } } else { if (node->eb->start == bytenr) goto next; -- cgit v1.1 From 14155cafeadda946376260e2ad5d39a0528a332f Mon Sep 17 00:00:00 2001 From: Junjie Mao Date: Mon, 17 Oct 2016 09:20:25 +0800 Subject: btrfs: assign error values to the correct bio structs Fixes: 4246a0b63bd8 ("block: add a bi_error field to struct bio") Signed-off-by: Junjie Mao Acked-by: David Sterba Cc: stable@vger.kernel.org # 4.3+ Signed-off-by: Linus Torvalds --- fs/btrfs/compression.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ccc70d9..d4d8b7e 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -698,7 +698,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(root, comp_bio, mirror_num, 0); if (ret) { - bio->bi_error = ret; + comp_bio->bi_error = ret; bio_endio(comp_bio); } @@ -728,7 +728,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(root, comp_bio, mirror_num, 0); if (ret) { - bio->bi_error = ret; + comp_bio->bi_error = ret; bio_endio(comp_bio); } -- cgit v1.1 From a2ed0b391dd9c3ef1d64c7c3e370f4a5ffcd324a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 4 Oct 2016 13:44:06 +0200 Subject: isofs: Do not return EACCES for unknown filesystems When isofs_mount() is called to mount a device read-write, it returns EACCES even before it checks that the device actually contains an isofs filesystem. This may confuse mount(8) which then tries to mount all subsequent filesystem types in read-only mode. Fix the problem by returning EACCES only once we verify that the device indeed contains an iso9660 filesystem. CC: stable@vger.kernel.org Fixes: 17b7f7cf58926844e1dd40f5eb5348d481deca6a Reported-by: Kent Overstreet Reported-by: Karel Zak Signed-off-by: Jan Kara --- fs/isofs/inode.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index ad0c745..871c8b3 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -687,6 +687,11 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) pri_bh = NULL; root_found: + /* We don't support read-write mounts */ + if (!(s->s_flags & MS_RDONLY)) { + error = -EACCES; + goto out_freebh; + } if (joliet_level && (pri == NULL || !opt.rock)) { /* This is the case of Joliet with the norock mount flag. @@ -1501,9 +1506,6 @@ struct inode *__isofs_iget(struct super_block *sb, static struct dentry *isofs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - /* We don't support read-write mounts */ - if (!(flags & MS_RDONLY)) - return ERR_PTR(-EACCES); return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super); } -- cgit v1.1 From e952813e210b3addaea063da64ef68c3f30c0aa2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 18 Oct 2016 00:05:34 +0200 Subject: ext2: avoid bogus -Wmaybe-uninitialized warning On ARM, we get this false-positive warning since the rework of the ext2_get_blocks interface: fs/ext2/inode.c: In function 'ext2_get_block': include/linux/buffer_head.h:340:16: error: 'bno' may be used uninitialized in this function [-Werror=maybe-uninitialized] The calling conventions for this function are rather complex, and it's not surprising that the compiler gets this wrong, I spent a long time trying to understand how it all fits together myself. This change to avoid the warning makes sure the compiler sees that we always set 'bno' pointer whenever we have a positive return code. The transformation is correct because we always arrive at the 'got_it' label with a positive count that gets used as the return value, while any branch to the 'cleanup' label has a negative or zero 'err'. Fixes: 6750ad71986d ("ext2: stop passing buffer_head to ext2_get_blocks") Signed-off-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Cc: Dave Chinner Signed-off-by: Jan Kara --- fs/ext2/inode.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index d831e24..41b8b44 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -622,7 +622,7 @@ static int ext2_get_blocks(struct inode *inode, u32 *bno, bool *new, bool *boundary, int create) { - int err = -EIO; + int err; int offsets[4]; Indirect chain[4]; Indirect *partial; @@ -639,7 +639,7 @@ static int ext2_get_blocks(struct inode *inode, depth = ext2_block_to_path(inode,iblock,offsets,&blocks_to_boundary); if (depth == 0) - return (err); + return -EIO; partial = ext2_get_branch(inode, depth, offsets, chain, &err); /* Simplest case - block found, no allocation needed */ @@ -761,7 +761,6 @@ static int ext2_get_blocks(struct inode *inode, ext2_splice_branch(inode, iblock, partial, indirect_blks, count); mutex_unlock(&ei->truncate_mutex); got_it: - *bno = le32_to_cpu(chain[depth-1].key); if (count > blocks_to_boundary) *boundary = true; err = count; @@ -772,6 +771,8 @@ cleanup: brelse(partial->bh); partial--; } + if (err > 0) + *bno = le32_to_cpu(chain[depth-1].key); return err; } -- cgit v1.1 From f72f94555aa32b2c4374dde5ad9b960cc4ff32e2 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 12 Oct 2016 14:48:28 +0800 Subject: ceph: fix readdir vs fragmentation race following sequence of events tigger the race - client readdir frag 0* -> got item 'A' - MDS merges frag 0* and frag 1* - client send readdir request (frag 1*, offset 2, readdir_start 'A') - MDS reply items (that are after item 'A') in frag * Link: http://tracker.ceph.com/issues/17286 Signed-off-by: Yan, Zheng --- fs/ceph/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index bca1b49..ef4d046 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1511,7 +1511,8 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); } - if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { + if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2 && + !(rinfo->hash_order && req->r_path2)) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); -- cgit v1.1 From 31ca58781019de191c7f520f0626ca76a88c1f6e Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 13 Oct 2016 17:15:37 +0200 Subject: ceph: fix uninitialized dentry pointer in ceph_real_mount() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fs/ceph/super.c: In function ‘ceph_real_mount’: fs/ceph/super.c:818: warning: ‘root’ may be used uninitialized in this function If s_root is already valid, dentry pointer root is never initialized, and returned by ceph_real_mount(). This will cause a crash later when the caller dereferences the pointer. Fixes: ce2728aaa82bbeba ("ceph: avoid accessing / when mounting a subpath") Signed-off-by: Geert Uytterhoeven Signed-off-by: Yan, Zheng --- fs/ceph/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a29ffce..b382e59 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -845,6 +845,8 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) err = ceph_fs_debugfs_init(fsc); if (err < 0) goto fail; + } else { + root = dget(fsc->sb->s_root); } fsc->mount_state = CEPH_MOUNT_MOUNTED; -- cgit v1.1 From 5f43086bb9224987010460dcf3dee68fbd4f574d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 8 Oct 2016 10:12:28 +0200 Subject: locking, fs/locks: Add missing file_sem locks I overlooked a few code-paths that can lead to locks_delete_global_locks(). Reported-by: Dmitry Vyukov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jeff Layton Cc: Al Viro Cc: Bruce Fields Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-fsdevel@vger.kernel.org Cc: syzkaller Link: http://lkml.kernel.org/r/20161008081228.GF3142@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- fs/locks.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index ce93b41..22c5b4a 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1609,6 +1609,7 @@ int fcntl_getlease(struct file *filp) ctx = smp_load_acquire(&inode->i_flctx); if (ctx && !list_empty_careful(&ctx->flc_lease)) { + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); list_for_each_entry(fl, &ctx->flc_lease, fl_list) { @@ -1618,6 +1619,8 @@ int fcntl_getlease(struct file *filp) break; } spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); + locks_dispose_list(&dispose); } return type; @@ -2529,11 +2532,14 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx) if (list_empty(&ctx->flc_lease)) return; + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) if (filp == fl->fl_file) lease_modify(fl, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); + locks_dispose_list(&dispose); } -- cgit v1.1 From 5130ccea7cf4646a24c005be1309b7f86f1e91c9 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 17 Oct 2016 15:11:29 +0000 Subject: ceph: fix non static symbol warning Fixes the following sparse warning: fs/ceph/xattr.c:19:28: warning: symbol 'ceph_other_xattr_handler' was not declared. Should it be static? Signed-off-by: Wei Yongjun Signed-off-by: Ilya Dryomov --- fs/ceph/xattr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 40b7032..febc28f 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -16,7 +16,7 @@ static int __remove_xattr(struct ceph_inode_info *ci, struct ceph_inode_xattr *xattr); -const struct xattr_handler ceph_other_xattr_handler; +static const struct xattr_handler ceph_other_xattr_handler; /* * List of handlers for synthetic system.* attributes. Other @@ -1086,7 +1086,7 @@ static int ceph_set_xattr_handler(const struct xattr_handler *handler, return __ceph_setxattr(inode, name, value, size, flags); } -const struct xattr_handler ceph_other_xattr_handler = { +static const struct xattr_handler ceph_other_xattr_handler = { .prefix = "", /* match any name => handlers called with full name */ .get = ceph_get_xattr_handler, .set = ceph_set_xattr_handler, -- cgit v1.1 From 9beae1ea89305a9667ceaab6d0bf46a045ad71e7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:17 +0100 Subject: mm: replace get_user_pages_remote() write/force parameters with gup_flags This removes the 'write' and 'force' from get_user_pages_remote() and replaces them with 'gup_flags' to make the use of FOLL_FORCE explicit in callers as use of this flag can result in surprising behaviour (and hence bugs) within the mm subsystem. Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Reviewed-by: Jan Kara Signed-off-by: Linus Torvalds --- fs/exec.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 6fcfb3f..4e497b9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -191,6 +191,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, { struct page *page; int ret; + unsigned int gup_flags = FOLL_FORCE; #ifdef CONFIG_STACK_GROWSUP if (write) { @@ -199,12 +200,16 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, return NULL; } #endif + + if (write) + gup_flags |= FOLL_WRITE; + /* * We are doing an exec(). 'current' is the process * doing the exec and bprm->mm is the new process's mm. */ - ret = get_user_pages_remote(current, bprm->mm, pos, 1, write, - 1, &page, NULL); + ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, + &page, NULL); if (ret <= 0) return NULL; -- cgit v1.1 From 6347e8d5bcce33fc36e651901efefbe2c93a43ef Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:19 +0100 Subject: mm: replace access_remote_vm() write parameter with gup_flags This removes the 'write' argument from access_remote_vm() and replaces it with 'gup_flags' as use of this function previously silently implied FOLL_FORCE, whereas after this patch callers explicitly pass this flag. We make this explicit as use of FOLL_FORCE can result in surprising behaviour (and hence bugs) within the mm subsystem. Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Signed-off-by: Linus Torvalds --- fs/proc/base.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index c2964d8..8e65446 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -252,7 +252,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, * Inherently racy -- command line shares address space * with code and data. */ - rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0); + rv = access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_FORCE); if (rv <= 0) goto out_free_page; @@ -270,7 +270,8 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, int nr_read; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, + FOLL_FORCE); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -305,7 +306,8 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, bool final; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, + FOLL_FORCE); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -354,7 +356,8 @@ skip_argv: bool final; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, + FOLL_FORCE); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -832,6 +835,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, unsigned long addr = *ppos; ssize_t copied; char *page; + unsigned int flags = FOLL_FORCE; if (!mm) return 0; @@ -844,6 +848,9 @@ static ssize_t mem_rw(struct file *file, char __user *buf, if (!atomic_inc_not_zero(&mm->mm_users)) goto free; + if (write) + flags |= FOLL_WRITE; + while (count > 0) { int this_len = min_t(int, count, PAGE_SIZE); @@ -852,7 +859,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, break; } - this_len = access_remote_vm(mm, addr, page, this_len, write); + this_len = access_remote_vm(mm, addr, page, this_len, flags); if (!this_len) { if (!copied) copied = -EIO; @@ -965,7 +972,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, this_len = min(max_len, this_len); retval = access_remote_vm(mm, (env_start + src), - page, this_len, 0); + page, this_len, FOLL_FORCE); if (retval <= 0) { ret = retval; -- cgit v1.1 From 83aa3e0f791d458a28f91d7a50f92926f971ef7c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 18 Oct 2016 17:21:30 +0200 Subject: nfs4: fix missing-braces warning A bugfix introduced a harmless warning for update_open_stateid: fs/nfs/nfs4proc.c:1548:2: error: missing braces around initializer [-Werror=missing-braces] Removing the zero in the initializer will do the right thing here and initialize the entire structure to zero. Fixes: 1393d9612ba0 ("NFSv4: Fix a race when updating an open_stateid") Signed-off-by: Arnd Bergmann Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ed7da82..45b38ee 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1545,7 +1545,7 @@ static int update_open_stateid(struct nfs4_state *state, struct nfs_client *clp = server->nfs_client; struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_delegation *deleg_cur; - nfs4_stateid freeme = {0}; + nfs4_stateid freeme = { }; int ret = 0; fmode &= (FMODE_READ|FMODE_WRITE); -- cgit v1.1 From 390975ac3978162ec9c6beca66f0fe83b0be33bb Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 18 Oct 2016 21:21:43 +0200 Subject: ubifs: Rename ubifs_rename2 Since ->rename2 is gone, rename ubifs_rename2() to ubifs_rename(). Suggested-by: Linus Torvalds Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index c8f60df..668ec3b 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1060,9 +1060,9 @@ static void unlock_4_inodes(struct inode *inode1, struct inode *inode2, mutex_unlock(&ubifs_inode(inode1)->ui_mutex); } -static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int do_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct ubifs_info *c = old_dir->i_sb->s_fs_info; struct inode *old_inode = d_inode(old_dentry); @@ -1323,7 +1323,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, return err; } -static int ubifs_rename2(struct inode *old_dir, struct dentry *old_dentry, +static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -1336,7 +1336,7 @@ static int ubifs_rename2(struct inode *old_dir, struct dentry *old_dentry, if (flags & RENAME_EXCHANGE) return ubifs_xrename(old_dir, old_dentry, new_dir, new_dentry); - return ubifs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); + return do_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, @@ -1387,7 +1387,7 @@ const struct inode_operations ubifs_dir_inode_operations = { .mkdir = ubifs_mkdir, .rmdir = ubifs_rmdir, .mknod = ubifs_mknod, - .rename = ubifs_rename2, + .rename = ubifs_rename, .setattr = ubifs_setattr, .getattr = ubifs_getattr, .listxattr = ubifs_listxattr, -- cgit v1.1 From 843741c5778398ea67055067f4cc65ae6c80ca0e Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Tue, 20 Sep 2016 10:08:30 +0200 Subject: ubifs: Fix xattr_names length in exit paths When the operation fails we also have to undo the changes we made to ->xattr_names. Otherwise listxattr() will report wrong lengths. Cc: stable@vger.kernel.org Signed-off-by: Richard Weinberger --- fs/ubifs/xattr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 6c2f4d4..d9f9615 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -172,6 +172,7 @@ out_cancel: host_ui->xattr_cnt -= 1; host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); host_ui->xattr_size -= CALC_XATTR_BYTES(size); + host_ui->xattr_names -= nm->len; mutex_unlock(&host_ui->ui_mutex); out_free: make_bad_inode(inode); @@ -478,6 +479,7 @@ out_cancel: host_ui->xattr_cnt += 1; host_ui->xattr_size += CALC_DENT_SIZE(nm->len); host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); + host_ui->xattr_names += nm->len; mutex_unlock(&host_ui->ui_mutex); ubifs_release_budget(c, &req); make_bad_inode(inode); -- cgit v1.1 From c83ed4c9dbb358b9e7707486e167e940d48bfeed Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 19 Oct 2016 12:43:07 +0200 Subject: ubifs: Abort readdir upon error If UBIFS is facing an error while walking a directory, it reports this error and ubifs_readdir() returns the error code. But the VFS readdir logic does not make the getdents system call fail in all cases. When the readdir cursor indicates that more entries are present, the system call will just return and the libc wrapper will try again since it also knows that more entries are present. This causes the libc wrapper to busy loop for ever when a directory is corrupted on UBIFS. A common approach do deal with corrupted directory entries is skipping them by setting the cursor to the next entry. On UBIFS this approach is not possible since we cannot compute the next directory entry cursor position without reading the current entry. So all we can do is setting the cursor to the "no more entries" position and make getdents exit. Cc: stable@vger.kernel.org Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 668ec3b..bd4a5e8 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -439,7 +439,7 @@ static unsigned int vfs_dent_type(uint8_t type) */ static int ubifs_readdir(struct file *file, struct dir_context *ctx) { - int err; + int err = 0; struct qstr nm; union ubifs_key key; struct ubifs_dent_node *dent; @@ -541,14 +541,12 @@ out: kfree(file->private_data); file->private_data = NULL; - if (err != -ENOENT) { + if (err != -ENOENT) ubifs_err(c, "cannot find next direntry, error %d", err); - return err; - } /* 2 is a special value indicating that there are no more direntries */ ctx->pos = 2; - return 0; + return err; } /* Free saved readdir() state when the directory is closed */ -- cgit v1.1 From 1d55a4bfd080ff4c6c96acfccfb7cdd2615ed6c2 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 20 Oct 2016 15:40:55 +1100 Subject: xfs: remove redundant assignment of ifp Remove redundant ifp = ifp statement, it does nothing. Found with static analysis by CoverityScan. Signed-off-by: Colin Ian King Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c27344c..0283b7e 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5204,7 +5204,7 @@ xfs_bunmapi_cow( ep = xfs_bmap_search_extents(ip, del->br_startoff, XFS_COW_FORK, &eof, &eidx, &got, &new); - ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); ifp = ifp; + ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); ASSERT((eidx >= 0) && (eidx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); ASSERT(del->br_blockcount > 0); -- cgit v1.1 From 1be7f9be0efa4e90547f50b8188f4e70710a1173 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 20 Oct 2016 15:41:48 +1100 Subject: xfs: Fix uninitialized variable in xfs_reflink_reserve_cow_range() with gcc 4.1.2: fs/xfs/xfs_reflink.c: In function xfs_reflink_reserve_cow_range: fs/xfs/xfs_reflink.c:327: warning: error may be used uninitialized in this function Indeed, if "count" is zero, the function will return an uninitialized error value. While "count" is unlikely to be zero, this function is called through the public iomap API. Hence fix this by preinitializing error to zero. Fixes: 2a06705cd5954030 ("xfs: create delalloc extents in CoW fork") Signed-off-by: Geert Uytterhoeven Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_reflink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 5965e94..d48a7cc 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -324,7 +324,7 @@ xfs_reflink_reserve_cow_range( struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb, end_fsb; bool skipped = false; - int error; + int error = 0; trace_xfs_reflink_reserve_cow_range(ip, offset, count); -- cgit v1.1 From f1b8243c55ca6fd2a3898e2f586b8cfcfff684bb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 20 Oct 2016 15:42:30 +1100 Subject: xfs: add some 'static' annotations sparse reported that several variables and a function were not forward-declared anywhere and therefore should be 'static'. Found with sparse by running 'make C=2 CF=-D__CHECK_ENDIAN__ fs/xfs/' Signed-off-by: Eric Biggers Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_btree.c | 2 +- fs/xfs/xfs_sysfs.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 5c8e6f2..0e80993 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4826,7 +4826,7 @@ xfs_btree_calc_size( return rval; } -int +static int xfs_btree_count_blocks_helper( struct xfs_btree_cur *cur, int level, diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 5f8d55d..276d302 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -512,13 +512,13 @@ static struct attribute *xfs_error_attrs[] = { }; -struct kobj_type xfs_error_cfg_ktype = { +static struct kobj_type xfs_error_cfg_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_attrs = xfs_error_attrs, }; -struct kobj_type xfs_error_ktype = { +static struct kobj_type xfs_error_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, }; -- cgit v1.1 From 0ee7a3f6b5b2f22bb69bfc6c60d0ea0777003098 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:44:14 +1100 Subject: xfs: don't take the IOLOCK exclusive for direct I/O page invalidation XFS historically took the iolock exclusive when invalidating pages before direct I/O operations to protect against writeback starvations. But this writeback starvation issues has been fixed a long time ago in the core writeback code, and all other file systems manage to do without the exclusive lock. Convert XFS over to avoid the exclusive lock in this case, and also move to range invalidations like done by the other file systems. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Carlos Maiolino Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 98 +++++++++++++++++-------------------------------------- 1 file changed, 30 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index a314fc7..0dc9971 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -249,6 +249,7 @@ xfs_file_dio_aio_read( struct xfs_inode *ip = XFS_I(inode); loff_t isize = i_size_read(inode); size_t count = iov_iter_count(to); + loff_t end = iocb->ki_pos + count - 1; struct iov_iter data; struct xfs_buftarg *target; ssize_t ret = 0; @@ -272,49 +273,21 @@ xfs_file_dio_aio_read( file_accessed(iocb->ki_filp); - /* - * Locking is a bit tricky here. If we take an exclusive lock for direct - * IO, we effectively serialise all new concurrent read IO to this file - * and block it behind IO that is currently in progress because IO in - * progress holds the IO lock shared. We only need to hold the lock - * exclusive to blow away the page cache, so only take lock exclusively - * if the page cache needs invalidation. This allows the normal direct - * IO case of no page cache pages to proceeed concurrently without - * serialisation. - */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); if (mapping->nrpages) { - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); - xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); + if (ret) + goto out_unlock; /* - * The generic dio code only flushes the range of the particular - * I/O. Because we take an exclusive lock here, this whole - * sequence is considerably more expensive for us. This has a - * noticeable performance impact for any file with cached pages, - * even when outside of the range of the particular I/O. - * - * Hence, amortize the cost of the lock against a full file - * flush and reduce the chances of repeated iolock cycles going - * forward. + * Invalidate whole pages. This can return an error if we fail + * to invalidate a page, but this should never happen on XFS. + * Warn if it does fail. */ - if (mapping->nrpages) { - ret = filemap_write_and_wait(mapping); - if (ret) { - xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); - return ret; - } - - /* - * Invalidate whole pages. This can return an error if - * we fail to invalidate a page, but this should never - * happen on XFS. Warn if it does fail. - */ - ret = invalidate_inode_pages2(mapping); - WARN_ON_ONCE(ret); - ret = 0; - } - xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + ret = invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + ret = 0; } data = *to; @@ -324,8 +297,9 @@ xfs_file_dio_aio_read( iocb->ki_pos += ret; iov_iter_advance(to, ret); } - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); +out_unlock: + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -570,61 +544,49 @@ xfs_file_dio_aio_write( if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; - /* "unaligned" here means not aligned to a filesystem block */ - if ((iocb->ki_pos & mp->m_blockmask) || - ((iocb->ki_pos + count) & mp->m_blockmask)) - unaligned_io = 1; - /* - * We don't need to take an exclusive lock unless there page cache needs - * to be invalidated or unaligned IO is being executed. We don't need to - * consider the EOF extension case here because - * xfs_file_aio_write_checks() will relock the inode as necessary for - * EOF zeroing cases and fill out the new inode size as appropriate. + * Don't take the exclusive iolock here unless the I/O is unaligned to + * the file system block size. We don't need to consider the EOF + * extension case here because xfs_file_aio_write_checks() will relock + * the inode as necessary for EOF zeroing cases and fill out the new + * inode size as appropriate. */ - if (unaligned_io || mapping->nrpages) + if ((iocb->ki_pos & mp->m_blockmask) || + ((iocb->ki_pos + count) & mp->m_blockmask)) { + unaligned_io = 1; iolock = XFS_IOLOCK_EXCL; - else + } else { iolock = XFS_IOLOCK_SHARED; - xfs_rw_ilock(ip, iolock); - - /* - * Recheck if there are cached pages that need invalidate after we got - * the iolock to protect against other threads adding new pages while - * we were waiting for the iolock. - */ - if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, iolock); - iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, iolock); } + xfs_rw_ilock(ip, iolock); + ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; count = iov_iter_count(from); end = iocb->ki_pos + count - 1; - /* - * See xfs_file_dio_aio_read() for why we do a full-file flush here. - */ if (mapping->nrpages) { - ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); if (ret) goto out; + /* * Invalidate whole pages. This can return an error if we fail * to invalidate a page, but this should never happen on XFS. * Warn if it does fail. */ - ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); + ret = invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); WARN_ON_ONCE(ret); ret = 0; } /* * If we are doing unaligned IO, wait for all other IO to drain, - * otherwise demote the lock if we had to flush cached pages + * otherwise demote the lock if we had to take the exclusive lock + * for other reasons in xfs_file_aio_write_checks. */ if (unaligned_io) inode_dio_wait(inode); -- cgit v1.1 From fe23759eaf2f6540de20c1623f066aad967ff9c9 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 20 Oct 2016 15:44:53 +1100 Subject: xfs: remove pointless error goto in xfs_bmap_remap_alloc The commit: f65306ea xfs: map an inode's offset to an exact physical block added a pointless error0: target; remove it. Addresses-Coverity-Id: 1373865 Signed-off-by: Eric Sandeen Reviewed-by: Bill O'Donnell Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 0283b7e..80bdb11 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3974,9 +3974,6 @@ xfs_bmap_remap_alloc( * allocating, so skip that check by pretending to be freeing. */ error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); - if (error) - goto error0; -error0: xfs_perag_put(args.pag); if (error) trace_xfs_bmap_remap_alloc_error(ap->ip, error, _RET_IP_); -- cgit v1.1 From d099245297e28fc9f8493edd9d5a1f0967a72511 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 20 Oct 2016 15:45:40 +1100 Subject: xfs: unset MS_ACTIVE if mount fails As part of the inode block map intent log item recovery process, we had to set the IRECOVERY flag to prevent an unlinked inode from being truncated during the first iput call. This required us to set MS_ACTIVE so that iput puts the inode on the lru instead of immediately evicting the inode. Unfortunately, if the mount fails later on, the inodes that have been loaded (root dir and realtime) actually need to be evicted since we're aborting the mount. If we don't clear MS_ACTIVE in the failure step, those inodes are not evicted and therefore leak. The leak was found by running xfs/130 and rmmoding xfs immediately after the test. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_mount.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index fc78739..b341f10 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1009,6 +1009,7 @@ xfs_mountfs( out_quota: xfs_qm_unmount_quotas(mp); out_rtunmount: + mp->m_super->s_flags &= ~MS_ACTIVE; xfs_rtunmount_inodes(mp); out_rele_rip: IRELE(rip); -- cgit v1.1 From 58d789678546d46d7bbd809dd7dab417c0f23655 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 20 Oct 2016 15:46:18 +1100 Subject: libxfs: clean up _calc_dquots_per_chunk The function xfs_calc_dquots_per_chunk takes a parameter in units of basic blocks. The kernel seems to get the units wrong, but userspace got 'fixed' by commenting out the unnecessary conversion. Fix both. cc: Signed-off-by: Darrick J. Wong Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_dquot_buf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 3cc3cf7..ac9a003 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -191,8 +191,7 @@ xfs_dquot_buf_verify_crc( if (mp->m_quotainfo) ndquots = mp->m_quotainfo->qi_dqperchunk; else - ndquots = xfs_calc_dquots_per_chunk( - XFS_BB_TO_FSB(mp, bp->b_length)); + ndquots = xfs_calc_dquots_per_chunk(bp->b_length); for (i = 0; i < ndquots; i++, d++) { if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), -- cgit v1.1 From 8cdcc8102c0cfad20513ed1bfb96e0e9963928a8 Mon Sep 17 00:00:00 2001 From: Roger Willcocks Date: Thu, 20 Oct 2016 15:48:38 +1100 Subject: libxfs: v3 inodes are only valid on crc-enabled filesystems xfs_repair was not detecting that version 3 inodes are invalid for for non-CRC filesystems. The result is specific inode corruptions go undetected and hence aren't repaired if only the version number is out of range. The core of the problem is that the XFS_DINODE_GOOD_VERSION() macro doesn't know that valid inode versions are dependent on a superblock version number. Fix this in libxfs, and propagate the new function out into the rest of xfsprogs to fix the issue. [Darrick: port to kernel from xfsprogs] Reported-by: Leslie Rhorer Signed-off-by: Roger Willcocks Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_format.h | 1 - fs/xfs/libxfs/xfs_inode_buf.c | 13 ++++++++++++- fs/xfs/libxfs/xfs_inode_buf.h | 2 ++ 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index f6547fc..6b7579e 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -865,7 +865,6 @@ typedef struct xfs_timestamp { * padding field for v3 inodes. */ #define XFS_DINODE_MAGIC 0x494e /* 'IN' */ -#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3) typedef struct xfs_dinode { __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ __be16 di_mode; /* mode and type of file */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 8de9a3a..134424f 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -57,6 +57,17 @@ xfs_inobp_check( } #endif +bool +xfs_dinode_good_version( + struct xfs_mount *mp, + __u8 version) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) + return version == 3; + + return version == 1 || version == 2; +} + /* * If we are doing readahead on an inode buffer, we might be in log recovery * reading an inode allocation buffer that hasn't yet been replayed, and hence @@ -91,7 +102,7 @@ xfs_inode_buf_verify( dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && - XFS_DINODE_GOOD_VERSION(dip->di_version); + xfs_dinode_good_version(mp, dip->di_version); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, XFS_RANDOM_ITOBP_INOTOBP))) { diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 62d9d46..3cfe12a 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -74,6 +74,8 @@ void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, struct xfs_dinode *to); +bool xfs_dinode_good_version(struct xfs_mount *mp, __u8 version); + #if defined(DEBUG) void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); #else -- cgit v1.1 From 4fbc2c65255f77b315a4ee3ccac397d677a35737 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:48:54 +1100 Subject: xfs: remove the same fs check from xfs_file_share_range The VFS already does the check, and the placement of this duplicate is in the way of the following locking rework. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 0dc9971..194f8f3 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -965,9 +965,6 @@ xfs_file_share_range( IS_SWAPFILE(inode_out)) return -ETXTBSY; - /* Reflink only works within this filesystem. */ - if (inode_in->i_sb != inode_out->i_sb) - return -EXDEV; same_inode = (inode_in->i_ino == inode_out->i_ino); /* Don't reflink dirs, pipes, sockets... */ -- cgit v1.1 From a62e82b35b97e60e9e22a4e303900f342139822f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:49:03 +1100 Subject: xfs: fix the same_inode check in xfs_file_share_range The VFS i_ino is an unsigned long, while XFS inode numbers are 64-bit wide, so checking i_ino for equality could lead to rate false positives on 32-bit architectures. Just compare the inode pointers themselves to be safe. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 194f8f3..d5b835e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -965,7 +965,7 @@ xfs_file_share_range( IS_SWAPFILE(inode_out)) return -ETXTBSY; - same_inode = (inode_in->i_ino == inode_out->i_ino); + same_inode = (inode_in == inode_out); /* Don't reflink dirs, pipes, sockets... */ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) -- cgit v1.1 From 576177818e6f1e65f6109ed4a8fae8b60131c861 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:49:19 +1100 Subject: xfs: move inode locking from xfs_reflink_remap_range to xfs_file_share_range We need the iolock protection to stabilizie the IS_SWAPFILE and IS_IMMUTABLE values, as well as preventing new buffered writers re-dirtying the file data that we just wrote out. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 62 ++++++++++++++++++++++++++++++++++------------------ fs/xfs/xfs_reflink.c | 15 ------------- 2 files changed, 41 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index d5b835e..663761e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -958,38 +958,54 @@ xfs_file_share_range( inode_out = file_inode(file_out); bs = inode_out->i_sb->s_blocksize; + /* Lock both files against IO */ + same_inode = (inode_in == inode_out); + if (same_inode) { + xfs_ilock(XFS_I(inode_in), XFS_IOLOCK_EXCL); + xfs_ilock(XFS_I(inode_in), XFS_MMAPLOCK_EXCL); + } else { + xfs_lock_two_inodes(XFS_I(inode_in), XFS_I(inode_out), + XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(XFS_I(inode_in), XFS_I(inode_out), + XFS_MMAPLOCK_EXCL); + } + /* Don't touch certain kinds of inodes */ + ret = -EPERM; if (IS_IMMUTABLE(inode_out)) - return -EPERM; - if (IS_SWAPFILE(inode_in) || - IS_SWAPFILE(inode_out)) - return -ETXTBSY; - - same_inode = (inode_in == inode_out); + goto out_unlock; + ret = -ETXTBSY; + if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) + goto out_unlock; /* Don't reflink dirs, pipes, sockets... */ + ret = -EISDIR; if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - return -EISDIR; + goto out_unlock; + ret = -EINVAL; if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode)) - return -EINVAL; + goto out_unlock; if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - return -EINVAL; + goto out_unlock; /* Don't share DAX file data for now. */ if (IS_DAX(inode_in) || IS_DAX(inode_out)) - return -EINVAL; + goto out_unlock; /* Are we going all the way to the end? */ isize = i_size_read(inode_in); - if (isize == 0) - return 0; + if (isize == 0) { + ret = 0; + goto out_unlock; + } + if (len == 0) len = isize - pos_in; /* Ensure offsets don't wrap and the input is inside i_size */ if (pos_in + len < pos_in || pos_out + len < pos_out || pos_in + len > isize) - return -EINVAL; + goto out_unlock; /* Don't allow dedupe past EOF in the dest file */ if (is_dedupe) { @@ -997,7 +1013,7 @@ xfs_file_share_range( disize = i_size_read(inode_out); if (pos_out >= disize || pos_out + len > disize) - return -EINVAL; + goto out_unlock; } /* If we're linking to EOF, continue to the block boundary. */ @@ -1009,28 +1025,32 @@ xfs_file_share_range( /* Only reflink if we're aligned to block boundaries */ if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) - return -EINVAL; + goto out_unlock; /* Don't allow overlapped reflink within the same file */ if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen) - return -EINVAL; + goto out_unlock; /* Wait for the completion of any pending IOs on srcfile */ ret = xfs_file_wait_for_io(inode_in, pos_in, len); if (ret) - goto out; + goto out_unlock; ret = xfs_file_wait_for_io(inode_out, pos_out, len); if (ret) - goto out; + goto out_unlock; if (is_dedupe) flags |= XFS_REFLINK_DEDUPE; ret = xfs_reflink_remap_range(XFS_I(inode_in), pos_in, XFS_I(inode_out), pos_out, len, flags); - if (ret < 0) - goto out; -out: +out_unlock: + xfs_iunlock(XFS_I(inode_in), XFS_MMAPLOCK_EXCL); + xfs_iunlock(XFS_I(inode_in), XFS_IOLOCK_EXCL); + if (!same_inode) { + xfs_iunlock(XFS_I(inode_out), XFS_MMAPLOCK_EXCL); + xfs_iunlock(XFS_I(inode_out), XFS_IOLOCK_EXCL); + } return ret; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index d48a7cc..3b1c1a6 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1341,15 +1341,6 @@ xfs_reflink_remap_range( trace_xfs_reflink_remap_range(src, srcoff, len, dest, destoff); - /* Lock both files against IO */ - if (src->i_ino == dest->i_ino) { - xfs_ilock(src, XFS_IOLOCK_EXCL); - xfs_ilock(src, XFS_MMAPLOCK_EXCL); - } else { - xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL); - xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); - } - /* * Check that the extents are the same. */ @@ -1401,12 +1392,6 @@ xfs_reflink_remap_range( goto out_error; out_error: - xfs_iunlock(src, XFS_MMAPLOCK_EXCL); - xfs_iunlock(src, XFS_IOLOCK_EXCL); - if (src->i_ino != dest->i_ino) { - xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); - xfs_iunlock(dest, XFS_IOLOCK_EXCL); - } if (error) trace_xfs_reflink_remap_range_error(dest, error, _RET_IP_); return error; -- cgit v1.1 From ec40759902556f21f37641ad9f19d02c4dd4b555 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:49:55 +1100 Subject: xfs: remove xfs_file_wait_for_io filemap_write_and_wait_range operates on full pages, so there is no need for the rounding operations. Additionally this allows us to micro-optimize by skipping the second inode_dio_wait for a intra-file clone. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 39 ++++++++++----------------------------- 1 file changed, 10 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 663761e..9372975 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -909,32 +909,6 @@ out_unlock: return error; } -/* - * Flush all file writes out to disk. - */ -static int -xfs_file_wait_for_io( - struct inode *inode, - loff_t offset, - size_t len) -{ - loff_t rounding; - loff_t ioffset; - loff_t iendoffset; - loff_t bs; - int ret; - - bs = inode->i_sb->s_blocksize; - inode_dio_wait(inode); - - rounding = max_t(xfs_off_t, bs, PAGE_SIZE); - ioffset = round_down(offset, rounding); - iendoffset = round_up(offset + len, rounding) - 1; - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - iendoffset); - return ret; -} - /* Hook up to the VFS reflink function */ STATIC int xfs_file_share_range( @@ -1031,11 +1005,18 @@ xfs_file_share_range( if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen) goto out_unlock; - /* Wait for the completion of any pending IOs on srcfile */ - ret = xfs_file_wait_for_io(inode_in, pos_in, len); + /* Wait for the completion of any pending IOs on both files */ + inode_dio_wait(inode_in); + if (!same_inode) + inode_dio_wait(inode_out); + + ret = filemap_write_and_wait_range(inode_in->i_mapping, + pos_in, pos_in + len - 1); if (ret) goto out_unlock; - ret = xfs_file_wait_for_io(inode_out, pos_out, len); + + ret = filemap_write_and_wait_range(inode_out->i_mapping, + pos_out, pos_out + len - 1); if (ret) goto out_unlock; -- cgit v1.1 From 5faaf4fa0a20d38edc4df57baf24ea35b7e91178 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:50:07 +1100 Subject: xfs: merge xfs_reflink_remap_range and xfs_file_share_range There is no clear division of responsibility between those functions, so just merge them into one to keep the code simple. Also move xfs_file_wait_for_io to xfs_reflink.c together with its only caller. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 132 +------------------------------------- fs/xfs/xfs_reflink.c | 178 +++++++++++++++++++++++++++++++++++++++------------ fs/xfs/xfs_reflink.h | 7 +- 3 files changed, 143 insertions(+), 174 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9372975..6e4f7f9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -909,132 +909,6 @@ out_unlock: return error; } -/* Hook up to the VFS reflink function */ -STATIC int -xfs_file_share_range( - struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len, - bool is_dedupe) -{ - struct inode *inode_in; - struct inode *inode_out; - ssize_t ret; - loff_t bs; - loff_t isize; - int same_inode; - loff_t blen; - unsigned int flags = 0; - - inode_in = file_inode(file_in); - inode_out = file_inode(file_out); - bs = inode_out->i_sb->s_blocksize; - - /* Lock both files against IO */ - same_inode = (inode_in == inode_out); - if (same_inode) { - xfs_ilock(XFS_I(inode_in), XFS_IOLOCK_EXCL); - xfs_ilock(XFS_I(inode_in), XFS_MMAPLOCK_EXCL); - } else { - xfs_lock_two_inodes(XFS_I(inode_in), XFS_I(inode_out), - XFS_IOLOCK_EXCL); - xfs_lock_two_inodes(XFS_I(inode_in), XFS_I(inode_out), - XFS_MMAPLOCK_EXCL); - } - - /* Don't touch certain kinds of inodes */ - ret = -EPERM; - if (IS_IMMUTABLE(inode_out)) - goto out_unlock; - ret = -ETXTBSY; - if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) - goto out_unlock; - - /* Don't reflink dirs, pipes, sockets... */ - ret = -EISDIR; - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - goto out_unlock; - ret = -EINVAL; - if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode)) - goto out_unlock; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - goto out_unlock; - - /* Don't share DAX file data for now. */ - if (IS_DAX(inode_in) || IS_DAX(inode_out)) - goto out_unlock; - - /* Are we going all the way to the end? */ - isize = i_size_read(inode_in); - if (isize == 0) { - ret = 0; - goto out_unlock; - } - - if (len == 0) - len = isize - pos_in; - - /* Ensure offsets don't wrap and the input is inside i_size */ - if (pos_in + len < pos_in || pos_out + len < pos_out || - pos_in + len > isize) - goto out_unlock; - - /* Don't allow dedupe past EOF in the dest file */ - if (is_dedupe) { - loff_t disize; - - disize = i_size_read(inode_out); - if (pos_out >= disize || pos_out + len > disize) - goto out_unlock; - } - - /* If we're linking to EOF, continue to the block boundary. */ - if (pos_in + len == isize) - blen = ALIGN(isize, bs) - pos_in; - else - blen = len; - - /* Only reflink if we're aligned to block boundaries */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || - !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) - goto out_unlock; - - /* Don't allow overlapped reflink within the same file */ - if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen) - goto out_unlock; - - /* Wait for the completion of any pending IOs on both files */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - ret = filemap_write_and_wait_range(inode_in->i_mapping, - pos_in, pos_in + len - 1); - if (ret) - goto out_unlock; - - ret = filemap_write_and_wait_range(inode_out->i_mapping, - pos_out, pos_out + len - 1); - if (ret) - goto out_unlock; - - if (is_dedupe) - flags |= XFS_REFLINK_DEDUPE; - ret = xfs_reflink_remap_range(XFS_I(inode_in), pos_in, XFS_I(inode_out), - pos_out, len, flags); - -out_unlock: - xfs_iunlock(XFS_I(inode_in), XFS_MMAPLOCK_EXCL); - xfs_iunlock(XFS_I(inode_in), XFS_IOLOCK_EXCL); - if (!same_inode) { - xfs_iunlock(XFS_I(inode_out), XFS_MMAPLOCK_EXCL); - xfs_iunlock(XFS_I(inode_out), XFS_IOLOCK_EXCL); - } - return ret; -} - STATIC ssize_t xfs_file_copy_range( struct file *file_in, @@ -1046,7 +920,7 @@ xfs_file_copy_range( { int error; - error = xfs_file_share_range(file_in, pos_in, file_out, pos_out, + error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, len, false); if (error) return error; @@ -1061,7 +935,7 @@ xfs_file_clone_range( loff_t pos_out, u64 len) { - return xfs_file_share_range(file_in, pos_in, file_out, pos_out, + return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, len, false); } @@ -1084,7 +958,7 @@ xfs_file_dedupe_range( if (len > XFS_MAX_DEDUPE_LEN) len = XFS_MAX_DEDUPE_LEN; - error = xfs_file_share_range(src_file, loff, dst_file, dst_loff, + error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff, len, true); if (error) return error; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 3b1c1a6..6592daa 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1312,19 +1312,26 @@ out_error: */ int xfs_reflink_remap_range( - struct xfs_inode *src, - xfs_off_t srcoff, - struct xfs_inode *dest, - xfs_off_t destoff, - xfs_off_t len, - unsigned int flags) + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len, + bool is_dedupe) { + struct inode *inode_in = file_inode(file_in); + struct xfs_inode *src = XFS_I(inode_in); + struct inode *inode_out = file_inode(file_out); + struct xfs_inode *dest = XFS_I(inode_out); struct xfs_mount *mp = src->i_mount; + loff_t bs = inode_out->i_sb->s_blocksize; + bool same_inode = (inode_in == inode_out); xfs_fileoff_t sfsbno, dfsbno; xfs_filblks_t fsblen; - int error; xfs_extlen_t cowextsize; - bool is_same; + loff_t isize; + ssize_t ret; + loff_t blen; if (!xfs_sb_version_hasreflink(&mp->m_sb)) return -EOPNOTSUPP; @@ -1332,48 +1339,135 @@ xfs_reflink_remap_range( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; + /* Lock both files against IO */ + if (same_inode) { + xfs_ilock(src, XFS_IOLOCK_EXCL); + xfs_ilock(src, XFS_MMAPLOCK_EXCL); + } else { + xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); + } + + /* Don't touch certain kinds of inodes */ + ret = -EPERM; + if (IS_IMMUTABLE(inode_out)) + goto out_unlock; + + ret = -ETXTBSY; + if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) + goto out_unlock; + + + /* Don't reflink dirs, pipes, sockets... */ + ret = -EISDIR; + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + goto out_unlock; + ret = -EINVAL; + if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode)) + goto out_unlock; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + goto out_unlock; + /* Don't reflink realtime inodes */ if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) - return -EINVAL; + goto out_unlock; + + /* Don't share DAX file data for now. */ + if (IS_DAX(inode_in) || IS_DAX(inode_out)) + goto out_unlock; + + /* Are we going all the way to the end? */ + isize = i_size_read(inode_in); + if (isize == 0) { + ret = 0; + goto out_unlock; + } + + if (len == 0) + len = isize - pos_in; + + /* Ensure offsets don't wrap and the input is inside i_size */ + if (pos_in + len < pos_in || pos_out + len < pos_out || + pos_in + len > isize) + goto out_unlock; - if (flags & ~XFS_REFLINK_ALL) - return -EINVAL; + /* Don't allow dedupe past EOF in the dest file */ + if (is_dedupe) { + loff_t disize; - trace_xfs_reflink_remap_range(src, srcoff, len, dest, destoff); + disize = i_size_read(inode_out); + if (pos_out >= disize || pos_out + len > disize) + goto out_unlock; + } + + /* If we're linking to EOF, continue to the block boundary. */ + if (pos_in + len == isize) + blen = ALIGN(isize, bs) - pos_in; + else + blen = len; + + /* Only reflink if we're aligned to block boundaries */ + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || + !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) + goto out_unlock; + + /* Don't allow overlapped reflink within the same file */ + if (same_inode) { + if (pos_out + blen > pos_in && pos_out < pos_in + blen) + goto out_unlock; + } + + /* Wait for the completion of any pending IOs on both files */ + inode_dio_wait(inode_in); + if (!same_inode) + inode_dio_wait(inode_out); + + ret = filemap_write_and_wait_range(inode_in->i_mapping, + pos_in, pos_in + len - 1); + if (ret) + goto out_unlock; + + ret = filemap_write_and_wait_range(inode_out->i_mapping, + pos_out, pos_out + len - 1); + if (ret) + goto out_unlock; + + trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); /* * Check that the extents are the same. */ - if (flags & XFS_REFLINK_DEDUPE) { - is_same = false; - error = xfs_compare_extents(VFS_I(src), srcoff, VFS_I(dest), - destoff, len, &is_same); - if (error) - goto out_error; + if (is_dedupe) { + bool is_same = false; + + ret = xfs_compare_extents(inode_in, pos_in, inode_out, pos_out, + len, &is_same); + if (ret) + goto out_unlock; if (!is_same) { - error = -EBADE; - goto out_error; + ret = -EBADE; + goto out_unlock; } } - error = xfs_reflink_set_inode_flag(src, dest); - if (error) - goto out_error; + ret = xfs_reflink_set_inode_flag(src, dest); + if (ret) + goto out_unlock; /* * Invalidate the page cache so that we can clear any CoW mappings * in the destination file. */ - truncate_inode_pages_range(&VFS_I(dest)->i_data, destoff, - PAGE_ALIGN(destoff + len) - 1); + truncate_inode_pages_range(&inode_out->i_data, pos_out, + PAGE_ALIGN(pos_out + len) - 1); - dfsbno = XFS_B_TO_FSBT(mp, destoff); - sfsbno = XFS_B_TO_FSBT(mp, srcoff); + dfsbno = XFS_B_TO_FSBT(mp, pos_out); + sfsbno = XFS_B_TO_FSBT(mp, pos_in); fsblen = XFS_B_TO_FSB(mp, len); - error = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen, - destoff + len); - if (error) - goto out_error; + ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen, + pos_out + len); + if (ret) + goto out_unlock; /* * Carry the cowextsize hint from src to dest if we're sharing the @@ -1381,20 +1475,24 @@ xfs_reflink_remap_range( * has a cowextsize hint, and the destination file does not. */ cowextsize = 0; - if (srcoff == 0 && len == i_size_read(VFS_I(src)) && + if (pos_in == 0 && len == i_size_read(inode_in) && (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) && - destoff == 0 && len >= i_size_read(VFS_I(dest)) && + pos_out == 0 && len >= i_size_read(inode_out) && !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) cowextsize = src->i_d.di_cowextsize; - error = xfs_reflink_update_dest(dest, destoff + len, cowextsize); - if (error) - goto out_error; + ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize); -out_error: - if (error) - trace_xfs_reflink_remap_range_error(dest, error, _RET_IP_); - return error; +out_unlock: + xfs_iunlock(src, XFS_MMAPLOCK_EXCL); + xfs_iunlock(src, XFS_IOLOCK_EXCL); + if (src->i_ino != dest->i_ino) { + xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); + xfs_iunlock(dest, XFS_IOLOCK_EXCL); + } + if (ret) + trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); + return ret; } /* diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 5dc3c8a..7ddd9f6 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -43,11 +43,8 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern int xfs_reflink_recover_cow(struct xfs_mount *mp); -#define XFS_REFLINK_DEDUPE 1 /* only reflink if contents match */ -#define XFS_REFLINK_ALL (XFS_REFLINK_DEDUPE) -extern int xfs_reflink_remap_range(struct xfs_inode *src, xfs_off_t srcoff, - struct xfs_inode *dest, xfs_off_t destoff, xfs_off_t len, - unsigned int flags); +extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe); extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip, struct xfs_trans **tpp); extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset, -- cgit v1.1 From d33fd776f992332be110f6ceca6dad60cb5d513f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:51:28 +1100 Subject: iomap: add IOMAP_REPORT This allows the file system to tell a FIEMAP from a read operation, and thus avoids the need to report flags that aren't actually used in the read path. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/iomap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/iomap.c b/fs/iomap.c index 013d1d3..a922040 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -561,7 +561,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, } while (len > 0) { - ret = iomap_apply(inode, start, len, 0, ops, &ctx, + ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx, iomap_fiemap_actor); /* inode with no (attribute) mapping will give ENOENT */ if (ret == -ENOENT) -- cgit v1.1 From 0a0af28cad9a43d90f13c2047bd8ee3d4cffb7f3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 20 Oct 2016 15:51:50 +1100 Subject: xfs: add xfs_trim_extent This helpers allows to trim an extent to a subset of it's original range while making sure the block numbers in it remain valid, In the future xfs_trim_extent and xfs_bmapi_trim_map should probably be merged in some form. Signed-off-by: Darrick J. Wong [hch: split from a previous patch from Darrick, moved around and added support for "raw" delayed extents"] Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 33 +++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_bmap.h | 2 ++ 2 files changed, 35 insertions(+) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 80bdb11..381e765 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3996,6 +3996,39 @@ xfs_bmap_alloc( return xfs_bmap_btalloc(ap); } +/* Trim extent to fit a logical block range. */ +void +xfs_trim_extent( + struct xfs_bmbt_irec *irec, + xfs_fileoff_t bno, + xfs_filblks_t len) +{ + xfs_fileoff_t distance; + xfs_fileoff_t end = bno + len; + + if (irec->br_startoff + irec->br_blockcount <= bno || + irec->br_startoff >= end) { + irec->br_blockcount = 0; + return; + } + + if (irec->br_startoff < bno) { + distance = bno - irec->br_startoff; + if (isnullstartblock(irec->br_startblock)) + irec->br_startblock = DELAYSTARTBLOCK; + if (irec->br_startblock != DELAYSTARTBLOCK && + irec->br_startblock != HOLESTARTBLOCK) + irec->br_startblock += distance; + irec->br_startoff += distance; + irec->br_blockcount -= distance; + } + + if (end < irec->br_startoff + irec->br_blockcount) { + distance = irec->br_startoff + irec->br_blockcount - end; + irec->br_blockcount -= distance; + } +} + /* * Trim the returned map to the required bounds */ diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index f97db71..eb86af0 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -190,6 +190,8 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, #define XFS_BMAP_TRACE_EXLIST(ip,c,w) #endif +void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, + xfs_filblks_t len); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops, -- cgit v1.1 From 62c5ac89de7d5eecdbaa94ca3d554b0bd41578b1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:52:00 +1100 Subject: xfs: handle "raw" delayed extents xfs_reflink_trim_around_shared Delalloc extents in the extent list contain the number of reserved indirect blocks in their startblock value and don't use the magic DELAYSTARTBLOCK constant. Ensure that xfs_reflink_trim_around_shared handles them properly by checking for isnullstartblock(). Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_reflink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 6592daa..6c4c215 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -182,7 +182,8 @@ xfs_reflink_trim_around_shared( if (!xfs_is_reflink_inode(ip) || ISUNWRITTEN(irec) || irec->br_startblock == HOLESTARTBLOCK || - irec->br_startblock == DELAYSTARTBLOCK) { + irec->br_startblock == DELAYSTARTBLOCK || + isnullstartblock(irec->br_startblock)) { *shared = false; return 0; } -- cgit v1.1 From 5f9268ca53aca992106d74edde3e7cf6c1be60a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:53:32 +1100 Subject: xfs: don't bother looking at the refcount tree for reads There is no need to trim an extent into a shared or non-shared one, or report any flags for plain old reads. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_iomap.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index d907eb9..1dabf2e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -996,11 +996,14 @@ xfs_file_iomap_begin( return error; } - /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); - if (error) { - xfs_iunlock(ip, lockmode); - return error; + if (flags & (IOMAP_WRITE | IOMAP_ZERO | IOMAP_REPORT)) { + /* Trim the mapping to the nearest shared extent boundary. */ + error = xfs_reflink_trim_around_shared(ip, &imap, &shared, + &trimmed); + if (error) { + xfs_iunlock(ip, lockmode); + return error; + } } if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) { -- cgit v1.1 From 3ba020befef030aaabbd5eb82a09f6ddf02a9542 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:53:50 +1100 Subject: xfs: optimize writes to reflink files Instead of reserving space as the first thing in write_begin move it past reading the extent in the data fork. That way we only have to read from the data fork once and can reuse that information for trimming the extent to the shared/unshared boundary. Additionally this allows to easily limit the actual write size to said boundary, and avoid a roundtrip on the ilock. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_iomap.c | 56 ++++++++++++++------ fs/xfs/xfs_reflink.c | 142 +++++++++++++++++++++------------------------------ fs/xfs/xfs_reflink.h | 4 +- fs/xfs/xfs_trace.h | 3 +- 4 files changed, 100 insertions(+), 105 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 1dabf2e..436e109 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -566,6 +566,17 @@ xfs_file_iomap_begin_delay( xfs_bmap_search_extents(ip, offset_fsb, XFS_DATA_FORK, &eof, &idx, &got, &prev); if (!eof && got.br_startoff <= offset_fsb) { + if (xfs_is_reflink_inode(ip)) { + bool shared; + + end_fsb = min(XFS_B_TO_FSB(mp, offset + count), + maxbytes_fsb); + xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb); + error = xfs_reflink_reserve_cow(ip, &got, &shared); + if (error) + goto out_unlock; + } + trace_xfs_iomap_found(ip, offset, count, 0, &got); goto done; } @@ -961,19 +972,13 @@ xfs_file_iomap_begin( struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec imap; xfs_fileoff_t offset_fsb, end_fsb; - bool shared, trimmed; int nimaps = 1, error = 0; + bool shared = false, trimmed = false; unsigned lockmode; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { - error = xfs_reflink_reserve_cow_range(ip, offset, length); - if (error < 0) - return error; - } - if ((flags & IOMAP_WRITE) && !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { /* Reserve delalloc blocks for regular writeback. */ @@ -981,7 +986,16 @@ xfs_file_iomap_begin( iomap); } - lockmode = xfs_ilock_data_map_shared(ip); + /* + * COW writes will allocate delalloc space, so we need to make sure + * to take the lock exclusively here. + */ + if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { + lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, XFS_ILOCK_EXCL); + } else { + lockmode = xfs_ilock_data_map_shared(ip); + } ASSERT(offset <= mp->m_super->s_maxbytes); if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) @@ -991,19 +1005,24 @@ xfs_file_iomap_begin( error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, 0); - if (error) { - xfs_iunlock(ip, lockmode); - return error; - } + if (error) + goto out_unlock; - if (flags & (IOMAP_WRITE | IOMAP_ZERO | IOMAP_REPORT)) { + if (flags & IOMAP_REPORT) { /* Trim the mapping to the nearest shared extent boundary. */ error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); - if (error) { - xfs_iunlock(ip, lockmode); - return error; - } + if (error) + goto out_unlock; + } + + if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { + error = xfs_reflink_reserve_cow(ip, &imap, &shared); + if (error) + goto out_unlock; + + end_fsb = imap.br_startoff + imap.br_blockcount; + length = XFS_FSB_TO_B(mp, end_fsb) - offset; } if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) { @@ -1042,6 +1061,9 @@ xfs_file_iomap_begin( if (shared) iomap->flags |= IOMAP_F_SHARED; return 0; +out_unlock: + xfs_iunlock(ip, lockmode); + return error; } static int diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 6c4c215..9c477de3 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -228,50 +228,54 @@ xfs_reflink_trim_around_shared( } } -/* Create a CoW reservation for a range of blocks within a file. */ -static int -__xfs_reflink_reserve_cow( +/* + * Trim the passed in imap to the next shared/unshared extent boundary, and + * if imap->br_startoff points to a shared extent reserve space for it in the + * COW fork. In this case *shared is set to true, else to false. + * + * Note that imap will always contain the block numbers for the existing blocks + * in the data fork, as the upper layers need them for read-modify-write + * operations. + */ +int +xfs_reflink_reserve_cow( struct xfs_inode *ip, - xfs_fileoff_t *offset_fsb, - xfs_fileoff_t end_fsb, - bool *skipped) + struct xfs_bmbt_irec *imap, + bool *shared) { - struct xfs_bmbt_irec got, prev, imap; - xfs_fileoff_t orig_end_fsb; - int nimaps, eof = 0, error = 0; - bool shared = false, trimmed = false; + struct xfs_bmbt_irec got, prev; + xfs_fileoff_t end_fsb, orig_end_fsb; + int eof = 0, error = 0; + bool trimmed; xfs_extnum_t idx; xfs_extlen_t align; - /* Already reserved? Skip the refcount btree access. */ - xfs_bmap_search_extents(ip, *offset_fsb, XFS_COW_FORK, &eof, &idx, + /* + * Search the COW fork extent list first. This serves two purposes: + * first this implement the speculative preallocation using cowextisze, + * so that we also unshared block adjacent to shared blocks instead + * of just the shared blocks themselves. Second the lookup in the + * extent list is generally faster than going out to the shared extent + * tree. + */ + xfs_bmap_search_extents(ip, imap->br_startoff, XFS_COW_FORK, &eof, &idx, &got, &prev); - if (!eof && got.br_startoff <= *offset_fsb) { - end_fsb = orig_end_fsb = got.br_startoff + got.br_blockcount; - trace_xfs_reflink_cow_found(ip, &got); - goto done; - } + if (!eof && got.br_startoff <= imap->br_startoff) { + trace_xfs_reflink_cow_found(ip, imap); + xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); - /* Read extent from the source file. */ - nimaps = 1; - error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, - &imap, &nimaps, 0); - if (error) - goto out_unlock; - ASSERT(nimaps == 1); + *shared = true; + return 0; + } /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); + error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed); if (error) - goto out_unlock; - - end_fsb = orig_end_fsb = imap.br_startoff + imap.br_blockcount; + return error; /* Not shared? Just report the (potentially capped) extent. */ - if (!shared) { - *skipped = true; - goto done; - } + if (!*shared) + return 0; /* * Fork all the shared blocks from our write offset until the end of @@ -279,72 +283,38 @@ __xfs_reflink_reserve_cow( */ error = xfs_qm_dqattach_locked(ip, 0); if (error) - goto out_unlock; + return error; + + end_fsb = orig_end_fsb = imap->br_startoff + imap->br_blockcount; align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip)); if (align) end_fsb = roundup_64(end_fsb, align); retry: - error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, *offset_fsb, - end_fsb - *offset_fsb, &got, - &prev, &idx, eof); + error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, + end_fsb - imap->br_startoff, &got, &prev, &idx, eof); switch (error) { case 0: break; case -ENOSPC: case -EDQUOT: /* retry without any preallocation */ - trace_xfs_reflink_cow_enospc(ip, &imap); + trace_xfs_reflink_cow_enospc(ip, imap); if (end_fsb != orig_end_fsb) { end_fsb = orig_end_fsb; goto retry; } /*FALLTHRU*/ default: - goto out_unlock; + return error; } if (end_fsb != orig_end_fsb) xfs_inode_set_cowblocks_tag(ip); trace_xfs_reflink_cow_alloc(ip, &got); -done: - *offset_fsb = end_fsb; -out_unlock: - return error; -} - -/* Create a CoW reservation for part of a file. */ -int -xfs_reflink_reserve_cow_range( - struct xfs_inode *ip, - xfs_off_t offset, - xfs_off_t count) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t offset_fsb, end_fsb; - bool skipped = false; - int error = 0; - - trace_xfs_reflink_reserve_cow_range(ip, offset, count); - - offset_fsb = XFS_B_TO_FSBT(mp, offset); - end_fsb = XFS_B_TO_FSB(mp, offset + count); - - xfs_ilock(ip, XFS_ILOCK_EXCL); - while (offset_fsb < end_fsb) { - error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb, - &skipped); - if (error) { - trace_xfs_reflink_reserve_cow_range_error(ip, error, - _RET_IP_); - break; - } - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - return error; + return 0; } /* Allocate all CoW reservations covering a range of blocks in a file. */ @@ -359,9 +329,8 @@ __xfs_reflink_allocate_cow( struct xfs_defer_ops dfops; struct xfs_trans *tp; xfs_fsblock_t first_block; - xfs_fileoff_t next_fsb; int nimaps = 1, error; - bool skipped = false; + bool shared; xfs_defer_init(&dfops, &first_block); @@ -372,33 +341,38 @@ __xfs_reflink_allocate_cow( xfs_ilock(ip, XFS_ILOCK_EXCL); - next_fsb = *offset_fsb; - error = __xfs_reflink_reserve_cow(ip, &next_fsb, end_fsb, &skipped); + /* Read extent from the source file. */ + nimaps = 1; + error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, + &imap, &nimaps, 0); + if (error) + goto out_unlock; + ASSERT(nimaps == 1); + + error = xfs_reflink_reserve_cow(ip, &imap, &shared); if (error) goto out_trans_cancel; - if (skipped) { - *offset_fsb = next_fsb; + if (!shared) { + *offset_fsb = imap.br_startoff + imap.br_blockcount; goto out_trans_cancel; } xfs_trans_ijoin(tp, ip, 0); - error = xfs_bmapi_write(tp, ip, *offset_fsb, next_fsb - *offset_fsb, + error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount, XFS_BMAPI_COWFORK, &first_block, XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), &imap, &nimaps, &dfops); if (error) goto out_trans_cancel; - /* We might not have been able to map the whole delalloc extent */ - *offset_fsb = min(*offset_fsb + imap.br_blockcount, next_fsb); - error = xfs_defer_finish(&tp, &dfops, NULL); if (error) goto out_trans_cancel; error = xfs_trans_commit(tp); + *offset_fsb = imap.br_startoff + imap.br_blockcount; out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 7ddd9f6..fad1160 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -26,8 +26,8 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, xfs_agnumber_t agno, extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed); -extern int xfs_reflink_reserve_cow_range(struct xfs_inode *ip, - xfs_off_t offset, xfs_off_t count); +extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, bool *shared); extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ad188d3..72f9f6b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3346,7 +3346,7 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); -DEFINE_RW_EVENT(xfs_reflink_reserve_cow_range); +DEFINE_RW_EVENT(xfs_reflink_reserve_cow); DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range); DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write); @@ -3358,7 +3358,6 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_piece); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_reserve_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); -- cgit v1.1 From fa5c836ca8eb5bad6316ddfc066acbc4e2485356 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:54:14 +1100 Subject: xfs: refactor xfs_bunmapi_cow Split out two helpers for deleting delayed or real extents from the COW fork. This allows to call them directly from xfs_reflink_cow_end_io once that function is refactored to iterate the extent tree. It will also allow to reuse the delalloc deletion from xfs_bunmapi in the future. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 374 ++++++++++++++++++++++++++++------------------- fs/xfs/libxfs/xfs_bmap.h | 5 + fs/xfs/xfs_reflink.c | 5 - 3 files changed, 225 insertions(+), 159 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 381e765..d7ad511 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4859,6 +4859,219 @@ xfs_bmap_split_indlen( return stolen; } +int +xfs_bmap_del_extent_delay( + struct xfs_inode *ip, + int whichfork, + xfs_extnum_t *idx, + struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *del) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec new; + int64_t da_old, da_new, da_diff = 0; + xfs_fileoff_t del_endoff, got_endoff; + xfs_filblks_t got_indlen, new_indlen, stolen; + int error = 0, state = 0; + bool isrt; + + XFS_STATS_INC(mp, xs_del_exlist); + + isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + del_endoff = del->br_startoff + del->br_blockcount; + got_endoff = got->br_startoff + got->br_blockcount; + da_old = startblockval(got->br_startblock); + da_new = 0; + + ASSERT(*idx >= 0); + ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(del->br_blockcount > 0); + ASSERT(got->br_startoff <= del->br_startoff); + ASSERT(got_endoff >= del_endoff); + + if (isrt) { + int64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount); + + do_div(rtexts, mp->m_sb.sb_rextsize); + xfs_mod_frextents(mp, rtexts); + } + + /* + * Update the inode delalloc counter now and wait to update the + * sb counters as we might have to borrow some blocks for the + * indirect block accounting. + */ + xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del->br_blockcount), 0, + isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + ip->i_delayed_blks -= del->br_blockcount; + + if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; + + if (got->br_startoff == del->br_startoff) + state |= BMAP_LEFT_CONTIG; + if (got_endoff == del_endoff) + state |= BMAP_RIGHT_CONTIG; + + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + /* + * Matches the whole extent. Delete the entry. + */ + xfs_iext_remove(ip, *idx, 1, state); + --*idx; + break; + case BMAP_LEFT_CONTIG: + /* + * Deleting the first part of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + got->br_startoff = del_endoff; + got->br_blockcount -= del->br_blockcount; + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, + got->br_blockcount), da_old); + got->br_startblock = nullstartblock((int)da_new); + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + break; + case BMAP_RIGHT_CONTIG: + /* + * Deleting the last part of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + got->br_blockcount = got->br_blockcount - del->br_blockcount; + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, + got->br_blockcount), da_old); + got->br_startblock = nullstartblock((int)da_new); + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + break; + case 0: + /* + * Deleting the middle of the extent. + * + * Distribute the original indlen reservation across the two new + * extents. Steal blocks from the deleted extent if necessary. + * Stealing blocks simply fudges the fdblocks accounting below. + * Warn if either of the new indlen reservations is zero as this + * can lead to delalloc problems. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + + got->br_blockcount = del->br_startoff - got->br_startoff; + got_indlen = xfs_bmap_worst_indlen(ip, got->br_blockcount); + + new.br_blockcount = got_endoff - del_endoff; + new_indlen = xfs_bmap_worst_indlen(ip, new.br_blockcount); + + WARN_ON_ONCE(!got_indlen || !new_indlen); + stolen = xfs_bmap_split_indlen(da_old, &got_indlen, &new_indlen, + del->br_blockcount); + + got->br_startblock = nullstartblock((int)got_indlen); + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + trace_xfs_bmap_post_update(ip, *idx, 0, _THIS_IP_); + + new.br_startoff = del_endoff; + new.br_state = got->br_state; + new.br_startblock = nullstartblock((int)new_indlen); + + ++*idx; + xfs_iext_insert(ip, *idx, 1, &new, state); + + da_new = got_indlen + new_indlen - stolen; + del->br_blockcount -= stolen; + break; + } + + ASSERT(da_old >= da_new); + da_diff = da_old - da_new; + if (!isrt) + da_diff += del->br_blockcount; + if (da_diff) + xfs_mod_fdblocks(mp, da_diff, false); + return error; +} + +void +xfs_bmap_del_extent_cow( + struct xfs_inode *ip, + xfs_extnum_t *idx, + struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *del) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec new; + xfs_fileoff_t del_endoff, got_endoff; + int state = BMAP_COWFORK; + + XFS_STATS_INC(mp, xs_del_exlist); + + del_endoff = del->br_startoff + del->br_blockcount; + got_endoff = got->br_startoff + got->br_blockcount; + + ASSERT(*idx >= 0); + ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(del->br_blockcount > 0); + ASSERT(got->br_startoff <= del->br_startoff); + ASSERT(got_endoff >= del_endoff); + ASSERT(!isnullstartblock(got->br_startblock)); + + if (got->br_startoff == del->br_startoff) + state |= BMAP_LEFT_CONTIG; + if (got_endoff == del_endoff) + state |= BMAP_RIGHT_CONTIG; + + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + /* + * Matches the whole extent. Delete the entry. + */ + xfs_iext_remove(ip, *idx, 1, state); + --*idx; + break; + case BMAP_LEFT_CONTIG: + /* + * Deleting the first part of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + got->br_startoff = del_endoff; + got->br_blockcount -= del->br_blockcount; + got->br_startblock = del->br_startblock + del->br_blockcount; + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + break; + case BMAP_RIGHT_CONTIG: + /* + * Deleting the last part of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + got->br_blockcount -= del->br_blockcount; + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + break; + case 0: + /* + * Deleting the middle of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + got->br_blockcount = del->br_startoff - got->br_startoff; + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + new.br_startoff = del_endoff; + new.br_blockcount = got_endoff - del_endoff; + new.br_state = got->br_state; + new.br_startblock = del->br_startblock + del->br_blockcount; + + ++*idx; + xfs_iext_insert(ip, *idx, 1, &new, state); + break; + } +} + /* * Called by xfs_bmapi to update file extent records and the btree * after removing space (or undoing a delayed allocation). @@ -5207,167 +5420,20 @@ xfs_bunmapi_cow( struct xfs_inode *ip, struct xfs_bmbt_irec *del) { - xfs_filblks_t da_new; - xfs_filblks_t da_old; - xfs_fsblock_t del_endblock = 0; - xfs_fileoff_t del_endoff; - int delay; struct xfs_bmbt_rec_host *ep; - int error; struct xfs_bmbt_irec got; - xfs_fileoff_t got_endoff; - struct xfs_ifork *ifp; - struct xfs_mount *mp; - xfs_filblks_t nblks; struct xfs_bmbt_irec new; - /* REFERENCED */ - uint qfield; - xfs_filblks_t temp; - xfs_filblks_t temp2; - int state = BMAP_COWFORK; int eof; xfs_extnum_t eidx; - mp = ip->i_mount; - XFS_STATS_INC(mp, xs_del_exlist); - ep = xfs_bmap_search_extents(ip, del->br_startoff, XFS_COW_FORK, &eof, - &eidx, &got, &new); - - ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - ASSERT((eidx >= 0) && (eidx < ifp->if_bytes / - (uint)sizeof(xfs_bmbt_rec_t))); - ASSERT(del->br_blockcount > 0); - ASSERT(got.br_startoff <= del->br_startoff); - del_endoff = del->br_startoff + del->br_blockcount; - got_endoff = got.br_startoff + got.br_blockcount; - ASSERT(got_endoff >= del_endoff); - delay = isnullstartblock(got.br_startblock); - ASSERT(isnullstartblock(del->br_startblock) == delay); - qfield = 0; - error = 0; - /* - * If deleting a real allocation, must free up the disk space. - */ - if (!delay) { - nblks = del->br_blockcount; - qfield = XFS_TRANS_DQ_BCOUNT; - /* - * Set up del_endblock and cur for later. - */ - del_endblock = del->br_startblock + del->br_blockcount; - da_old = da_new = 0; - } else { - da_old = startblockval(got.br_startblock); - da_new = 0; - nblks = 0; - } - qfield = qfield; - nblks = nblks; - - /* - * Set flag value to use in switch statement. - * Left-contig is 2, right-contig is 1. - */ - switch (((got.br_startoff == del->br_startoff) << 1) | - (got_endoff == del_endoff)) { - case 3: - /* - * Matches the whole extent. Delete the entry. - */ - xfs_iext_remove(ip, eidx, 1, BMAP_COWFORK); - --eidx; - break; - - case 2: - /* - * Deleting the first part of the extent. - */ - trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_); - xfs_bmbt_set_startoff(ep, del_endoff); - temp = got.br_blockcount - del->br_blockcount; - xfs_bmbt_set_blockcount(ep, temp); - if (delay) { - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - da_old); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); - da_new = temp; - break; - } - xfs_bmbt_set_startblock(ep, del_endblock); - trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); - break; - - case 1: - /* - * Deleting the last part of the extent. - */ - temp = got.br_blockcount - del->br_blockcount; - trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - if (delay) { - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - da_old); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); - da_new = temp; - break; - } - trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); - break; - - case 0: - /* - * Deleting the middle of the extent. - */ - temp = del->br_startoff - got.br_startoff; - trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(ep, temp); - new.br_startoff = del_endoff; - temp2 = got_endoff - del_endoff; - new.br_blockcount = temp2; - new.br_state = got.br_state; - if (!delay) { - new.br_startblock = del_endblock; - } else { - temp = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); - temp2 = xfs_bmap_worst_indlen(ip, temp2); - new.br_startblock = nullstartblock((int)temp2); - da_new = temp + temp2; - while (da_new > da_old) { - if (temp) { - temp--; - da_new--; - xfs_bmbt_set_startblock(ep, - nullstartblock((int)temp)); - } - if (da_new == da_old) - break; - if (temp2) { - temp2--; - da_new--; - new.br_startblock = - nullstartblock((int)temp2); - } - } - } - trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); - xfs_iext_insert(ip, eidx + 1, 1, &new, state); - ++eidx; - break; - } - - /* - * Account for change in delayed indirect blocks. - * Nothing to do for disk quota accounting here. - */ - ASSERT(da_old >= da_new); - if (da_old > da_new) - xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false); - - return error; + &eidx, &got, &new); + ASSERT(ep); + if (isnullstartblock(got.br_startblock)) + xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &eidx, &got, del); + else + xfs_bmap_del_extent_cow(ip, &eidx, &got, del); + return 0; } /* diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index eb86af0..5f18248 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -224,6 +224,11 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extnum_t nexts, xfs_fsblock_t *firstblock, struct xfs_defer_ops *dfops, int *done); int xfs_bunmapi_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *del); +int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, + xfs_extnum_t *idx, struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *del); +void xfs_bmap_del_extent_cow(struct xfs_inode *ip, xfs_extnum_t *idx, + struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 9c477de3..09bd5dc 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -534,11 +534,6 @@ xfs_reflink_cancel_cow_blocks( trace_xfs_reflink_cancel_cow(ip, &irec); if (irec.br_startblock == DELAYSTARTBLOCK) { - /* Free a delayed allocation. */ - xfs_mod_fdblocks(ip->i_mount, irec.br_blockcount, - false); - ip->i_delayed_blks -= irec.br_blockcount; - /* Remove the mapping from the CoW fork. */ error = xfs_bunmapi_cow(ip, &irec); if (error) -- cgit v1.1 From 3e0ee78f7a5a8ed43b129e9e4c5c0ff10c71df74 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:54:31 +1100 Subject: xfs: optimize xfs_reflink_cancel_cow_blocks Rewrite xfs_reflink_cancel_cow_blocks so that we only do a search for the first extent in the extent list and then iterate over the remaining extents using the extent index, passing the extent we operate on directly to xfs_bmap_del_extent_delay or xfs_bmap_del_extent_cow instead of going through xfs_bunmapi and doing yet another extent list lookup. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_reflink.c | 51 +++++++++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 09bd5dc..1e9c589 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -511,53 +511,49 @@ xfs_reflink_cancel_cow_blocks( xfs_fileoff_t offset_fsb, xfs_fileoff_t end_fsb) { - struct xfs_bmbt_irec irec; - xfs_filblks_t count_fsb; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec got, prev, del; + xfs_extnum_t idx; xfs_fsblock_t firstfsb; struct xfs_defer_ops dfops; - int error = 0; - int nimaps; + int error = 0, eof = 0; if (!xfs_is_reflink_inode(ip)) return 0; - /* Go find the old extent in the CoW fork. */ - while (offset_fsb < end_fsb) { - nimaps = 1; - count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); - error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &irec, - &nimaps, XFS_BMAPI_COWFORK); - if (error) - break; - ASSERT(nimaps == 1); + xfs_bmap_search_extents(ip, offset_fsb, XFS_COW_FORK, &eof, &idx, + &got, &prev); + if (eof) + return 0; - trace_xfs_reflink_cancel_cow(ip, &irec); + while (got.br_startoff < end_fsb) { + del = got; + xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); + trace_xfs_reflink_cancel_cow(ip, &del); - if (irec.br_startblock == DELAYSTARTBLOCK) { - /* Remove the mapping from the CoW fork. */ - error = xfs_bunmapi_cow(ip, &irec); + if (isnullstartblock(del.br_startblock)) { + error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, + &idx, &got, &del); if (error) break; - } else if (irec.br_startblock == HOLESTARTBLOCK) { - /* empty */ } else { xfs_trans_ijoin(*tpp, ip, 0); xfs_defer_init(&dfops, &firstfsb); /* Free the CoW orphan record. */ error = xfs_refcount_free_cow_extent(ip->i_mount, - &dfops, irec.br_startblock, - irec.br_blockcount); + &dfops, del.br_startblock, + del.br_blockcount); if (error) break; xfs_bmap_add_free(ip->i_mount, &dfops, - irec.br_startblock, irec.br_blockcount, + del.br_startblock, del.br_blockcount, NULL); /* Update quota accounting */ xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT, - -(long)irec.br_blockcount); + -(long)del.br_blockcount); /* Roll the transaction */ error = xfs_defer_finish(tpp, &dfops, ip); @@ -567,13 +563,12 @@ xfs_reflink_cancel_cow_blocks( } /* Remove the mapping from the CoW fork. */ - error = xfs_bunmapi_cow(ip, &irec); - if (error) - break; + xfs_bmap_del_extent_cow(ip, &idx, &got, &del); } - /* Roll on... */ - offset_fsb = irec.br_startoff + irec.br_blockcount; + if (++idx >= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)) + return 0; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got); } return error; -- cgit v1.1 From c1112b6e626637ec09319883b63e705a931c398b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:54:45 +1100 Subject: xfs: optimize xfs_reflink_end_cow Instead of doing a full extent list search for each extent that is to be deleted using xfs_bmapi_read and then doing another one inside of xfs_bunmapi_cow use the same scheme that xfs_bumapi uses: look up the last extent to be deleted and then use the extent index to walk downward until we are outside the range to be deleted. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_reflink.c | 119 ++++++++++++++++++++++++--------------------------- fs/xfs/xfs_trace.h | 1 - 2 files changed, 56 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 1e9c589..cd308f1 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -633,25 +633,26 @@ xfs_reflink_end_cow( xfs_off_t offset, xfs_off_t count) { - struct xfs_bmbt_irec irec; - struct xfs_bmbt_irec uirec; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec got, prev, del; struct xfs_trans *tp; xfs_fileoff_t offset_fsb; xfs_fileoff_t end_fsb; - xfs_filblks_t count_fsb; xfs_fsblock_t firstfsb; struct xfs_defer_ops dfops; - int error; + int error, eof = 0; unsigned int resblks; - xfs_filblks_t ilen; xfs_filblks_t rlen; - int nimaps; + xfs_extnum_t idx; trace_xfs_reflink_end_cow(ip, offset, count); + /* No COW extents? That's easy! */ + if (ifp->if_bytes == 0) + return 0; + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); - count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); /* Start a rolling transaction to switch the mappings */ resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); @@ -663,72 +664,65 @@ xfs_reflink_end_cow( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - /* Go find the old extent in the CoW fork. */ - while (offset_fsb < end_fsb) { - /* Read extent from the source file */ - nimaps = 1; - count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); - error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &irec, - &nimaps, XFS_BMAPI_COWFORK); - if (error) - goto out_cancel; - ASSERT(nimaps == 1); + xfs_bmap_search_extents(ip, end_fsb - 1, XFS_COW_FORK, &eof, &idx, + &got, &prev); - ASSERT(irec.br_startblock != DELAYSTARTBLOCK); - trace_xfs_reflink_cow_remap(ip, &irec); + /* If there is a hole at end_fsb - 1 go to the previous extent */ + if (eof || got.br_startoff > end_fsb) { + ASSERT(idx > 0); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, --idx), &got); + } - /* - * We can have a hole in the CoW fork if part of a directio - * write is CoW but part of it isn't. - */ - rlen = ilen = irec.br_blockcount; - if (irec.br_startblock == HOLESTARTBLOCK) + /* Walk backwards until we're out of the I/O range... */ + while (got.br_startoff + got.br_blockcount > offset_fsb) { + del = got; + xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); + + /* Extent delete may have bumped idx forward */ + if (!del.br_blockcount) { + idx--; goto next_extent; + } + + ASSERT(!isnullstartblock(got.br_startblock)); /* Unmap the old blocks in the data fork. */ - while (rlen) { - xfs_defer_init(&dfops, &firstfsb); - error = __xfs_bunmapi(tp, ip, irec.br_startoff, - &rlen, 0, 1, &firstfsb, &dfops); - if (error) - goto out_defer; - - /* - * Trim the extent to whatever got unmapped. - * Remember, bunmapi works backwards. - */ - uirec.br_startblock = irec.br_startblock + rlen; - uirec.br_startoff = irec.br_startoff + rlen; - uirec.br_blockcount = irec.br_blockcount - rlen; - irec.br_blockcount = rlen; - trace_xfs_reflink_cow_remap_piece(ip, &uirec); + xfs_defer_init(&dfops, &firstfsb); + rlen = del.br_blockcount; + error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1, + &firstfsb, &dfops); + if (error) + goto out_defer; - /* Free the CoW orphan record. */ - error = xfs_refcount_free_cow_extent(tp->t_mountp, - &dfops, uirec.br_startblock, - uirec.br_blockcount); - if (error) - goto out_defer; + /* Trim the extent to whatever got unmapped. */ + if (rlen) { + xfs_trim_extent(&del, del.br_startoff + rlen, + del.br_blockcount - rlen); + } + trace_xfs_reflink_cow_remap(ip, &del); - /* Map the new blocks into the data fork. */ - error = xfs_bmap_map_extent(tp->t_mountp, &dfops, - ip, &uirec); - if (error) - goto out_defer; + /* Free the CoW orphan record. */ + error = xfs_refcount_free_cow_extent(tp->t_mountp, &dfops, + del.br_startblock, del.br_blockcount); + if (error) + goto out_defer; - /* Remove the mapping from the CoW fork. */ - error = xfs_bunmapi_cow(ip, &uirec); - if (error) - goto out_defer; + /* Map the new blocks into the data fork. */ + error = xfs_bmap_map_extent(tp->t_mountp, &dfops, ip, &del); + if (error) + goto out_defer; - error = xfs_defer_finish(&tp, &dfops, ip); - if (error) - goto out_defer; - } + /* Remove the mapping from the CoW fork. */ + xfs_bmap_del_extent_cow(ip, &idx, &got, &del); + + error = xfs_defer_finish(&tp, &dfops, ip); + if (error) + goto out_defer; next_extent: - /* Roll on... */ - offset_fsb = irec.br_startoff + ilen; + if (idx < 0) + break; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got); } error = xfs_trans_commit(tp); @@ -739,7 +733,6 @@ next_extent: out_defer: xfs_defer_cancel(&dfops); -out_cancel: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); out: diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 72f9f6b..0907752 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3356,7 +3356,6 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap); -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_piece); DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); -- cgit v1.1 From 64e6428ddd00f864e3ca105f914a2b6920c2bc41 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:54:59 +1100 Subject: xfs: remove xfs_bunmapi_cow Since no one uses it anymore. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 22 ---------------------- fs/xfs/libxfs/xfs_bmap.h | 1 - 2 files changed, 23 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index d7ad511..c6eb219 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5414,28 +5414,6 @@ done: return error; } -/* Remove an extent from the CoW fork. Similar to xfs_bmap_del_extent. */ -int -xfs_bunmapi_cow( - struct xfs_inode *ip, - struct xfs_bmbt_irec *del) -{ - struct xfs_bmbt_rec_host *ep; - struct xfs_bmbt_irec got; - struct xfs_bmbt_irec new; - int eof; - xfs_extnum_t eidx; - - ep = xfs_bmap_search_extents(ip, del->br_startoff, XFS_COW_FORK, &eof, - &eidx, &got, &new); - ASSERT(ep); - if (isnullstartblock(got.br_startblock)) - xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &eidx, &got, del); - else - xfs_bmap_del_extent_cow(ip, &eidx, &got, del); - return 0; -} - /* * Unmap (remove) blocks from a file. * If nexts is nonzero then the number of extents to remove is limited to diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 5f18248..7cae6ec 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -223,7 +223,6 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, int flags, xfs_extnum_t nexts, xfs_fsblock_t *firstblock, struct xfs_defer_ops *dfops, int *done); -int xfs_bunmapi_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *del); int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, xfs_extnum_t *idx, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); -- cgit v1.1 From 0a1eb2d474edfe75466be6b4677ad84e5e8ca3f5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 30 Sep 2016 10:58:56 -0700 Subject: fs/proc: Stop reporting eip and esp in /proc/PID/stat Reporting these fields on a non-current task is dangerous. If the task is in any state other than normal kernel code, they may contain garbage or even kernel addresses on some architectures. (x86_64 used to do this. I bet lots of architectures still do.) With CONFIG_THREAD_INFO_IN_TASK=y, it can OOPS, too. As far as I know, there are no use programs that make any material use of these fields, so just get rid of them. Reported-by: Jann Horn Signed-off-by: Andy Lutomirski Acked-by: Thomas Gleixner Cc: Al Viro Cc: Andrew Morton Cc: Borislav Petkov Cc: Brian Gerst Cc: Kees Cook Cc: Linus Torvalds Cc: Linux API Cc: Peter Zijlstra Cc: Tetsuo Handa Cc: Tycho Andersen Link: http://lkml.kernel.org/r/a5fed4c3f4e33ed25d4bb03567e329bc5a712bcc.1475257877.git.luto@kernel.org Signed-off-by: Ingo Molnar --- fs/proc/array.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 89600fd..81818ad 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -412,10 +412,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); - if (permitted) { - eip = KSTK_EIP(task); - esp = KSTK_ESP(task); - } + /* + * esp and eip are intentionally zeroed out. There is no + * non-racy way to read them without freezing the task. + * Programs that need reliable values can use ptrace(2). + */ } get_task_comm(tcomm, task); -- cgit v1.1 From b18cb64ead400c01bf1580eeba330ace51f8087d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 30 Sep 2016 10:58:57 -0700 Subject: fs/proc: Stop trying to report thread stacks This reverts more of: b76437579d13 ("procfs: mark thread stack correctly in proc//maps") ... which was partially reverted by: 65376df58217 ("proc: revert /proc//maps [stack:TID] annotation") Originally, /proc/PID/task/TID/maps was the same as /proc/TID/maps. In current kernels, /proc/PID/maps (or /proc/TID/maps even for threads) shows "[stack]" for VMAs in the mm's stack address range. In contrast, /proc/PID/task/TID/maps uses KSTK_ESP to guess the target thread's stack's VMA. This is racy, probably returns garbage and, on arches with CONFIG_TASK_INFO_IN_THREAD=y, is also crash-prone: KSTK_ESP is not safe to use on tasks that aren't known to be running ordinary process-context kernel code. This patch removes the difference and just shows "[stack]" for VMAs in the mm's stack range. This is IMO much more sensible -- the actual "stack" address really is treated specially by the VM code, and the current thread stack isn't even well-defined for programs that frequently switch stacks on their own. Reported-by: Jann Horn Signed-off-by: Andy Lutomirski Acked-by: Thomas Gleixner Cc: Al Viro Cc: Andrew Morton Cc: Borislav Petkov Cc: Brian Gerst Cc: Johannes Weiner Cc: Kees Cook Cc: Linus Torvalds Cc: Linux API Cc: Peter Zijlstra Cc: Tycho Andersen Link: http://lkml.kernel.org/r/3e678474ec14e0a0ec34c611016753eea2e1b8ba.1475257877.git.luto@kernel.org Signed-off-by: Ingo Molnar --- fs/proc/task_mmu.c | 29 ++++++++++------------------- fs/proc/task_nommu.c | 28 ++++++++++------------------ 2 files changed, 20 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6909582..35b92d8 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -266,24 +266,15 @@ static int do_maps_open(struct inode *inode, struct file *file, * /proc/PID/maps that is the stack of the main task. */ static int is_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma, int is_pid) + struct vm_area_struct *vma) { - int stack = 0; - - if (is_pid) { - stack = vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack; - } else { - struct inode *inode = priv->inode; - struct task_struct *task; - - rcu_read_lock(); - task = pid_task(proc_pid(inode), PIDTYPE_PID); - if (task) - stack = vma_is_stack_for_task(vma, task); - rcu_read_unlock(); - } - return stack; + /* + * We make no effort to guess what a given thread considers to be + * its "stack". It's not even well-defined for programs written + * languages like Go. + */ + return vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack; } static void @@ -354,7 +345,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) goto done; } - if (is_stack(priv, vma, is_pid)) + if (is_stack(priv, vma)) name = "[stack]"; } @@ -1669,7 +1660,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) seq_file_path(m, file, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_puts(m, " heap"); - } else if (is_stack(proc_priv, vma, is_pid)) { + } else if (is_stack(proc_priv, vma)) { seq_puts(m, " stack"); } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index faacb0c..3717562 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -124,25 +124,17 @@ unsigned long task_statm(struct mm_struct *mm, } static int is_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma, int is_pid) + struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; - int stack = 0; - - if (is_pid) { - stack = vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack; - } else { - struct inode *inode = priv->inode; - struct task_struct *task; - - rcu_read_lock(); - task = pid_task(proc_pid(inode), PIDTYPE_PID); - if (task) - stack = vma_is_stack_for_task(vma, task); - rcu_read_unlock(); - } - return stack; + + /* + * We make no effort to guess what a given thread considers to be + * its "stack". It's not even well-defined for programs written + * languages like Go. + */ + return vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack; } /* @@ -184,7 +176,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, if (file) { seq_pad(m, ' '); seq_file_path(m, file, ""); - } else if (mm && is_stack(priv, vma, is_pid)) { + } else if (mm && is_stack(priv, vma)) { seq_pad(m, ' '); seq_printf(m, "[stack]"); } -- cgit v1.1 From c663e29f8885269c1608b5aa6057729fa9267b35 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 24 Oct 2016 14:20:25 +1100 Subject: fs: Do to trim high file position bits in iomap_page_mkwrite_actor iomap_page_mkwrite_actor() calls __block_write_begin_int() with position masked as pos & ~PAGE_MASK which is equivalent to pos & (PAGE_SIZE-1). Thus it masks off high bits of file position. However __block_write_begin_int() expects full file position on input. This does not cause any visible issues because all __block_write_begin_int() really cares about are low file position bits but still it is a bug waiting to happen. Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/iomap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/iomap.c b/fs/iomap.c index a922040..a8ee8c3 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -433,8 +433,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, struct page *page = data; int ret; - ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length, - NULL, iomap); + ret = __block_write_begin_int(page, pos, length, NULL, iomap); if (ret) return ret; -- cgit v1.1 From 7b7381f043568224af798b1decb607dca97b4114 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 24 Oct 2016 14:21:00 +1100 Subject: xfs: fix up inode cowblocks tracking tracepoints These calls are still using the eofblocks tracepoints. The cowblocks equivalents are already defined, we just aren't actually calling them. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_icache.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 14796b7..f295049 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1656,9 +1656,9 @@ void xfs_inode_set_cowblocks_tag( xfs_inode_t *ip) { - trace_xfs_inode_set_eofblocks_tag(ip); + trace_xfs_inode_set_cowblocks_tag(ip); return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks, - trace_xfs_perag_set_eofblocks, + trace_xfs_perag_set_cowblocks, XFS_ICI_COWBLOCKS_TAG); } @@ -1666,7 +1666,7 @@ void xfs_inode_clear_cowblocks_tag( xfs_inode_t *ip) { - trace_xfs_inode_clear_eofblocks_tag(ip); + trace_xfs_inode_clear_cowblocks_tag(ip); return __xfs_inode_clear_eofblocks_tag(ip, - trace_xfs_perag_clear_eofblocks, XFS_ICI_COWBLOCKS_TAG); + trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG); } -- cgit v1.1 From c17a8ef43d6b80ed3519b828c37d18645445949f Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 24 Oct 2016 14:21:08 +1100 Subject: xfs: clear cowblocks tag when cow fork is emptied The background cowblocks scan job takes care of scanning for inodes with potentially lingering blocks in the cow fork and clearing them out. If the background scanner reclaims the cow fork blocks, however, it doesn't immediately clear the cowblocks tag from the inode. Instead, the inode remains tagged until the background scanner comes around again, discovers the inode cow fork has no blocks, clears the tag and fires the trace_xfs_inode_free_cowblocks_invalid() tracepoint to indicate that the inode may have been incorrectly tagged. This is not a major functional problem as the tag is ultimately cleared. Nonetheless, clear the tag when an inode cow fork is explicitly emptied to avoid the extra round trip through the background scanner and spurious "invalid" tracepoint. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_reflink.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index cd308f1..a279b4e 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -567,10 +567,14 @@ xfs_reflink_cancel_cow_blocks( } if (++idx >= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)) - return 0; + break; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got); } + /* clear tag if cow fork is emptied */ + if (!ifp->if_bytes) + xfs_inode_clear_cowblocks_tag(ip); + return error; } -- cgit v1.1 From 0b34c261e235a5c74dcf78bd305845bd15fe2b42 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 30 Sep 2016 10:40:52 -0500 Subject: btrfs: qgroup: Prevent qgroup->reserved from going subzero While free'ing qgroup->reserved resources, we much check if the page has not been invalidated by a truncate operation by checking if the page is still dirty before reducing the qgroup resources. Resources in such a case are free'd when the entire extent is released by delayed_ref. This fixes a double accounting while releasing resources in case of truncating a file, reproduced by the following testcase. SCRATCH_DEV=/dev/vdb SCRATCH_MNT=/mnt mkfs.btrfs -f $SCRATCH_DEV mount -t btrfs $SCRATCH_DEV $SCRATCH_MNT cd $SCRATCH_MNT btrfs quota enable $SCRATCH_MNT btrfs subvolume create a btrfs qgroup limit 500m a $SCRATCH_MNT sync for c in {1..15}; do dd if=/dev/zero bs=1M count=40 of=$SCRATCH_MNT/a/file; done sleep 10 sync sleep 5 touch $SCRATCH_MNT/a/newfile echo "Removing file" rm $SCRATCH_MNT/a/file Fixes: b9d0b38928 ("btrfs: Add handler for invalidate page") Signed-off-by: Goldwyn Rodrigues Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 50ba4ca..6677674 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8930,9 +8930,14 @@ again: * So even we call qgroup_free_data(), it won't decrease reserved * space. * 2) Not written to disk - * This means the reserved space should be freed here. + * This means the reserved space should be freed here. However, + * if a truncate invalidates the page (by clearing PageDirty) + * and the page is accounted for while allocating extent + * in btrfs_check_data_free_space() we let delayed_ref to + * free the entire extent. */ - btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE); + if (PageDirty(page)) + btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | -- cgit v1.1 From 69ae5e4459e43e56f03d0987e865fbac2b05af2a Mon Sep 17 00:00:00 2001 From: Wang Xiaoguang Date: Thu, 13 Oct 2016 09:23:39 +0800 Subject: btrfs: make file clone aware of fatal signals Indeed this just make the behavior similar to xfs when process has fatal signals pending, and it'll make fstests/generic/298 happy. Signed-off-by: Wang Xiaoguang Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index af69129..7163457 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3814,6 +3814,11 @@ process_slot: } btrfs_release_path(path); key.offset = next_key_min_offset; + + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } } ret = 0; -- cgit v1.1 From dd4b857aabbce57518f2d32d9805365d3ff34fc6 Mon Sep 17 00:00:00 2001 From: Wang Xiaoguang Date: Tue, 18 Oct 2016 15:56:13 +0800 Subject: btrfs: pass correct args to btrfs_async_run_delayed_refs() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In btrfs_truncate_inode_items()->btrfs_async_run_delayed_refs(), we swap the arg2 and arg3 wrongly, fix this. This bug just impacts asynchronous delayed refs handle when we truncate inodes. In delayed_ref_async_start(), there is such codes: trans = btrfs_join_transaction(async->root); if (trans->transid > async->transid) goto end; ret = btrfs_run_delayed_refs(trans, async->root, async->count); From this codes, we can see that this just influence whether can we handle delayed refs or the number of delayed refs to handle, this may impact performance, but will not result in missing delayed refs, all delayed refs will be handled in btrfs_commit_transaction(). Signed-off-by: Wang Xiaoguang Reviewed-by: Holger Hoffstätte Signed-off-by: David Sterba --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6677674..1f980ef 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4605,8 +4605,8 @@ delete: BUG_ON(ret); if (btrfs_should_throttle_delayed_refs(trans, root)) btrfs_async_run_delayed_refs(root, - trans->transid, - trans->delayed_ref_updates * 2, 0); + trans->delayed_ref_updates * 2, + trans->transid, 0); if (be_nice) { if (truncate_space_check(trans, root, extent_num_bytes)) { -- cgit v1.1 From 9c894696f56f5d84fb5766af81bcda4a7cb9a842 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 12 Oct 2016 11:33:21 +0300 Subject: Btrfs: remove some no-op casts We cast 0 to a u8 but then because of type promotion, it's immediately cast to int back to int before we do a bitwise negate. The cast doesn't matter in this case, the code works as intended. It causes a static checker warning though so let's remove it. Signed-off-by: Dan Carpenter Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 66a7551..8ed05d9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5569,7 +5569,7 @@ void le_bitmap_set(u8 *map, unsigned int start, int len) *p |= mask_to_set; len -= bits_to_set; bits_to_set = BITS_PER_BYTE; - mask_to_set = ~(u8)0; + mask_to_set = ~0; p++; } if (len) { @@ -5589,7 +5589,7 @@ void le_bitmap_clear(u8 *map, unsigned int start, int len) *p &= ~mask_to_clear; len -= bits_to_clear; bits_to_clear = BITS_PER_BYTE; - mask_to_clear = ~(u8)0; + mask_to_clear = ~0; p++; } if (len) { @@ -5679,7 +5679,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, kaddr[offset] |= mask_to_set; len -= bits_to_set; bits_to_set = BITS_PER_BYTE; - mask_to_set = ~(u8)0; + mask_to_set = ~0; if (++offset >= PAGE_SIZE && len > 0) { offset = 0; page = eb->pages[++i]; @@ -5721,7 +5721,7 @@ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, kaddr[offset] &= ~mask_to_clear; len -= bits_to_clear; bits_to_clear = BITS_PER_BYTE; - mask_to_clear = ~(u8)0; + mask_to_clear = ~0; if (++offset >= PAGE_SIZE && len > 0) { offset = 0; page = eb->pages[++i]; -- cgit v1.1 From 9d1032cc49a8a1065e79ee323de66bcb4fdbd535 Mon Sep 17 00:00:00 2001 From: Wang Xiaoguang Date: Mon, 20 Jun 2016 09:18:52 +0800 Subject: btrfs: fix WARNING in btrfs_select_ref_head() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This issue was found when testing in-band dedupe enospc behaviour, sometimes run_one_delayed_ref() may fail for enospc reason, then __btrfs_run_delayed_refs()will return, but forget to add num_heads_read back, which will trigger "WARN_ON(delayed_refs->num_heads_ready == 0)" in btrfs_select_ref_head(). Signed-off-by: Wang Xiaoguang Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 210c94a..4607af3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2647,7 +2647,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, btrfs_free_delayed_extent_op(extent_op); if (ret) { + spin_lock(&delayed_refs->lock); locked_ref->processing = 0; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); btrfs_delayed_ref_unlock(locked_ref); btrfs_put_delayed_ref(ref); btrfs_debug(fs_info, "run_one_delayed_ref returned %d", -- cgit v1.1 From d62a9025aee446994a706c711e45c6a655d9d348 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 21 Oct 2016 07:33:57 +0300 Subject: orangefs: user file_inode() where it is due Replace wrong use of file->f_path.dentry->d_inode with file_inode(file). In case orangefs ever finds itself as an overelayfs layer, it would want to get its own inode and not overlayfs's inode. DISCLAIMER: I did not test this patch because I do not know how to setup an orangefs mount Signed-off-by: Amir Goldstein Signed-off-by: Mike Marshall --- fs/orangefs/file.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 66ea0cc..02cc613 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -621,9 +621,9 @@ static int orangefs_file_release(struct inode *inode, struct file *file) * readahead cache (if any); this forces an expensive refresh of * data for the next caller of mmap (or 'get_block' accesses) */ - if (file->f_path.dentry->d_inode && - file->f_path.dentry->d_inode->i_mapping && - mapping_nrpages(&file->f_path.dentry->d_inode->i_data)) { + if (file_inode(file) && + file_inode(file)->i_mapping && + mapping_nrpages(&file_inode(file)->i_data)) { if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) { gossip_debug(GOSSIP_INODE_DEBUG, "calling flush_racache on %pU\n", @@ -632,7 +632,7 @@ static int orangefs_file_release(struct inode *inode, struct file *file) gossip_debug(GOSSIP_INODE_DEBUG, "flush_racache finished\n"); } - truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping, + truncate_inode_pages(file_inode(file)->i_mapping, 0); } return 0; @@ -648,7 +648,7 @@ static int orangefs_fsync(struct file *file, { int ret = -EINVAL; struct orangefs_inode_s *orangefs_inode = - ORANGEFS_I(file->f_path.dentry->d_inode); + ORANGEFS_I(file_inode(file)); struct orangefs_kernel_op_s *new_op = NULL; /* required call */ @@ -661,7 +661,7 @@ static int orangefs_fsync(struct file *file, ret = service_operation(new_op, "orangefs_fsync", - get_interruptible_flag(file->f_path.dentry->d_inode)); + get_interruptible_flag(file_inode(file))); gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_fsync got return value of %d\n", @@ -669,7 +669,7 @@ static int orangefs_fsync(struct file *file, op_release(new_op); - orangefs_flush_inode(file->f_path.dentry->d_inode); + orangefs_flush_inode(file_inode(file)); return ret; } -- cgit v1.1 From 804b1737d71253f01621d2a37a0dce6279a2d440 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 17 Oct 2016 10:14:23 +0200 Subject: orangefs: don't use d_time Instead use d_fsdata which is the same size. Hoping to get rid of d_time, which is used by very few filesystems by this time. Signed-off-by: Miklos Szeredi Reviewed-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/dcache.c | 5 +++-- fs/orangefs/namei.c | 8 ++++---- fs/orangefs/orangefs-kernel.h | 7 +++++++ 3 files changed, 14 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c index 1e8fe84..5355efb 100644 --- a/fs/orangefs/dcache.c +++ b/fs/orangefs/dcache.c @@ -73,7 +73,7 @@ static int orangefs_revalidate_lookup(struct dentry *dentry) } } - dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; + orangefs_set_timeout(dentry); ret = 1; out_release_op: op_release(new_op); @@ -94,8 +94,9 @@ out_drop: static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags) { int ret; + unsigned long time = (unsigned long) dentry->d_fsdata; - if (time_before(jiffies, dentry->d_time)) + if (time_before(jiffies, time)) return 1; if (flags & LOOKUP_RCU) diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index d15d3d2..a290ff6 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -72,7 +72,7 @@ static int orangefs_create(struct inode *dir, d_instantiate(dentry, inode); unlock_new_inode(inode); - dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; + orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, @@ -183,7 +183,7 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, goto out; } - dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; + orangefs_set_timeout(dentry); inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn); if (IS_ERR(inode)) { @@ -322,7 +322,7 @@ static int orangefs_symlink(struct inode *dir, d_instantiate(dentry, inode); unlock_new_inode(inode); - dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; + orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, @@ -386,7 +386,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode d_instantiate(dentry, inode); unlock_new_inode(inode); - dentry->d_time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; + orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 0a82048..3bf803d 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -580,4 +580,11 @@ static inline void orangefs_i_size_write(struct inode *inode, loff_t i_size) #endif } +static inline void orangefs_set_timeout(struct dentry *dentry) +{ + unsigned long time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; + + dentry->d_fsdata = (void *) time; +} + #endif /* __ORANGEFSKERNEL_H */ -- cgit v1.1 From 0cc11a61b80a1ab1d12f1597b27b8b45ef8bac4a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 20 Oct 2016 09:34:31 -0400 Subject: nfsd: move blocked lock handling under a dedicated spinlock Bruce was hitting some lockdep warnings in testing, showing that we could hit a deadlock with the new CB_NOTIFY_LOCK handling, involving a rather complex situation involving four different spinlocks. The crux of the matter is that we end up taking the nn->client_lock in the lm_notify handler. The simplest fix is to just declare a new per-nfsd_net spinlock to protect the new CB_NOTIFY_LOCK structures. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/netns.h | 5 +++++ fs/nfsd/nfs4state.c | 28 +++++++++++++++------------- 2 files changed, 20 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index b10d557..ee36efd 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -84,6 +84,8 @@ struct nfsd_net { struct list_head client_lru; struct list_head close_lru; struct list_head del_recall_lru; + + /* protected by blocked_locks_lock */ struct list_head blocked_locks_lru; struct delayed_work laundromat_work; @@ -91,6 +93,9 @@ struct nfsd_net { /* client_lock protects the client lru list and session hash table */ spinlock_t client_lock; + /* protects blocked_locks_lru */ + spinlock_t blocked_locks_lock; + struct file *rec_file; bool in_grace; const struct nfsd4_client_tracking_ops *client_tracking_ops; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9752beb..9547419 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -217,7 +217,7 @@ find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh, { struct nfsd4_blocked_lock *cur, *found = NULL; - spin_lock(&nn->client_lock); + spin_lock(&nn->blocked_locks_lock); list_for_each_entry(cur, &lo->lo_blocked, nbl_list) { if (fh_match(fh, &cur->nbl_fh)) { list_del_init(&cur->nbl_list); @@ -226,7 +226,7 @@ find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh, break; } } - spin_unlock(&nn->client_lock); + spin_unlock(&nn->blocked_locks_lock); if (found) posix_unblock_lock(&found->nbl_lock); return found; @@ -4665,7 +4665,7 @@ nfs4_laundromat(struct nfsd_net *nn) * indefinitely once the lock does become free. */ BUG_ON(!list_empty(&reaplist)); - spin_lock(&nn->client_lock); + spin_lock(&nn->blocked_locks_lock); while (!list_empty(&nn->blocked_locks_lru)) { nbl = list_first_entry(&nn->blocked_locks_lru, struct nfsd4_blocked_lock, nbl_lru); @@ -4678,7 +4678,7 @@ nfs4_laundromat(struct nfsd_net *nn) list_move(&nbl->nbl_lru, &reaplist); list_del_init(&nbl->nbl_list); } - spin_unlock(&nn->client_lock); + spin_unlock(&nn->blocked_locks_lock); while (!list_empty(&reaplist)) { nbl = list_first_entry(&nn->blocked_locks_lru, @@ -5439,13 +5439,13 @@ nfsd4_lm_notify(struct file_lock *fl) bool queue = false; /* An empty list means that something else is going to be using it */ - spin_lock(&nn->client_lock); + spin_lock(&nn->blocked_locks_lock); if (!list_empty(&nbl->nbl_list)) { list_del_init(&nbl->nbl_list); list_del_init(&nbl->nbl_lru); queue = true; } - spin_unlock(&nn->client_lock); + spin_unlock(&nn->blocked_locks_lock); if (queue) nfsd4_run_cb(&nbl->nbl_cb); @@ -5868,10 +5868,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (fl_flags & FL_SLEEP) { nbl->nbl_time = jiffies; - spin_lock(&nn->client_lock); + spin_lock(&nn->blocked_locks_lock); list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked); list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru); - spin_unlock(&nn->client_lock); + spin_unlock(&nn->blocked_locks_lock); } err = vfs_lock_file(filp, F_SETLK, file_lock, conflock); @@ -5900,10 +5900,10 @@ out: if (nbl) { /* dequeue it if we queued it before */ if (fl_flags & FL_SLEEP) { - spin_lock(&nn->client_lock); + spin_lock(&nn->blocked_locks_lock); list_del_init(&nbl->nbl_list); list_del_init(&nbl->nbl_lru); - spin_unlock(&nn->client_lock); + spin_unlock(&nn->blocked_locks_lock); } free_blocked_lock(nbl); } @@ -6943,9 +6943,11 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->client_lru); INIT_LIST_HEAD(&nn->close_lru); INIT_LIST_HEAD(&nn->del_recall_lru); - INIT_LIST_HEAD(&nn->blocked_locks_lru); spin_lock_init(&nn->client_lock); + spin_lock_init(&nn->blocked_locks_lock); + INIT_LIST_HEAD(&nn->blocked_locks_lru); + INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); get_net(net); @@ -7063,14 +7065,14 @@ nfs4_state_shutdown_net(struct net *net) } BUG_ON(!list_empty(&reaplist)); - spin_lock(&nn->client_lock); + spin_lock(&nn->blocked_locks_lock); while (!list_empty(&nn->blocked_locks_lru)) { nbl = list_first_entry(&nn->blocked_locks_lru, struct nfsd4_blocked_lock, nbl_lru); list_move(&nbl->nbl_lru, &reaplist); list_del_init(&nbl->nbl_list); } - spin_unlock(&nn->client_lock); + spin_unlock(&nn->blocked_locks_lock); while (!list_empty(&reaplist)) { nbl = list_first_entry(&nn->blocked_locks_lru, -- cgit v1.1 From 272ddc8b37354c3fe111ab26d25e792629148eee Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 24 Oct 2016 19:00:44 -0700 Subject: proc: don't use FOLL_FORCE for reading cmdline and environment Now that Lorenzo cleaned things up and made the FOLL_FORCE users explicit, it becomes obvious how some of them don't really need FOLL_FORCE at all. So remove FOLL_FORCE from the proc code that reads the command line and arguments from user space. The mem_rw() function actually does want FOLL_FORCE, because gdd (and possibly many other debuggers) use it as a much more convenient version of PTRACE_PEEKDATA, but we should consider making the FOLL_FORCE part conditional on actually being a ptracer. This does not actually do that, just moves adds a comment to that effect and moves the gup_flags settings next to each other. Signed-off-by: Linus Torvalds --- fs/proc/base.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 8e65446..adfc5b4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -252,7 +252,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, * Inherently racy -- command line shares address space * with code and data. */ - rv = access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_FORCE); + rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0); if (rv <= 0) goto out_free_page; @@ -270,8 +270,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, int nr_read; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, - FOLL_FORCE); + nr_read = access_remote_vm(mm, p, page, _count, 0); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -306,8 +305,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, bool final; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, - FOLL_FORCE); + nr_read = access_remote_vm(mm, p, page, _count, 0); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -356,8 +354,7 @@ skip_argv: bool final; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, - FOLL_FORCE); + nr_read = access_remote_vm(mm, p, page, _count, 0); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -835,7 +832,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, unsigned long addr = *ppos; ssize_t copied; char *page; - unsigned int flags = FOLL_FORCE; + unsigned int flags; if (!mm) return 0; @@ -848,6 +845,8 @@ static ssize_t mem_rw(struct file *file, char __user *buf, if (!atomic_inc_not_zero(&mm->mm_users)) goto free; + /* Maybe we should limit FOLL_FORCE to actual ptrace users? */ + flags = FOLL_FORCE; if (write) flags |= FOLL_WRITE; @@ -971,8 +970,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, max_len = min_t(size_t, PAGE_SIZE, count); this_len = min(max_len, this_len); - retval = access_remote_vm(mm, (env_start + src), - page, this_len, FOLL_FORCE); + retval = access_remote_vm(mm, (env_start + src), page, this_len, 0); if (retval <= 0) { ret = retval; -- cgit v1.1 From 2a9becdd4dbed499815938308bdab9aae70dd561 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 14 Oct 2016 10:56:42 -0700 Subject: kernfs: Add noop_fsync to supported kernfs_file_fops If you edit a kernfs backed file with vi(1), you see an ugly error message when you write the file because vi tries to fsync(2) the file after writing, which fails. We have noop_fsync() for this, use it. Signed-off-by: Tony Luck Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 2bcb86e..78219d5 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -911,6 +911,7 @@ const struct file_operations kernfs_file_fops = { .open = kernfs_fop_open, .release = kernfs_fop_release, .poll = kernfs_fop_poll, + .fsync = noop_fsync, }; /** -- cgit v1.1 From 570dd45042a7c8a7aba1ee029c5dd0f5ccf41b9b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 27 Oct 2016 10:42:20 -0700 Subject: btrfs: fix races on root_log_ctx lists btrfs_remove_all_log_ctxs takes a shortcut where it avoids walking the list because it knows all of the waiters are patiently waiting for the commit to finish. But, there's a small race where btrfs_sync_log can remove itself from the list if it finds a log commit is already done. Also, it uses list_del_init() to remove itself from the list, but there's no way to know if btrfs_remove_all_log_ctxs has already run, so we don't know for sure if it is safe to call list_del_init(). This gets rid of all the shortcuts for btrfs_remove_all_log_ctxs(), and just calls it with the proper locking. This is part two of the corruption fixed by cbd60aa7cd1. I should have done this in the first place, but convinced myself the optimizations were safe. A 12 hour run of dbench 2048 will eventually trigger a list debug WARN_ON for the list_del_init() in btrfs_sync_log(). Fixes: d1433debe7f4346cf9fc0dafc71c3137d2a97bc4 Reported-by: Dave Jones cc: stable@vger.kernel.org # 3.15+ Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 688df71..4b1c0a6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2713,14 +2713,12 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, int index, int error) { struct btrfs_log_ctx *ctx; + struct btrfs_log_ctx *safe; - if (!error) { - INIT_LIST_HEAD(&root->log_ctxs[index]); - return; - } - - list_for_each_entry(ctx, &root->log_ctxs[index], list) + list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { + list_del_init(&ctx->list); ctx->log_ret = error; + } INIT_LIST_HEAD(&root->log_ctxs[index]); } @@ -2961,13 +2959,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); out_wake_log_root: - /* - * We needn't get log_mutex here because we are sure all - * the other tasks are blocked. - */ + mutex_lock(&log_root_tree->log_mutex); btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); - mutex_lock(&log_root_tree->log_mutex); log_root_tree->log_transid_committed++; atomic_set(&log_root_tree->log_commit[index2], 0); mutex_unlock(&log_root_tree->log_mutex); @@ -2978,10 +2972,8 @@ out_wake_log_root: if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) wake_up(&log_root_tree->log_commit_wait[index2]); out: - /* See above. */ - btrfs_remove_all_log_ctxs(root, index1, ret); - mutex_lock(&root->log_mutex); + btrfs_remove_all_log_ctxs(root, index1, ret); root->log_transid_committed++; atomic_set(&root->log_commit[index1], 0); mutex_unlock(&root->log_mutex); -- cgit v1.1 From 06b2849d103f4a91212876a211d0d7df227a9513 Mon Sep 17 00:00:00 2001 From: Leon Yu Date: Thu, 27 Oct 2016 17:46:50 -0700 Subject: proc: fix NULL dereference when reading /proc//auxv Reading auxv of any kernel thread results in NULL pointer dereferencing in auxv_read() where mm can be NULL. Fix that by checking for NULL mm and bailing out early. This is also the original behavior changed by recent commit c5317167854e ("proc: switch auxv to use of __mem_open()"). # cat /proc/2/auxv Unable to handle kernel NULL pointer dereference at virtual address 000000a8 Internal error: Oops: 17 [#1] PREEMPT SMP ARM CPU: 3 PID: 113 Comm: cat Not tainted 4.9.0-rc1-ARCH+ #1 Hardware name: BCM2709 task: ea3b0b00 task.stack: e99b2000 PC is at auxv_read+0x24/0x4c LR is at do_readv_writev+0x2fc/0x37c Process cat (pid: 113, stack limit = 0xe99b2210) Call chain: auxv_read do_readv_writev vfs_readv default_file_splice_read splice_direct_to_actor do_splice_direct do_sendfile SyS_sendfile64 ret_fast_syscall Fixes: c5317167854e ("proc: switch auxv to use of __mem_open()") Link: http://lkml.kernel.org/r/1476966200-14457-1-git-send-email-chianglungyu@gmail.com Signed-off-by: Leon Yu Acked-by: Oleg Nesterov Acked-by: Michal Hocko Cc: Al Viro Cc: Kees Cook Cc: John Stultz Cc: Mateusz Guzik Cc: Janis Danisevskis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index adfc5b4..ca651ac 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1012,6 +1012,9 @@ static ssize_t auxv_read(struct file *file, char __user *buf, { struct mm_struct *mm = file->private_data; unsigned int nwords = 0; + + if (!mm) + return 0; do { nwords += 2; } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ -- cgit v1.1 From 14f947c87a2164b19c7f8c02234f4f348e03f409 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 27 Oct 2016 17:47:02 -0700 Subject: fs: exofs: print a hex number after a 0x prefix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It makes the message hard to interpret correctly if a base 10 number is prefixed by 0x. So change to a hex number. Link: http://lkml.kernel.org/r/20161026125658.25728-2-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Cc: Boaz Harrosh Cc: Benny Halevy Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exofs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 79101651..42f9a0a 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -137,7 +137,7 @@ Espan: bad_entry: EXOFS_ERR( "ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - " - "offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n", + "offset=%lu, inode=0x%llx, rec_len=%d, name_len=%d\n", dir->i_ino, error, (page->index<inode_no)), rec_len, p->name_len); -- cgit v1.1 From a00052a296e54205cf238c75bd98d17d5d02a6db Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Fri, 28 Oct 2016 11:49:03 +0200 Subject: ubifs: Fix regression in ubifs_readdir() Commit c83ed4c9dbb35 ("ubifs: Abort readdir upon error") broke overlayfs support because the fix exposed an internal error code to VFS. Reported-by: Peter Rosin Tested-by: Peter Rosin Reported-by: Ralph Sennhauser Tested-by: Ralph Sennhauser Fixes: c83ed4c9dbb35 ("ubifs: Abort readdir upon error") Cc: stable@vger.kernel.org Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index bd4a5e8..ca16c5d 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -543,6 +543,14 @@ out: if (err != -ENOENT) ubifs_err(c, "cannot find next direntry, error %d", err); + else + /* + * -ENOENT is a non-fatal error in this context, the TNC uses + * it to indicate that the cursor moved past the current directory + * and readdir() has to stop. + */ + err = 0; + /* 2 is a special value indicating that there are no more direntries */ ctx->pos = 2; -- cgit v1.1 From fd3220d37b1f6f0cab6142d98b0e6c4082e63299 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 31 Oct 2016 14:42:14 +0100 Subject: ovl: update S_ISGID when setting posix ACLs This change fixes xfstest generic/375, which failed to clear the setgid bit in the following test case on overlayfs: touch $testfile chown 100:100 $testfile chmod 2755 $testfile _runas -u 100 -g 101 -- setfacl -m u::rwx,g::rwx,o::rwx $testfile Reported-by: Amir Goldstein Signed-off-by: Miklos Szeredi Tested-by: Amir Goldstein Fixes: d837a49bd57f ("ovl: fix POSIX ACL setting") Cc: # v4.8 --- fs/overlayfs/super.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs') diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index bcf3965..edd46a0 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1037,6 +1037,21 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler, posix_acl_release(acl); + /* + * Check if sgid bit needs to be cleared (actual setacl operation will + * be done with mounter's capabilities and so that won't do it for us). + */ + if (unlikely(inode->i_mode & S_ISGID) && + handler->flags == ACL_TYPE_ACCESS && + !in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) { + struct iattr iattr = { .ia_valid = ATTR_KILL_SGID }; + + err = ovl_setattr(dentry, &iattr); + if (err) + return err; + } + err = ovl_xattr_set(dentry, handler->name, value, size, flags); if (!err) ovl_copyattr(ovl_inode_real(inode, NULL), inode); -- cgit v1.1 From b93d4a0eb308d4400b84c8b24c1b80e09a9497d0 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 31 Oct 2016 14:42:14 +0100 Subject: ovl: fix get_acl() on tmpfs tmpfs doesn't have ->get_acl() because it only uses cached acls. This fixes the acl tests in pjdfstest when tmpfs is used as the upper layer of the overlay. Reported-by: Amir Goldstein Signed-off-by: Miklos Szeredi Fixes: 39a25b2b3762 ("ovl: define ->get_acl() for overlay inodes") Cc: # v4.8 --- fs/overlayfs/inode.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index c58f01b..7fb53d0 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -270,9 +270,6 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type) if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode)) return NULL; - if (!realinode->i_op->get_acl) - return NULL; - old_cred = ovl_override_creds(inode->i_sb); acl = get_acl(realinode, type); revert_creds(old_cred); -- cgit v1.1 From 641089c1549d8d3df0b047b5de7e9a111362cdce Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 31 Oct 2016 14:42:14 +0100 Subject: ovl: fsync after copy-up Make sure the copied up file hits the disk before renaming to the final destination. If this is not done then the copy-up may corrupt the data in the file in case of a crash. Signed-off-by: Miklos Szeredi Cc: --- fs/overlayfs/copy_up.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index aeb60f7..36795ee 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -178,6 +178,8 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) len -= bytes; } + if (!error) + error = vfs_fsync(new_file, 0); fput(new_file); out_fput: fput(old_file); -- cgit v1.1 From f46c445b79906a9da55c13e0a6f6b6a006b892fe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 29 Oct 2016 18:19:03 -0400 Subject: nfsd: Fix general protection fault in release_lock_stateid() When I push NFSv4.1 / RDMA hard, (xfstests generic/089, for example), I get this crash on the server: Oct 28 22:04:30 klimt kernel: general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC Oct 28 22:04:30 klimt kernel: Modules linked in: cts rpcsec_gss_krb5 iTCO_wdt iTCO_vendor_support sb_edac edac_core x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm btrfs irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd xor pcspkr raid6_pq i2c_i801 i2c_smbus lpc_ich mfd_core sg mei_me mei ioatdma shpchp wmi ipmi_si ipmi_msghandler rpcrdma ib_ipoib rdma_ucm acpi_power_meter acpi_pad ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c mlx4_ib mlx4_en ib_core sr_mod cdrom sd_mod ast drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm crc32c_intel igb ahci libahci ptp mlx4_core pps_core dca libata i2c_algo_bit i2c_core dm_mirror dm_region_hash dm_log dm_mod Oct 28 22:04:30 klimt kernel: CPU: 7 PID: 1558 Comm: nfsd Not tainted 4.9.0-rc2-00005-g82cd754 #8 Oct 28 22:04:30 klimt kernel: Hardware name: Supermicro Super Server/X10SRL-F, BIOS 1.0c 09/09/2015 Oct 28 22:04:30 klimt kernel: task: ffff880835c3a100 task.stack: ffff8808420d8000 Oct 28 22:04:30 klimt kernel: RIP: 0010:[] [] release_lock_stateid+0x1f/0x60 [nfsd] Oct 28 22:04:30 klimt kernel: RSP: 0018:ffff8808420dbce0 EFLAGS: 00010246 Oct 28 22:04:30 klimt kernel: RAX: ffff88084e6660f0 RBX: ffff88084e667020 RCX: 0000000000000000 Oct 28 22:04:30 klimt kernel: RDX: 0000000000000007 RSI: 0000000000000000 RDI: ffff88084e667020 Oct 28 22:04:30 klimt kernel: RBP: ffff8808420dbcf8 R08: 0000000000000001 R09: 0000000000000000 Oct 28 22:04:30 klimt kernel: R10: ffff880835c3a100 R11: ffff880835c3aca8 R12: 6b6b6b6b6b6b6b6b Oct 28 22:04:30 klimt kernel: R13: ffff88084e6670d8 R14: ffff880835f546f0 R15: ffff880835f1c548 Oct 28 22:04:30 klimt kernel: FS: 0000000000000000(0000) GS:ffff88087bdc0000(0000) knlGS:0000000000000000 Oct 28 22:04:30 klimt kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 Oct 28 22:04:30 klimt kernel: CR2: 00007ff020389000 CR3: 0000000001c06000 CR4: 00000000001406e0 Oct 28 22:04:30 klimt kernel: Stack: Oct 28 22:04:30 klimt kernel: ffff88084e667020 0000000000000000 ffff88084e6670d8 ffff8808420dbd20 Oct 28 22:04:30 klimt kernel: ffffffffa05ac80d ffff880835f54548 ffff88084e640008 ffff880835f545b0 Oct 28 22:04:30 klimt kernel: ffff8808420dbd70 ffffffffa059803d ffff880835f1c768 0000000000000870 Oct 28 22:04:30 klimt kernel: Call Trace: Oct 28 22:04:30 klimt kernel: [] nfsd4_free_stateid+0xfd/0x1b0 [nfsd] Oct 28 22:04:30 klimt kernel: [] nfsd4_proc_compound+0x40d/0x690 [nfsd] Oct 28 22:04:30 klimt kernel: [] nfsd_dispatch+0xd4/0x1d0 [nfsd] Oct 28 22:04:30 klimt kernel: [] svc_process_common+0x3d9/0x700 [sunrpc] Oct 28 22:04:30 klimt kernel: [] svc_process+0xf4/0x330 [sunrpc] Oct 28 22:04:30 klimt kernel: [] nfsd+0xfa/0x160 [nfsd] Oct 28 22:04:30 klimt kernel: [] ? nfsd_destroy+0x170/0x170 [nfsd] Oct 28 22:04:30 klimt kernel: [] kthread+0x10b/0x120 Oct 28 22:04:30 klimt kernel: [] ? kthread_stop+0x280/0x280 Oct 28 22:04:30 klimt kernel: [] ret_from_fork+0x2a/0x40 Oct 28 22:04:30 klimt kernel: Code: c3 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 41 55 41 54 53 48 8b 87 b0 00 00 00 48 89 fb 4c 8b a0 98 00 00 00 <49> 8b 44 24 20 48 8d b8 80 03 00 00 e8 10 66 1a e1 48 89 df e8 Oct 28 22:04:30 klimt kernel: RIP [] release_lock_stateid+0x1f/0x60 [nfsd] Oct 28 22:04:30 klimt kernel: RSP Oct 28 22:04:30 klimt kernel: ---[ end trace cf5d0b371973e167 ]--- Jeff Layton says: > Hm...now that I look though, this is a little suspicious: > > struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner); > > I wonder if it's possible for the openstateid to have already been > destroyed at this point. > > We might be better off doing something like this to get the client pointer: > > stp->st_stid.sc_client; > > ...which should be more direct and less dependent on other stateids > staying valid. With the suggested change, I am no longer able to reproduce the above oops. v2: Fix unhash_lock_stateid() as well Fix-suggested-by: Jeff Layton Fixes: 42691398be08 ('nfsd: Fix race between FREE_STATEID and LOCK') Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9547419..4b4beaa 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1227,9 +1227,7 @@ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp, static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp) { - struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner); - - lockdep_assert_held(&oo->oo_owner.so_client->cl_lock); + lockdep_assert_held(&stp->st_stid.sc_client->cl_lock); list_del_init(&stp->st_locks); nfs4_unhash_stid(&stp->st_stid); @@ -1238,12 +1236,12 @@ static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp) static void release_lock_stateid(struct nfs4_ol_stateid *stp) { - struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner); + struct nfs4_client *clp = stp->st_stid.sc_client; bool unhashed; - spin_lock(&oo->oo_owner.so_client->cl_lock); + spin_lock(&clp->cl_lock); unhashed = unhash_lock_stateid(stp); - spin_unlock(&oo->oo_owner.so_client->cl_lock); + spin_unlock(&clp->cl_lock); if (unhashed) nfs4_put_stid(&stp->st_stid); } -- cgit v1.1