From 2ef9ccbfcb90cf84bdba320a571b18b05c41101b Mon Sep 17 00:00:00 2001
From: Zheng Liu <gnehzuil.liu@gmail.com>
Date: Sun, 29 Nov 2015 17:17:05 -0800
Subject: bcache: fix a livelock when we cause a huge number of cache misses

Subject :	[PATCH v2] bcache: fix a livelock in btree lock
Date :	Wed, 25 Feb 2015 20:32:09 +0800 (02/25/2015 04:32:09 AM)

This commit tries to fix a livelock in bcache.  This livelock might
happen when we causes a huge number of cache misses simultaneously.

When we get a cache miss, bcache will execute the following path.

->cached_dev_make_request()
  ->cached_dev_read()
    ->cached_lookup()
      ->bch->btree_map_keys()
        ->btree_root()  <------------------------
          ->bch_btree_map_keys_recurse()        |
            ->cache_lookup_fn()                 |
              ->cached_dev_cache_miss()         |
                ->bch_btree_insert_check_key() -|
                  [If btree->seq is not equal to seq + 1, we should return
                   EINTR and traverse btree again.]

In bch_btree_insert_check_key() function we first need to check upgrade
flag (op->lock == -1), and when this flag is true we need to release
read btree->lock and try to take write btree->lock.  During taking and
releasing this write lock, btree->seq will be monotone increased in
order to prevent other threads modify this in cache miss (see btree.h:74).
But if there are some cache misses caused by some requested, we could
meet a livelock because btree->seq is always changed by others.  Thus no
one can make progress.

This commit will try to take write btree->lock if it encounters a race
when we traverse btree.  Although it sacrifice the scalability but we
can ensure that only one can modify the btree.

Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Tested-by: Joshua Schmid <jschmid@suse.com>
Tested-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: Joshua Schmid <jschmid@suse.com>
Cc: Zhu Yanhai <zhu.yanhai@gmail.com>
Cc: Kent Overstreet <kmo@daterainc.com>
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/btree.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers/md/bcache')

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 83392f8..4a1179c 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -2162,8 +2162,10 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
 		rw_lock(true, b, b->level);
 
 		if (b->key.ptr[0] != btree_ptr ||
-		    b->seq != seq + 1)
+                   b->seq != seq + 1) {
+                       op->lock = b->level;
 			goto out;
+               }
 	}
 
 	SET_KEY_PTRS(check_key, 1);
-- 
cgit v1.1


From c5f1e5adf956e3ba82d204c7c141a75da9fa449a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Sun, 29 Nov 2015 17:18:33 -0800
Subject: bcache: Add a cond_resched() call to gc

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Tested-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: Kent Overstreet <kmo@daterainc.com>
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/btree.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers/md/bcache')

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 4a1179c..22b9e34 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1741,6 +1741,7 @@ static void bch_btree_gc(struct cache_set *c)
 	do {
 		ret = btree_root(gc_root, c, &op, &writes, &stats);
 		closure_sync(&writes);
+		cond_resched();
 
 		if (ret && ret != -EAGAIN)
 			pr_warn("gc failed!");
-- 
cgit v1.1


From fecaee6f20ee122ad75402c53d8278f9bb142ddc Mon Sep 17 00:00:00 2001
From: Zheng Liu <wenqing.lz@taobao.com>
Date: Sun, 29 Nov 2015 17:19:32 -0800
Subject: bcache: clear BCACHE_DEV_UNLINK_DONE flag when attaching a backing
 device

This bug can be reproduced by the following script:

  #!/bin/bash

  bcache_sysfs="/sys/fs/bcache"

  function clear_cache()
  {
  	if [ ! -e $bcache_sysfs ]; then
  		echo "no bcache sysfs"
  		exit
  	fi

  	cset_uuid=$(ls -l $bcache_sysfs|head -n 2|tail -n 1|awk '{print $9}')
  	sudo sh -c "echo $cset_uuid > /sys/block/sdb/sdb1/bcache/detach"
  	sleep 5
  	sudo sh -c "echo $cset_uuid > /sys/block/sdb/sdb1/bcache/attach"
  }

  for ((i=0;i<10;i++)); do
  	clear_cache
  done

The warning messages look like below:
[  275.948611] ------------[ cut here ]------------
[  275.963840] WARNING: at fs/sysfs/dir.c:512 sysfs_add_one+0xb8/0xd0() (Tainted: P        W
---------------   )
[  275.979253] Hardware name: Tecal RH2285
[  275.994106] sysfs: cannot create duplicate filename '/devices/pci0000:00/0000:00:09.0/0000:08:00.0/host4/target4:2:1/4:2:1:0/block/sdb/sdb1/bcache/cache'
[  276.024105] Modules linked in: bcache tcp_diag inet_diag ipmi_devintf ipmi_si ipmi_msghandler
bonding 8021q garp stp llc ipv6 ext3 jbd loop sg iomemory_vsl(P) bnx2 microcode serio_raw i2c_i801
i2c_core iTCO_wdt iTCO_vendor_support i7core_edac edac_core shpchp ext4 jbd2 mbcache megaraid_sas
pata_acpi ata_generic ata_piix dm_mod [last unloaded: scsi_wait_scan]
[  276.072643] Pid: 2765, comm: sh Tainted: P        W  ---------------    2.6.32 #1
[  276.089315] Call Trace:
[  276.105801]  [<ffffffff81070fe7>] ? warn_slowpath_common+0x87/0xc0
[  276.122650]  [<ffffffff810710d6>] ? warn_slowpath_fmt+0x46/0x50
[  276.139361]  [<ffffffff81205c08>] ? sysfs_add_one+0xb8/0xd0
[  276.156012]  [<ffffffff8120609b>] ? sysfs_do_create_link+0x12b/0x170
[  276.172682]  [<ffffffff81206113>] ? sysfs_create_link+0x13/0x20
[  276.189282]  [<ffffffffa03bda21>] ? bcache_device_link+0xc1/0x110 [bcache]
[  276.205993]  [<ffffffffa03bfa08>] ? bch_cached_dev_attach+0x478/0x4f0 [bcache]
[  276.222794]  [<ffffffffa03c4a17>] ? bch_cached_dev_store+0x627/0x780 [bcache]
[  276.239680]  [<ffffffff8116783a>] ? alloc_pages_current+0xaa/0x110
[  276.256594]  [<ffffffff81203b15>] ? sysfs_write_file+0xe5/0x170
[  276.273364]  [<ffffffff811887b8>] ? vfs_write+0xb8/0x1a0
[  276.290133]  [<ffffffff811890b1>] ? sys_write+0x51/0x90
[  276.306368]  [<ffffffff8100c072>] ? system_call_fastpath+0x16/0x1b
[  276.322301] ---[ end trace 9f5d4fcdd0c3edfb ]---
[  276.338241] ------------[ cut here ]------------
[  276.354109] WARNING: at /home/wenqing.lz/bcache/bcache/super.c:720
bcache_device_link+0xdf/0x110 [bcache]() (Tainted: P        W  ---------------   )
[  276.386017] Hardware name: Tecal RH2285
[  276.401430] Couldn't create device <-> cache set symlinks
[  276.401759] Modules linked in: bcache tcp_diag inet_diag ipmi_devintf ipmi_si ipmi_msghandler
bonding 8021q garp stp llc ipv6 ext3 jbd loop sg iomemory_vsl(P) bnx2 microcode serio_raw i2c_i801
i2c_core iTCO_wdt iTCO_vendor_support i7core_edac edac_core shpchp ext4 jbd2 mbcache megaraid_sas
pata_acpi ata_generic ata_piix dm_mod [last unloaded: scsi_wait_scan]
[  276.465477] Pid: 2765, comm: sh Tainted: P        W  ---------------    2.6.32 #1
[  276.482169] Call Trace:
[  276.498610]  [<ffffffff81070fe7>] ? warn_slowpath_common+0x87/0xc0
[  276.515405]  [<ffffffff810710d6>] ? warn_slowpath_fmt+0x46/0x50
[  276.532059]  [<ffffffffa03bda3f>] ? bcache_device_link+0xdf/0x110 [bcache]
[  276.548808]  [<ffffffffa03bfa08>] ? bch_cached_dev_attach+0x478/0x4f0 [bcache]
[  276.565569]  [<ffffffffa03c4a17>] ? bch_cached_dev_store+0x627/0x780 [bcache]
[  276.582418]  [<ffffffff8116783a>] ? alloc_pages_current+0xaa/0x110
[  276.599341]  [<ffffffff81203b15>] ? sysfs_write_file+0xe5/0x170
[  276.616142]  [<ffffffff811887b8>] ? vfs_write+0xb8/0x1a0
[  276.632607]  [<ffffffff811890b1>] ? sys_write+0x51/0x90
[  276.648671]  [<ffffffff8100c072>] ? system_call_fastpath+0x16/0x1b
[  276.664756] ---[ end trace 9f5d4fcdd0c3edfc ]---

We forget to clear BCACHE_DEV_UNLINK_DONE flag in bcache_device_attach()
function when we attach a backing device first time.  After detaching this
backing device, this flag will be true and sysfs_remove_link() isn't called in
bcache_device_unlink().  Then when we attach this backing device again,
sysfs_create_link() will return EEXIST error in bcache_device_link().

So the fix is trival and we clear this flag in bcache_device_link().

Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Tested-by: Joshua Schmid <jschmid@suse.com>
Tested-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: Kent Overstreet <kmo@daterainc.com>
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers/md/bcache')

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 679a093..383f060 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -685,6 +685,8 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
 	WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
 	     sysfs_create_link(&c->kobj, &d->kobj, d->name),
 	     "Couldn't create device <-> cache set symlinks");
+
+	clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
 }
 
 static void bcache_device_detach(struct bcache_device *d)
-- 
cgit v1.1


From 4d4d8573a8451acc9f01cbea24b7e55f04a252fe Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Sun, 29 Nov 2015 17:20:59 -0800
Subject: bcache: fix a leak in bch_cached_dev_run()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Joshua Schmid <jschmid@suse.com>
Tested-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: Kent Overstreet <kmo@daterainc.com>
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/super.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'drivers/md/bcache')

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 383f060..43e911e 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -849,8 +849,11 @@ void bch_cached_dev_run(struct cached_dev *dc)
 	buf[SB_LABEL_SIZE] = '\0';
 	env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
 
-	if (atomic_xchg(&dc->running, 1))
+	if (atomic_xchg(&dc->running, 1)) {
+		kfree(env[1]);
+		kfree(env[2]);
 		return;
+	}
 
 	if (!d->c &&
 	    BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
-- 
cgit v1.1


From 2ecf0cdb2b437402110ab57546e02abfa68a716b Mon Sep 17 00:00:00 2001
From: Zheng Liu <wenqing.lz@taobao.com>
Date: Sun, 29 Nov 2015 17:21:57 -0800
Subject: bcache: unregister reboot notifier if bcache fails to unregister
 device

In bcache_init() function it forgot to unregister reboot notifier if
bcache fails to unregister a block device.  This commit fixes this.

Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Tested-by: Joshua Schmid <jschmid@suse.com>
Tested-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: Kent Overstreet <kmo@daterainc.com>
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/super.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers/md/bcache')

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 43e911e..18f14a2 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2071,8 +2071,10 @@ static int __init bcache_init(void)
 	closure_debug_init();
 
 	bcache_major = register_blkdev(0, "bcache");
-	if (bcache_major < 0)
+	if (bcache_major < 0) {
+		unregister_reboot_notifier(&reboot);
 		return bcache_major;
+	}
 
 	if (!(bcache_wq = create_workqueue("bcache")) ||
 	    !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
-- 
cgit v1.1


From d7076f21629f8f329bca4a44dc408d94670f49e2 Mon Sep 17 00:00:00 2001
From: Gabriel de Perthuis <g2p.code@gmail.com>
Date: Sun, 29 Nov 2015 18:40:23 -0800
Subject: bcache: allows use of register in udev to avoid "device_busy" error.

Allows to use register, not register_quiet in udev to avoid "device_busy" error.
The initial patch proposed at https://lkml.org/lkml/2013/8/26/549 by Gabriel de Perthuis
<g2p.code@gmail.com> does not unlock the mutex and hangs the kernel.

See http://thread.gmane.org/gmane.linux.kernel.bcache.devel/2594 for the discussion.

Cc: Denis Bychkov <manover@gmail.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: Gabriel de Perthuis <g2p.code@gmail.com>
Cc: stable@vger.kernel.org

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/super.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'drivers/md/bcache')

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 18f14a2..8d0ead9 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1938,6 +1938,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 			else
 				err = "device busy";
 			mutex_unlock(&bch_register_lock);
+			if (attr == &ksysfs_register_quiet)
+				goto out;
 		}
 		goto err;
 	}
@@ -1976,8 +1978,7 @@ out:
 err_close:
 	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 err:
-	if (attr != &ksysfs_register_quiet)
-		pr_info("error opening %s: %s", path, err);
+	pr_info("error opening %s: %s", path, err);
 	ret = -EINVAL;
 	goto out;
 }
-- 
cgit v1.1


From 8d16ce540c94c9d366eb36fc91b7154d92d6397b Mon Sep 17 00:00:00 2001
From: Stefan Bader <stefan.bader@canonical.com>
Date: Sun, 29 Nov 2015 18:44:49 -0800
Subject: bcache: prevent crash on changing writeback_running

Added a safeguard in the shutdown case. At least while not being
attached it is also possible to trigger a kernel bug by writing into
writeback_running. This change  adds the same check before trying to
wake up the thread for that case.

Signed-off-by: Stefan Bader <stefan.bader@canonical.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/writeback.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/md/bcache')

diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 0a9dab1..073a042 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -63,7 +63,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 
 static inline void bch_writeback_queue(struct cached_dev *dc)
 {
-	wake_up_process(dc->writeback_thread);
+	if (!IS_ERR_OR_NULL(dc->writeback_thread))
+		wake_up_process(dc->writeback_thread);
 }
 
 static inline void bch_writeback_add(struct cached_dev *dc)
-- 
cgit v1.1


From 627ccd20b4ad3ba836472468208e2ac4dfadbf03 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Sun, 29 Nov 2015 18:47:01 -0800
Subject: bcache: Change refill_dirty() to always scan entire disk if necessary

Previously, it would only scan the entire disk if it was starting from
the very start of the disk - i.e. if the previous scan got to the end.

This was broken by refill_full_stripes(), which updates last_scanned so
that refill_dirty was never triggering the searched_from_start path.

But if we change refill_dirty() to always scan the entire disk if
necessary, regardless of what last_scanned was, the code gets cleaner
and we fix that bug too.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/writeback.c | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

(limited to 'drivers/md/bcache')

diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index b23f88d..b9346cd 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -323,6 +323,10 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
 
 static bool dirty_pred(struct keybuf *buf, struct bkey *k)
 {
+	struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys);
+
+	BUG_ON(KEY_INODE(k) != dc->disk.id);
+
 	return KEY_DIRTY(k);
 }
 
@@ -372,11 +376,24 @@ next:
 	}
 }
 
+/*
+ * Returns true if we scanned the entire disk
+ */
 static bool refill_dirty(struct cached_dev *dc)
 {
 	struct keybuf *buf = &dc->writeback_keys;
+	struct bkey start = KEY(dc->disk.id, 0, 0);
 	struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
-	bool searched_from_start = false;
+	struct bkey start_pos;
+
+	/*
+	 * make sure keybuf pos is inside the range for this disk - at bringup
+	 * we might not be attached yet so this disk's inode nr isn't
+	 * initialized then
+	 */
+	if (bkey_cmp(&buf->last_scanned, &start) < 0 ||
+	    bkey_cmp(&buf->last_scanned, &end) > 0)
+		buf->last_scanned = start;
 
 	if (dc->partial_stripes_expensive) {
 		refill_full_stripes(dc);
@@ -384,14 +401,20 @@ static bool refill_dirty(struct cached_dev *dc)
 			return false;
 	}
 
-	if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
-		buf->last_scanned = KEY(dc->disk.id, 0, 0);
-		searched_from_start = true;
-	}
-
+	start_pos = buf->last_scanned;
 	bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
 
-	return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start;
+	if (bkey_cmp(&buf->last_scanned, &end) < 0)
+		return false;
+
+	/*
+	 * If we get to the end start scanning again from the beginning, and
+	 * only scan up to where we initially started scanning from:
+	 */
+	buf->last_scanned = start;
+	bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred);
+
+	return bkey_cmp(&buf->last_scanned, &start_pos) >= 0;
 }
 
 static int bch_writeback_thread(void *arg)
-- 
cgit v1.1