diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-01-21 18:19:38 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-01-21 18:19:38 -0800 |
commit | 641203549a21ba6a701aecd05c3dfc969ec670cc (patch) | |
tree | 5e3d177c380ed811b5bf37e0bf9b8098416a9bc6 /drivers/block | |
parent | 404a47410c26a115123885977053e9a1a4460929 (diff) | |
parent | e93d12ae3be91d18b2a46deebb90a3f516db3d3c (diff) | |
download | op-kernel-dev-641203549a21ba6a701aecd05c3dfc969ec670cc.zip op-kernel-dev-641203549a21ba6a701aecd05c3dfc969ec670cc.tar.gz |
Merge branch 'for-4.5/drivers' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:
"This is the block driver pull request for 4.5, with the exception of
NVMe, which is in a separate branch and will be posted after this one.
This pull request contains:
- A set of bcache stability fixes, which have been acked by Kent.
These have been used and tested for more than a year by the
community, so it's about time that they got in.
- A set of drbd updates from the drbd team (Andreas, Lars, Philipp)
and Markus Elfring, Oleg Drokin.
- A set of fixes for xen blkback/front from the usual suspects, (Bob,
Konrad) as well as community based fixes from Kiri, Julien, and
Peng.
- A 2038 time fix for sx8 from Shraddha, with a fix from me.
- A small mtip32xx cleanup from Zhu Yanjun.
- A null_blk division fix from Arnd"
* 'for-4.5/drivers' of git://git.kernel.dk/linux-block: (71 commits)
null_blk: use sector_div instead of do_div
mtip32xx: restrict variables visible in current code module
xen/blkfront: Fix crash if backend doesn't follow the right states.
xen/blkback: Fix two memory leaks.
xen/blkback: make st_ statistics per ring
xen/blkfront: Handle non-indirect grant with 64KB pages
xen-blkfront: Introduce blkif_ring_get_request
xen-blkback: clear PF_NOFREEZE for xen_blkif_schedule()
xen/blkback: Free resources if connect_ring failed.
xen/blocks: Return -EXX instead of -1
xen/blkback: make pool of persistent grants and free pages per-queue
xen/blkback: get the number of hardware queues/rings from blkfront
xen/blkback: pseudo support for multi hardware queues/rings
xen/blkback: separate ring information out of struct xen_blkif
xen/blkfront: correct setting for xen_blkif_max_ring_order
xen/blkfront: make persistent grants pool per-queue
xen/blkfront: Remove duplicate setting of ->xbdev.
xen/blkfront: Cleanup of comments, fix unaligned variables, and syntax errors.
xen/blkfront: negotiate number of queues/rings to be used with backend
xen/blkfront: split per device io_lock
...
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/drbd/drbd_actlog.c | 323 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_bitmap.c | 22 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_debugfs.c | 10 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_int.h | 111 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_main.c | 74 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_nl.c | 1361 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_proc.c | 6 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_protocol.h | 2 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_receiver.c | 254 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.c | 147 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.h | 17 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_state.c | 428 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_state.h | 6 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_state_change.h | 63 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_worker.c | 105 | ||||
-rw-r--r-- | drivers/block/mtip32xx/mtip32xx.c | 6 | ||||
-rw-r--r-- | drivers/block/null_blk.c | 8 | ||||
-rw-r--r-- | drivers/block/sx8.c | 7 | ||||
-rw-r--r-- | drivers/block/xen-blkback/blkback.c | 391 | ||||
-rw-r--r-- | drivers/block/xen-blkback/common.h | 86 | ||||
-rw-r--r-- | drivers/block/xen-blkback/xenbus.c | 416 | ||||
-rw-r--r-- | drivers/block/xen-blkfront.c | 1061 |
22 files changed, 3603 insertions, 1301 deletions
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index b3868e7..10459a1 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -288,7 +288,162 @@ bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval * return need_transaction; } -static int al_write_transaction(struct drbd_device *device); +#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) +/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT + * are still coupled, or assume too much about their relation. + * Code below will not work if this is violated. + * Will be cleaned up with some followup patch. + */ +# error FIXME +#endif + +static unsigned int al_extent_to_bm_page(unsigned int al_enr) +{ + return al_enr >> + /* bit to page */ + ((PAGE_SHIFT + 3) - + /* al extent number to bit */ + (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); +} + +static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device) +{ + const unsigned int stripes = device->ldev->md.al_stripes; + const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k; + + /* transaction number, modulo on-disk ring buffer wrap around */ + unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k); + + /* ... to aligned 4k on disk block */ + t = ((t % stripes) * stripe_size_4kB) + t/stripes; + + /* ... to 512 byte sector in activity log */ + t *= 8; + + /* ... plus offset to the on disk position */ + return device->ldev->md.md_offset + device->ldev->md.al_offset + t; +} + +static int __al_write_transaction(struct drbd_device *device, struct al_transaction_on_disk *buffer) +{ + struct lc_element *e; + sector_t sector; + int i, mx; + unsigned extent_nr; + unsigned crc = 0; + int err = 0; + + memset(buffer, 0, sizeof(*buffer)); + buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); + buffer->tr_number = cpu_to_be32(device->al_tr_number); + + i = 0; + + /* Even though no one can start to change this list + * once we set the LC_LOCKED -- from drbd_al_begin_io(), + * lc_try_lock_for_transaction() --, someone may still + * be in the process of changing it. */ + spin_lock_irq(&device->al_lock); + list_for_each_entry(e, &device->act_log->to_be_changed, list) { + if (i == AL_UPDATES_PER_TRANSACTION) { + i++; + break; + } + buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); + buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); + if (e->lc_number != LC_FREE) + drbd_bm_mark_for_writeout(device, + al_extent_to_bm_page(e->lc_number)); + i++; + } + spin_unlock_irq(&device->al_lock); + BUG_ON(i > AL_UPDATES_PER_TRANSACTION); + + buffer->n_updates = cpu_to_be16(i); + for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { + buffer->update_slot_nr[i] = cpu_to_be16(-1); + buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); + } + + buffer->context_size = cpu_to_be16(device->act_log->nr_elements); + buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle); + + mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, + device->act_log->nr_elements - device->al_tr_cycle); + for (i = 0; i < mx; i++) { + unsigned idx = device->al_tr_cycle + i; + extent_nr = lc_element_by_index(device->act_log, idx)->lc_number; + buffer->context[i] = cpu_to_be32(extent_nr); + } + for (; i < AL_CONTEXT_PER_TRANSACTION; i++) + buffer->context[i] = cpu_to_be32(LC_FREE); + + device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; + if (device->al_tr_cycle >= device->act_log->nr_elements) + device->al_tr_cycle = 0; + + sector = al_tr_number_to_on_disk_sector(device); + + crc = crc32c(0, buffer, 4096); + buffer->crc32c = cpu_to_be32(crc); + + if (drbd_bm_write_hinted(device)) + err = -EIO; + else { + bool write_al_updates; + rcu_read_lock(); + write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; + rcu_read_unlock(); + if (write_al_updates) { + if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { + err = -EIO; + drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); + } else { + device->al_tr_number++; + device->al_writ_cnt++; + } + } + } + + return err; +} + +static int al_write_transaction(struct drbd_device *device) +{ + struct al_transaction_on_disk *buffer; + int err; + + if (!get_ldev(device)) { + drbd_err(device, "disk is %s, cannot start al transaction\n", + drbd_disk_str(device->state.disk)); + return -EIO; + } + + /* The bitmap write may have failed, causing a state change. */ + if (device->state.disk < D_INCONSISTENT) { + drbd_err(device, + "disk is %s, cannot write al transaction\n", + drbd_disk_str(device->state.disk)); + put_ldev(device); + return -EIO; + } + + /* protects md_io_buffer, al_tr_cycle, ... */ + buffer = drbd_md_get_buffer(device, __func__); + if (!buffer) { + drbd_err(device, "disk failed while waiting for md_io buffer\n"); + put_ldev(device); + return -ENODEV; + } + + err = __al_write_transaction(device, buffer); + + drbd_md_put_buffer(device); + put_ldev(device); + + return err; +} + void drbd_al_begin_io_commit(struct drbd_device *device) { @@ -420,153 +575,6 @@ void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i) wake_up(&device->al_wait); } -#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) -/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT - * are still coupled, or assume too much about their relation. - * Code below will not work if this is violated. - * Will be cleaned up with some followup patch. - */ -# error FIXME -#endif - -static unsigned int al_extent_to_bm_page(unsigned int al_enr) -{ - return al_enr >> - /* bit to page */ - ((PAGE_SHIFT + 3) - - /* al extent number to bit */ - (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); -} - -static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device) -{ - const unsigned int stripes = device->ldev->md.al_stripes; - const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k; - - /* transaction number, modulo on-disk ring buffer wrap around */ - unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k); - - /* ... to aligned 4k on disk block */ - t = ((t % stripes) * stripe_size_4kB) + t/stripes; - - /* ... to 512 byte sector in activity log */ - t *= 8; - - /* ... plus offset to the on disk position */ - return device->ldev->md.md_offset + device->ldev->md.al_offset + t; -} - -int al_write_transaction(struct drbd_device *device) -{ - struct al_transaction_on_disk *buffer; - struct lc_element *e; - sector_t sector; - int i, mx; - unsigned extent_nr; - unsigned crc = 0; - int err = 0; - - if (!get_ldev(device)) { - drbd_err(device, "disk is %s, cannot start al transaction\n", - drbd_disk_str(device->state.disk)); - return -EIO; - } - - /* The bitmap write may have failed, causing a state change. */ - if (device->state.disk < D_INCONSISTENT) { - drbd_err(device, - "disk is %s, cannot write al transaction\n", - drbd_disk_str(device->state.disk)); - put_ldev(device); - return -EIO; - } - - /* protects md_io_buffer, al_tr_cycle, ... */ - buffer = drbd_md_get_buffer(device, __func__); - if (!buffer) { - drbd_err(device, "disk failed while waiting for md_io buffer\n"); - put_ldev(device); - return -ENODEV; - } - - memset(buffer, 0, sizeof(*buffer)); - buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); - buffer->tr_number = cpu_to_be32(device->al_tr_number); - - i = 0; - - /* Even though no one can start to change this list - * once we set the LC_LOCKED -- from drbd_al_begin_io(), - * lc_try_lock_for_transaction() --, someone may still - * be in the process of changing it. */ - spin_lock_irq(&device->al_lock); - list_for_each_entry(e, &device->act_log->to_be_changed, list) { - if (i == AL_UPDATES_PER_TRANSACTION) { - i++; - break; - } - buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); - buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); - if (e->lc_number != LC_FREE) - drbd_bm_mark_for_writeout(device, - al_extent_to_bm_page(e->lc_number)); - i++; - } - spin_unlock_irq(&device->al_lock); - BUG_ON(i > AL_UPDATES_PER_TRANSACTION); - - buffer->n_updates = cpu_to_be16(i); - for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { - buffer->update_slot_nr[i] = cpu_to_be16(-1); - buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); - } - - buffer->context_size = cpu_to_be16(device->act_log->nr_elements); - buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle); - - mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, - device->act_log->nr_elements - device->al_tr_cycle); - for (i = 0; i < mx; i++) { - unsigned idx = device->al_tr_cycle + i; - extent_nr = lc_element_by_index(device->act_log, idx)->lc_number; - buffer->context[i] = cpu_to_be32(extent_nr); - } - for (; i < AL_CONTEXT_PER_TRANSACTION; i++) - buffer->context[i] = cpu_to_be32(LC_FREE); - - device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; - if (device->al_tr_cycle >= device->act_log->nr_elements) - device->al_tr_cycle = 0; - - sector = al_tr_number_to_on_disk_sector(device); - - crc = crc32c(0, buffer, 4096); - buffer->crc32c = cpu_to_be32(crc); - - if (drbd_bm_write_hinted(device)) - err = -EIO; - else { - bool write_al_updates; - rcu_read_lock(); - write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; - rcu_read_unlock(); - if (write_al_updates) { - if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { - err = -EIO; - drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); - } else { - device->al_tr_number++; - device->al_writ_cnt++; - } - } - } - - drbd_md_put_buffer(device); - put_ldev(device); - - return err; -} - static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext) { int rv; @@ -606,21 +614,24 @@ void drbd_al_shrink(struct drbd_device *device) wake_up(&device->al_wait); } -int drbd_initialize_al(struct drbd_device *device, void *buffer) +int drbd_al_initialize(struct drbd_device *device, void *buffer) { struct al_transaction_on_disk *al = buffer; struct drbd_md *md = &device->ldev->md; - sector_t al_base = md->md_offset + md->al_offset; int al_size_4k = md->al_stripes * md->al_stripe_size_4k; int i; - memset(al, 0, 4096); - al->magic = cpu_to_be32(DRBD_AL_MAGIC); - al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED); - al->crc32c = cpu_to_be32(crc32c(0, al, 4096)); + __al_write_transaction(device, al); + /* There may or may not have been a pending transaction. */ + spin_lock_irq(&device->al_lock); + lc_committed(device->act_log); + spin_unlock_irq(&device->al_lock); - for (i = 0; i < al_size_4k; i++) { - int err = drbd_md_sync_page_io(device, device->ldev, al_base + i * 8, WRITE); + /* The rest of the transactions will have an empty "updates" list, and + * are written out only to provide the context, and to initialize the + * on-disk ring buffer. */ + for (i = 1; i < al_size_4k; i++) { + int err = __al_write_transaction(device, al); if (err) return err; } diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 9462d27..0dabc9b 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -24,7 +24,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/bitops.h> +#include <linux/bitmap.h> #include <linux/vmalloc.h> #include <linux/string.h> #include <linux/drbd.h> @@ -479,8 +479,14 @@ void drbd_bm_cleanup(struct drbd_device *device) * this masks out the remaining bits. * Returns the number of bits cleared. */ +#ifndef BITS_PER_PAGE #define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3)) #define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1) +#else +# if BITS_PER_PAGE != (1UL << (PAGE_SHIFT + 3)) +# error "ambiguous BITS_PER_PAGE" +# endif +#endif #define BITS_PER_LONG_MASK (BITS_PER_LONG - 1) static int bm_clear_surplus(struct drbd_bitmap *b) { @@ -559,21 +565,19 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b) unsigned long *p_addr; unsigned long bits = 0; unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1; - int idx, i, last_word; + int idx, last_word; /* all but last page */ for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) { p_addr = __bm_map_pidx(b, idx); - for (i = 0; i < LWPP; i++) - bits += hweight_long(p_addr[i]); + bits += bitmap_weight(p_addr, BITS_PER_PAGE); __bm_unmap(p_addr); cond_resched(); } /* last (or only) page */ last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL; p_addr = __bm_map_pidx(b, idx); - for (i = 0; i < last_word; i++) - bits += hweight_long(p_addr[i]); + bits += bitmap_weight(p_addr, last_word * BITS_PER_LONG); p_addr[last_word] &= cpu_to_lel(mask); bits += hweight_long(p_addr[last_word]); /* 32bit arch, may have an unused padding long */ @@ -1419,6 +1423,9 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, int bits; int changed = 0; unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]); + + /* I think it is more cache line friendly to hweight_long then set to ~0UL, + * than to first bitmap_weight() all words, then bitmap_fill() all words */ for (i = first_word; i < last_word; i++) { bits = hweight_long(paddr[i]); paddr[i] = ~0UL; @@ -1628,8 +1635,7 @@ int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr) int n = e-s; p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s)); bm = p_addr + MLPP(s); - while (n--) - count += hweight_long(*bm++); + count += bitmap_weight(bm, n * BITS_PER_LONG); bm_unmap(p_addr); } else { drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s); diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c index 6b88a35..96a0107 100644 --- a/drivers/block/drbd/drbd_debugfs.c +++ b/drivers/block/drbd/drbd_debugfs.c @@ -771,6 +771,13 @@ static int device_data_gen_id_show(struct seq_file *m, void *ignored) return 0; } +static int device_ed_gen_id_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + seq_printf(m, "0x%016llX\n", (unsigned long long)device->ed_uuid); + return 0; +} + #define drbd_debugfs_device_attr(name) \ static int device_ ## name ## _open(struct inode *inode, struct file *file) \ { \ @@ -796,6 +803,7 @@ drbd_debugfs_device_attr(oldest_requests) drbd_debugfs_device_attr(act_log_extents) drbd_debugfs_device_attr(resync_extents) drbd_debugfs_device_attr(data_gen_id) +drbd_debugfs_device_attr(ed_gen_id) void drbd_debugfs_device_add(struct drbd_device *device) { @@ -839,6 +847,7 @@ void drbd_debugfs_device_add(struct drbd_device *device) DCF(act_log_extents); DCF(resync_extents); DCF(data_gen_id); + DCF(ed_gen_id); #undef DCF return; @@ -854,6 +863,7 @@ void drbd_debugfs_device_cleanup(struct drbd_device *device) drbd_debugfs_remove(&device->debugfs_vol_act_log_extents); drbd_debugfs_remove(&device->debugfs_vol_resync_extents); drbd_debugfs_remove(&device->debugfs_vol_data_gen_id); + drbd_debugfs_remove(&device->debugfs_vol_ed_gen_id); drbd_debugfs_remove(&device->debugfs_vol); } diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e66d453..b6844fe 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -77,13 +77,6 @@ extern int fault_devs; extern char usermode_helper[]; -/* I don't remember why XCPU ... - * This is used to wake the asender, - * and to interrupt sending the sending task - * on disconnect. - */ -#define DRBD_SIG SIGXCPU - /* This is used to stop/restart our threads. * Cannot use SIGTERM nor SIGKILL, since these * are sent out by init on runlevel changes @@ -292,6 +285,9 @@ struct drbd_device_work { extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *); +extern void lock_all_resources(void); +extern void unlock_all_resources(void); + struct drbd_request { struct drbd_work w; struct drbd_device *device; @@ -504,7 +500,6 @@ enum { MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ - SUSPEND_IO, /* suspend application io */ BITMAP_IO, /* suspend application io; once no more io in flight, start bitmap io */ BITMAP_IO_QUEUED, /* Started bitmap IO */ @@ -632,12 +627,6 @@ struct bm_io_work { void (*done)(struct drbd_device *device, int rv); }; -enum write_ordering_e { - WO_none, - WO_drain_io, - WO_bdev_flush, -}; - struct fifo_buffer { unsigned int head_index; unsigned int size; @@ -650,8 +639,7 @@ extern struct fifo_buffer *fifo_alloc(int fifo_size); enum { NET_CONGESTED, /* The data socket is congested */ RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */ - SEND_PING, /* whether asender should send a ping asap */ - SIGNAL_ASENDER, /* whether asender wants to be interrupted */ + SEND_PING, GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */ CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */ CONN_WD_ST_CHG_OKAY, @@ -670,6 +658,8 @@ enum { DEVICE_WORK_PENDING, /* tell worker that some device has pending work */ }; +enum which_state { NOW, OLD = NOW, NEW }; + struct drbd_resource { char *name; #ifdef CONFIG_DEBUG_FS @@ -755,7 +745,8 @@ struct drbd_connection { unsigned long last_reconnect_jif; struct drbd_thread receiver; struct drbd_thread worker; - struct drbd_thread asender; + struct drbd_thread ack_receiver; + struct workqueue_struct *ack_sender; /* cached pointers, * so we can look up the oldest pending requests more quickly. @@ -774,6 +765,8 @@ struct drbd_connection { struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST]; struct { + unsigned long last_sent_barrier_jif; + /* whether this sender thread * has processed a single write yet. */ bool seen_any_write_yet; @@ -788,6 +781,17 @@ struct drbd_connection { } send; }; +static inline bool has_net_conf(struct drbd_connection *connection) +{ + bool has_net_conf; + + rcu_read_lock(); + has_net_conf = rcu_dereference(connection->net_conf); + rcu_read_unlock(); + + return has_net_conf; +} + void __update_timing_details( struct drbd_thread_timing_details *tdp, unsigned int *cb_nr, @@ -811,6 +815,7 @@ struct drbd_peer_device { struct list_head peer_devices; struct drbd_device *device; struct drbd_connection *connection; + struct work_struct send_acks_work; #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_peer_dev; #endif @@ -829,6 +834,7 @@ struct drbd_device { struct dentry *debugfs_vol_act_log_extents; struct dentry *debugfs_vol_resync_extents; struct dentry *debugfs_vol_data_gen_id; + struct dentry *debugfs_vol_ed_gen_id; #endif unsigned int vnr; /* volume number within the connection */ @@ -873,6 +879,7 @@ struct drbd_device { atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ atomic_t unacked_cnt; /* Need to send replies for */ atomic_t local_cnt; /* Waiting for local completion */ + atomic_t suspend_cnt; /* Interval tree of pending local requests */ struct rb_root read_requests; @@ -1020,6 +1027,12 @@ static inline struct drbd_peer_device *first_peer_device(struct drbd_device *dev return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices); } +static inline struct drbd_peer_device * +conn_peer_device(struct drbd_connection *connection, int volume_number) +{ + return idr_find(&connection->peer_devices, volume_number); +} + #define for_each_resource(resource, _resources) \ list_for_each_entry(resource, _resources, resources) @@ -1113,7 +1126,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int extern int drbd_send_bitmap(struct drbd_device *device); extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode); extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode); -extern void drbd_free_ldev(struct drbd_backing_dev *ldev); +extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev); extern void drbd_device_cleanup(struct drbd_device *device); void drbd_print_uuids(struct drbd_device *device, const char *text); @@ -1424,7 +1437,7 @@ extern struct bio_set *drbd_md_io_bio_set; /* to allocate from that set */ extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); -extern rwlock_t global_state_lock; +extern struct mutex resources_mutex; extern int conn_lowest_minor(struct drbd_connection *connection); extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor); @@ -1454,6 +1467,9 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t); /* drbd_nl.c */ + +extern struct mutex notification_mutex; + extern void drbd_suspend_io(struct drbd_device *device); extern void drbd_resume_io(struct drbd_device *device); extern char *ppsize(char *buf, unsigned long long size); @@ -1536,7 +1552,9 @@ extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); /* drbd_receiver.c */ extern int drbd_receiver(struct drbd_thread *thi); -extern int drbd_asender(struct drbd_thread *thi); +extern int drbd_ack_receiver(struct drbd_thread *thi); +extern void drbd_send_ping_wf(struct work_struct *ws); +extern void drbd_send_acks_wf(struct work_struct *ws); extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, bool throttle_if_app_is_waiting); @@ -1649,7 +1667,7 @@ extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int s #define drbd_rs_failed_io(device, sector, size) \ __drbd_change_sync(device, sector, size, RECORD_RS_FAILED) extern void drbd_al_shrink(struct drbd_device *device); -extern int drbd_initialize_al(struct drbd_device *, void *); +extern int drbd_al_initialize(struct drbd_device *, void *); /* drbd_nl.c */ /* state info broadcast */ @@ -1668,6 +1686,29 @@ struct sib_info { }; void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib); +extern void notify_resource_state(struct sk_buff *, + unsigned int, + struct drbd_resource *, + struct resource_info *, + enum drbd_notification_type); +extern void notify_device_state(struct sk_buff *, + unsigned int, + struct drbd_device *, + struct device_info *, + enum drbd_notification_type); +extern void notify_connection_state(struct sk_buff *, + unsigned int, + struct drbd_connection *, + struct connection_info *, + enum drbd_notification_type); +extern void notify_peer_device_state(struct sk_buff *, + unsigned int, + struct drbd_peer_device *, + struct peer_device_info *, + enum drbd_notification_type); +extern void notify_helper(enum drbd_notification_type, struct drbd_device *, + struct drbd_connection *, const char *, int); + /* * inline helper functions *************************/ @@ -1694,19 +1735,6 @@ static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_r return 0; } -static inline enum drbd_state_rv -_drbd_set_state(struct drbd_device *device, union drbd_state ns, - enum chg_state_flags flags, struct completion *done) -{ - enum drbd_state_rv rv; - - read_lock(&global_state_lock); - rv = __drbd_set_state(device, ns, flags, done); - read_unlock(&global_state_lock); - - return rv; -} - static inline union drbd_state drbd_read_state(struct drbd_device *device) { struct drbd_resource *resource = device->resource; @@ -1937,16 +1965,21 @@ drbd_device_post_work(struct drbd_device *device, int work_bit) extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue); -static inline void wake_asender(struct drbd_connection *connection) +/* To get the ack_receiver out of the blocking network stack, + * so it can change its sk_rcvtimeo from idle- to ping-timeout, + * and send a ping, we need to send a signal. + * Which signal we send is irrelevant. */ +static inline void wake_ack_receiver(struct drbd_connection *connection) { - if (test_bit(SIGNAL_ASENDER, &connection->flags)) - force_sig(DRBD_SIG, connection->asender.task); + struct task_struct *task = connection->ack_receiver.task; + if (task && get_t_state(&connection->ack_receiver) == RUNNING) + force_sig(SIGXCPU, task); } static inline void request_ping(struct drbd_connection *connection) { set_bit(SEND_PING, &connection->flags); - wake_asender(connection); + wake_ack_receiver(connection); } extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *); @@ -2230,7 +2263,7 @@ static inline bool may_inc_ap_bio(struct drbd_device *device) if (drbd_suspended(device)) return false; - if (test_bit(SUSPEND_IO, &device->flags)) + if (atomic_read(&device->suspend_cnt)) return false; /* to avoid potential deadlock or bitmap corruption, diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 74d97f4..5b43dfb 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -117,6 +117,7 @@ module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0 */ struct idr drbd_devices; struct list_head drbd_resources; +struct mutex resources_mutex; struct kmem_cache *drbd_request_cache; struct kmem_cache *drbd_ee_cache; /* peer requests */ @@ -1435,8 +1436,8 @@ static int we_should_drop_the_connection(struct drbd_connection *connection, str /* long elapsed = (long)(jiffies - device->last_received); */ drop_it = connection->meta.socket == sock - || !connection->asender.task - || get_t_state(&connection->asender) != RUNNING + || !connection->ack_receiver.task + || get_t_state(&connection->ack_receiver) != RUNNING || connection->cstate < C_WF_REPORT_PARAMS; if (drop_it) @@ -1793,15 +1794,6 @@ int drbd_send(struct drbd_connection *connection, struct socket *sock, drbd_update_congested(connection); } do { - /* STRANGE - * tcp_sendmsg does _not_ use its size parameter at all ? - * - * -EAGAIN on timeout, -EINTR on signal. - */ -/* THINK - * do we need to block DRBD_SIG if sock == &meta.socket ?? - * otherwise wake_asender() might interrupt some send_*Ack ! - */ rv = kernel_sendmsg(sock, &msg, &iov, 1, size); if (rv == -EAGAIN) { if (we_should_drop_the_connection(connection, sock)) @@ -2000,7 +1992,7 @@ void drbd_device_cleanup(struct drbd_device *device) drbd_bm_cleanup(device); } - drbd_free_ldev(device->ldev); + drbd_backing_dev_free(device, device->ldev); device->ldev = NULL; clear_bit(AL_SUSPENDED, &device->flags); @@ -2179,7 +2171,7 @@ void drbd_destroy_device(struct kref *kref) if (device->this_bdev) bdput(device->this_bdev); - drbd_free_ldev(device->ldev); + drbd_backing_dev_free(device, device->ldev); device->ldev = NULL; drbd_release_all_peer_reqs(device); @@ -2563,7 +2555,7 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op cpumask_copy(resource->cpu_mask, new_cpu_mask); for_each_connection_rcu(connection, resource) { connection->receiver.reset_cpu_mask = 1; - connection->asender.reset_cpu_mask = 1; + connection->ack_receiver.reset_cpu_mask = 1; connection->worker.reset_cpu_mask = 1; } } @@ -2590,7 +2582,7 @@ struct drbd_resource *drbd_create_resource(const char *name) kref_init(&resource->kref); idr_init(&resource->devices); INIT_LIST_HEAD(&resource->connections); - resource->write_ordering = WO_bdev_flush; + resource->write_ordering = WO_BDEV_FLUSH; list_add_tail_rcu(&resource->resources, &drbd_resources); mutex_init(&resource->conf_update); mutex_init(&resource->adm_mutex); @@ -2652,8 +2644,8 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts) connection->receiver.connection = connection; drbd_thread_init(resource, &connection->worker, drbd_worker, "worker"); connection->worker.connection = connection; - drbd_thread_init(resource, &connection->asender, drbd_asender, "asender"); - connection->asender.connection = connection; + drbd_thread_init(resource, &connection->ack_receiver, drbd_ack_receiver, "ack_recv"); + connection->ack_receiver.connection = connection; kref_init(&connection->kref); @@ -2702,8 +2694,8 @@ static int init_submitter(struct drbd_device *device) { /* opencoded create_singlethread_workqueue(), * to be able to say "drbd%d", ..., minor */ - device->submit.wq = alloc_workqueue("drbd%u_submit", - WQ_UNBOUND | WQ_MEM_RECLAIM, 1, device->minor); + device->submit.wq = + alloc_ordered_workqueue("drbd%u_submit", WQ_MEM_RECLAIM, device->minor); if (!device->submit.wq) return -ENOMEM; @@ -2820,6 +2812,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig goto out_idr_remove_from_resource; } kref_get(&connection->kref); + INIT_WORK(&peer_device->send_acks_work, drbd_send_acks_wf); } if (init_submitter(device)) { @@ -2923,7 +2916,7 @@ static int __init drbd_init(void) drbd_proc = NULL; /* play safe for drbd_cleanup */ idr_init(&drbd_devices); - rwlock_init(&global_state_lock); + mutex_init(&resources_mutex); INIT_LIST_HEAD(&drbd_resources); err = drbd_genl_register(); @@ -2971,18 +2964,6 @@ fail: return err; } -void drbd_free_ldev(struct drbd_backing_dev *ldev) -{ - if (ldev == NULL) - return; - - blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); - blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); - - kfree(ldev->disk_conf); - kfree(ldev); -} - static void drbd_free_one_sock(struct drbd_socket *ds) { struct socket *s; @@ -3277,6 +3258,10 @@ int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev) * and read it. */ bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx; bdev->md.md_offset = drbd_md_ss(bdev); + /* Even for (flexible or indexed) external meta data, + * initially restrict us to the 4k superblock for now. + * Affects the paranoia out-of-range access check in drbd_md_sync_page_io(). */ + bdev->md.md_size_sect = 8; if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) { /* NOTE: can't do normal error processing here as this is @@ -3578,7 +3563,9 @@ void drbd_queue_bitmap_io(struct drbd_device *device, spin_lock_irq(&device->resource->req_lock); set_bit(BITMAP_IO, &device->flags); - if (atomic_read(&device->ap_bio_cnt) == 0) { + /* don't wait for pending application IO if the caller indicates that + * application IO does not conflict anyways. */ + if (flags == BM_LOCKED_CHANGE_ALLOWED || atomic_read(&device->ap_bio_cnt) == 0) { if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags)) drbd_queue_work(&first_peer_device(device)->connection->sender_work, &device->bm_io_work.w); @@ -3746,6 +3733,27 @@ int drbd_wait_misc(struct drbd_device *device, struct drbd_interval *i) return 0; } +void lock_all_resources(void) +{ + struct drbd_resource *resource; + int __maybe_unused i = 0; + + mutex_lock(&resources_mutex); + local_irq_disable(); + for_each_resource(resource, &drbd_resources) + spin_lock_nested(&resource->req_lock, i++); +} + +void unlock_all_resources(void) +{ + struct drbd_resource *resource; + + for_each_resource(resource, &drbd_resources) + spin_unlock(&resource->req_lock); + local_irq_enable(); + mutex_unlock(&resources_mutex); +} + #ifdef CONFIG_DRBD_FAULT_INJECTION /* Fault insertion support including random number generator shamelessly * stolen from kernel/rcutorture.c */ diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index e80cbef..c055c5e 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -36,6 +36,7 @@ #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" +#include "drbd_state_change.h" #include <asm/unaligned.h> #include <linux/drbd_limits.h> #include <linux/kthread.h> @@ -75,11 +76,24 @@ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info); int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info); /* .dumpit */ int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb); +int drbd_adm_dump_resources(struct sk_buff *skb, struct netlink_callback *cb); +int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb); +int drbd_adm_dump_devices_done(struct netlink_callback *cb); +int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb); +int drbd_adm_dump_connections_done(struct netlink_callback *cb); +int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb); +int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb); +int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb); #include <linux/drbd_genl_api.h> #include "drbd_nla.h" #include <linux/genl_magic_func.h> +static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */ +static atomic_t notify_genl_seq = ATOMIC_INIT(2); /* two. */ + +DEFINE_MUTEX(notification_mutex); + /* used blkdev_get_by_path, to claim our meta data device(s) */ static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; @@ -349,6 +363,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd) sib.sib_reason = SIB_HELPER_PRE; sib.helper_name = cmd; drbd_bcast_event(device, &sib); + notify_helper(NOTIFY_CALL, device, connection, cmd, 0); ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); if (ret) drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n", @@ -361,6 +376,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd) sib.sib_reason = SIB_HELPER_POST; sib.helper_exit_code = ret; drbd_bcast_event(device, &sib); + notify_helper(NOTIFY_RESPONSE, device, connection, cmd, ret); if (current == connection->worker.task) clear_bit(CALLBACK_PENDING, &connection->flags); @@ -388,6 +404,7 @@ static int conn_khelper(struct drbd_connection *connection, char *cmd) drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name); /* TODO: conn_bcast_event() ?? */ + notify_helper(NOTIFY_CALL, NULL, connection, cmd, 0); ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); if (ret) @@ -399,6 +416,7 @@ static int conn_khelper(struct drbd_connection *connection, char *cmd) usermode_helper, cmd, resource_name, (ret >> 8) & 0xff, ret); /* TODO: conn_bcast_event() ?? */ + notify_helper(NOTIFY_RESPONSE, NULL, connection, cmd, ret); if (ret < 0) /* Ignore any ERRNOs we got. */ ret = 0; @@ -847,9 +865,11 @@ char *ppsize(char *buf, unsigned long long size) * and can be long lived. * This changes an device->flag, is triggered by drbd internals, * and should be short-lived. */ +/* It needs to be a counter, since multiple threads might + independently suspend and resume IO. */ void drbd_suspend_io(struct drbd_device *device) { - set_bit(SUSPEND_IO, &device->flags); + atomic_inc(&device->suspend_cnt); if (drbd_suspended(device)) return; wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt)); @@ -857,8 +877,8 @@ void drbd_suspend_io(struct drbd_device *device) void drbd_resume_io(struct drbd_device *device) { - clear_bit(SUSPEND_IO, &device->flags); - wake_up(&device->misc_wait); + if (atomic_dec_and_test(&device->suspend_cnt)) + wake_up(&device->misc_wait); } /** @@ -871,27 +891,32 @@ void drbd_resume_io(struct drbd_device *device) enum determine_dev_size drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local) { - sector_t prev_first_sect, prev_size; /* previous meta location */ - sector_t la_size_sect, u_size; + struct md_offsets_and_sizes { + u64 last_agreed_sect; + u64 md_offset; + s32 al_offset; + s32 bm_offset; + u32 md_size_sect; + + u32 al_stripes; + u32 al_stripe_size_4k; + } prev; + sector_t u_size, size; struct drbd_md *md = &device->ldev->md; - u32 prev_al_stripe_size_4k; - u32 prev_al_stripes; - sector_t size; char ppb[10]; void *buffer; int md_moved, la_size_changed; enum determine_dev_size rv = DS_UNCHANGED; - /* race: - * application request passes inc_ap_bio, - * but then cannot get an AL-reference. - * this function later may wait on ap_bio_cnt == 0. -> deadlock. + /* We may change the on-disk offsets of our meta data below. Lock out + * anything that may cause meta data IO, to avoid acting on incomplete + * layout changes or scribbling over meta data that is in the process + * of being moved. * - * to avoid that: - * Suspend IO right here. - * still lock the act_log to not trigger ASSERTs there. - */ + * Move is not exactly correct, btw, currently we have all our meta + * data in core memory, to "move" it we just write it all out, there + * are no reads. */ drbd_suspend_io(device); buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */ if (!buffer) { @@ -899,19 +924,17 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct return DS_ERROR; } - /* no wait necessary anymore, actually we could assert that */ - wait_event(device->al_wait, lc_try_lock(device->act_log)); - - prev_first_sect = drbd_md_first_sector(device->ldev); - prev_size = device->ldev->md.md_size_sect; - la_size_sect = device->ldev->md.la_size_sect; + /* remember current offset and sizes */ + prev.last_agreed_sect = md->la_size_sect; + prev.md_offset = md->md_offset; + prev.al_offset = md->al_offset; + prev.bm_offset = md->bm_offset; + prev.md_size_sect = md->md_size_sect; + prev.al_stripes = md->al_stripes; + prev.al_stripe_size_4k = md->al_stripe_size_4k; if (rs) { /* rs is non NULL if we should change the AL layout only */ - - prev_al_stripes = md->al_stripes; - prev_al_stripe_size_4k = md->al_stripe_size_4k; - md->al_stripes = rs->al_stripes; md->al_stripe_size_4k = rs->al_stripe_size / 4; md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4; @@ -924,7 +947,7 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct rcu_read_unlock(); size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED); - if (size < la_size_sect) { + if (size < prev.last_agreed_sect) { if (rs && u_size == 0) { /* Remove "rs &&" later. This check should always be active, but right now the receiver expects the permissive behavior */ @@ -945,30 +968,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC)); if (unlikely(err)) { /* currently there is only one error: ENOMEM! */ - size = drbd_bm_capacity(device)>>1; + size = drbd_bm_capacity(device); if (size == 0) { drbd_err(device, "OUT OF MEMORY! " "Could not allocate bitmap!\n"); } else { drbd_err(device, "BM resizing failed. " - "Leaving size unchanged at size = %lu KB\n", - (unsigned long)size); + "Leaving size unchanged\n"); } rv = DS_ERROR; } /* racy, see comments above. */ drbd_set_my_capacity(device, size); - device->ldev->md.la_size_sect = size; + md->la_size_sect = size; drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), (unsigned long long)size>>1); } if (rv <= DS_ERROR) goto err_out; - la_size_changed = (la_size_sect != device->ldev->md.la_size_sect); + la_size_changed = (prev.last_agreed_sect != md->la_size_sect); - md_moved = prev_first_sect != drbd_md_first_sector(device->ldev) - || prev_size != device->ldev->md.md_size_sect; + md_moved = prev.md_offset != md->md_offset + || prev.md_size_sect != md->md_size_sect; if (la_size_changed || md_moved || rs) { u32 prev_flags; @@ -977,20 +999,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct * Clear the timer, to avoid scary "timer expired!" messages, * "Superblock" is written out at least twice below, anyways. */ del_timer(&device->md_sync_timer); - drbd_al_shrink(device); /* All extents inactive. */ + /* We won't change the "al-extents" setting, we just may need + * to move the on-disk location of the activity log ringbuffer. + * Lock for transaction is good enough, it may well be "dirty" + * or even "starving". */ + wait_event(device->al_wait, lc_try_lock_for_transaction(device->act_log)); + + /* mark current on-disk bitmap and activity log as unreliable */ prev_flags = md->flags; - md->flags &= ~MDF_PRIMARY_IND; + md->flags |= MDF_FULL_SYNC | MDF_AL_DISABLED; drbd_md_write(device, buffer); + drbd_al_initialize(device, buffer); + drbd_info(device, "Writing the whole bitmap, %s\n", la_size_changed && md_moved ? "size changed and md moved" : la_size_changed ? "size changed" : "md moved"); /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write, "size changed", BM_LOCKED_MASK); - drbd_initialize_al(device, buffer); + /* on-disk bitmap and activity log is authoritative again + * (unless there was an IO error meanwhile...) */ md->flags = prev_flags; drbd_md_write(device, buffer); @@ -999,20 +1030,22 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct md->al_stripes, md->al_stripe_size_4k * 4); } - if (size > la_size_sect) - rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO; - if (size < la_size_sect) + if (size > prev.last_agreed_sect) + rv = prev.last_agreed_sect ? DS_GREW : DS_GREW_FROM_ZERO; + if (size < prev.last_agreed_sect) rv = DS_SHRUNK; if (0) { err_out: - if (rs) { - md->al_stripes = prev_al_stripes; - md->al_stripe_size_4k = prev_al_stripe_size_4k; - md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k; - - drbd_md_set_sector_offsets(device, device->ldev); - } + /* restore previous offset and sizes */ + md->la_size_sect = prev.last_agreed_sect; + md->md_offset = prev.md_offset; + md->al_offset = prev.al_offset; + md->bm_offset = prev.bm_offset; + md->md_size_sect = prev.md_size_sect; + md->al_stripes = prev.al_stripes; + md->al_stripe_size_4k = prev.al_stripe_size_4k; + md->al_size_4k = (u64)prev.al_stripes * prev.al_stripe_size_4k; } lc_unlock(device->act_log); wake_up(&device->al_wait); @@ -1115,8 +1148,7 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc) lc_destroy(n); return -EBUSY; } else { - if (t) - lc_destroy(t); + lc_destroy(t); } drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elemens */ return 0; @@ -1151,21 +1183,20 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi if (b) { struct drbd_connection *connection = first_peer_device(device)->connection; + blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS); + if (blk_queue_discard(b) && (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) { - /* For now, don't allow more than one activity log extent worth of data - * to be discarded in one go. We may need to rework drbd_al_begin_io() - * to allow for even larger discard ranges */ - blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS); - + /* We don't care, stacking below should fix it for the local device. + * Whether or not it is a suitable granularity on the remote device + * is not our problem, really. If you care, you need to + * use devices with similar topology on all peers. */ + q->limits.discard_granularity = 512; queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); - /* REALLY? Is stacking secdiscard "legal"? */ - if (blk_queue_secdiscard(b)) - queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q); } else { blk_queue_max_discard_sectors(q, 0); queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); - queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q); + q->limits.discard_granularity = 0; } blk_queue_stack_limits(q, b); @@ -1177,6 +1208,12 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; } } + /* To avoid confusion, if this queue does not support discard, clear + * max_discard_sectors, which is what lsblk -D reports to the user. */ + if (!blk_queue_discard(q)) { + blk_queue_max_discard_sectors(q, 0); + q->limits.discard_granularity = 0; + } } void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev) @@ -1241,8 +1278,8 @@ static void conn_reconfig_done(struct drbd_connection *connection) connection->cstate == C_STANDALONE; spin_unlock_irq(&connection->resource->req_lock); if (stop_threads) { - /* asender is implicitly stopped by receiver - * in conn_disconnect() */ + /* ack_receiver thread and ack_sender workqueue are implicitly + * stopped by receiver in conn_disconnect() */ drbd_thread_stop(&connection->receiver); drbd_thread_stop(&connection->worker); } @@ -1389,13 +1426,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) goto fail_unlock; } - write_lock_irq(&global_state_lock); + lock_all_resources(); retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); if (retcode == NO_ERROR) { rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); drbd_resync_after_changed(device); } - write_unlock_irq(&global_state_lock); + unlock_all_resources(); if (retcode != NO_ERROR) goto fail_unlock; @@ -1418,7 +1455,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) set_bit(MD_NO_FUA, &device->flags); if (write_ordering_changed(old_disk_conf, new_disk_conf)) - drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush); + drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH); drbd_md_sync(device); @@ -1449,6 +1486,88 @@ success: return 0; } +static struct block_device *open_backing_dev(struct drbd_device *device, + const char *bdev_path, void *claim_ptr, bool do_bd_link) +{ + struct block_device *bdev; + int err = 0; + + bdev = blkdev_get_by_path(bdev_path, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, claim_ptr); + if (IS_ERR(bdev)) { + drbd_err(device, "open(\"%s\") failed with %ld\n", + bdev_path, PTR_ERR(bdev)); + return bdev; + } + + if (!do_bd_link) + return bdev; + + err = bd_link_disk_holder(bdev, device->vdisk); + if (err) { + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + drbd_err(device, "bd_link_disk_holder(\"%s\", ...) failed with %d\n", + bdev_path, err); + bdev = ERR_PTR(err); + } + return bdev; +} + +static int open_backing_devices(struct drbd_device *device, + struct disk_conf *new_disk_conf, + struct drbd_backing_dev *nbc) +{ + struct block_device *bdev; + + bdev = open_backing_dev(device, new_disk_conf->backing_dev, device, true); + if (IS_ERR(bdev)) + return ERR_OPEN_DISK; + nbc->backing_bdev = bdev; + + /* + * meta_dev_idx >= 0: external fixed size, possibly multiple + * drbd sharing one meta device. TODO in that case, paranoia + * check that [md_bdev, meta_dev_idx] is not yet used by some + * other drbd minor! (if you use drbd.conf + drbdadm, that + * should check it for you already; but if you don't, or + * someone fooled it, we need to double check here) + */ + bdev = open_backing_dev(device, new_disk_conf->meta_dev, + /* claim ptr: device, if claimed exclusively; shared drbd_m_holder, + * if potentially shared with other drbd minors */ + (new_disk_conf->meta_dev_idx < 0) ? (void*)device : (void*)drbd_m_holder, + /* avoid double bd_claim_by_disk() for the same (source,target) tuple, + * as would happen with internal metadata. */ + (new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_FLEX_INT && + new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_INTERNAL)); + if (IS_ERR(bdev)) + return ERR_OPEN_MD_DISK; + nbc->md_bdev = bdev; + return NO_ERROR; +} + +static void close_backing_dev(struct drbd_device *device, struct block_device *bdev, + bool do_bd_unlink) +{ + if (!bdev) + return; + if (do_bd_unlink) + bd_unlink_disk_holder(bdev, device->vdisk); + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); +} + +void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev) +{ + if (ldev == NULL) + return; + + close_backing_dev(device, ldev->md_bdev, ldev->md_bdev != ldev->backing_bdev); + close_backing_dev(device, ldev->backing_bdev, true); + + kfree(ldev->disk_conf); + kfree(ldev); +} + int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; @@ -1462,7 +1581,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) sector_t min_md_device_sectors; struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ struct disk_conf *new_disk_conf = NULL; - struct block_device *bdev; struct lru_cache *resync_lru = NULL; struct fifo_buffer *new_plan = NULL; union drbd_state ns, os; @@ -1478,7 +1596,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) device = adm_ctx.device; mutex_lock(&adm_ctx.resource->adm_mutex); peer_device = first_peer_device(device); - connection = peer_device ? peer_device->connection : NULL; + connection = peer_device->connection; conn_reconfig_start(connection); /* if you want to reconfigure, please tear down first */ @@ -1539,12 +1657,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) goto fail; } - write_lock_irq(&global_state_lock); - retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); - write_unlock_irq(&global_state_lock); - if (retcode != NO_ERROR) - goto fail; - rcu_read_lock(); nc = rcu_dereference(connection->net_conf); if (nc) { @@ -1556,35 +1668,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) } rcu_read_unlock(); - bdev = blkdev_get_by_path(new_disk_conf->backing_dev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, device); - if (IS_ERR(bdev)) { - drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev, - PTR_ERR(bdev)); - retcode = ERR_OPEN_DISK; - goto fail; - } - nbc->backing_bdev = bdev; - - /* - * meta_dev_idx >= 0: external fixed size, possibly multiple - * drbd sharing one meta device. TODO in that case, paranoia - * check that [md_bdev, meta_dev_idx] is not yet used by some - * other drbd minor! (if you use drbd.conf + drbdadm, that - * should check it for you already; but if you don't, or - * someone fooled it, we need to double check here) - */ - bdev = blkdev_get_by_path(new_disk_conf->meta_dev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, - (new_disk_conf->meta_dev_idx < 0) ? - (void *)device : (void *)drbd_m_holder); - if (IS_ERR(bdev)) { - drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev, - PTR_ERR(bdev)); - retcode = ERR_OPEN_MD_DISK; + retcode = open_backing_devices(device, new_disk_conf, nbc); + if (retcode != NO_ERROR) goto fail; - } - nbc->md_bdev = bdev; if ((nbc->backing_bdev == nbc->md_bdev) != (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL || @@ -1707,6 +1793,13 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) goto force_diskless_dec; } + lock_all_resources(); + retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); + if (retcode != NO_ERROR) { + unlock_all_resources(); + goto force_diskless_dec; + } + /* Reset the "barriers don't work" bits here, then force meta data to * be written, to ensure we determine if barriers are supported. */ if (new_disk_conf->md_flushes) @@ -1727,7 +1820,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) new_disk_conf = NULL; new_plan = NULL; - drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush); + drbd_resync_after_changed(device); + drbd_bump_write_ordering(device->resource, device->ldev, WO_BDEV_FLUSH); + unlock_all_resources(); if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY)) set_bit(CRASHED_PRIMARY, &device->flags); @@ -1875,12 +1970,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) fail: conn_reconfig_done(connection); if (nbc) { - if (nbc->backing_bdev) - blkdev_put(nbc->backing_bdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL); - if (nbc->md_bdev) - blkdev_put(nbc->md_bdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL); + close_backing_dev(device, nbc->md_bdev, nbc->md_bdev != nbc->backing_bdev); + close_backing_dev(device, nbc->backing_bdev, true); kfree(nbc); } kfree(new_disk_conf); @@ -1895,6 +1986,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) static int adm_detach(struct drbd_device *device, int force) { enum drbd_state_rv retcode; + void *buffer; int ret; if (force) { @@ -1905,13 +1997,16 @@ static int adm_detach(struct drbd_device *device, int force) } drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */ - drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */ - retcode = drbd_request_state(device, NS(disk, D_FAILED)); - drbd_md_put_buffer(device); + buffer = drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */ + if (buffer) { + retcode = drbd_request_state(device, NS(disk, D_FAILED)); + drbd_md_put_buffer(device); + } else /* already <= D_FAILED */ + retcode = SS_NOTHING_TO_DO; /* D_FAILED will transition to DISKLESS. */ + drbd_resume_io(device); ret = wait_event_interruptible(device->misc_wait, device->state.disk != D_FAILED); - drbd_resume_io(device); if ((int)retcode == (int)SS_IS_DISKLESS) retcode = SS_NOTHING_TO_DO; if (ret) @@ -2245,8 +2340,31 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) return 0; } +static void connection_to_info(struct connection_info *info, + struct drbd_connection *connection) +{ + info->conn_connection_state = connection->cstate; + info->conn_role = conn_highest_peer(connection); +} + +static void peer_device_to_info(struct peer_device_info *info, + struct drbd_peer_device *peer_device) +{ + struct drbd_device *device = peer_device->device; + + info->peer_repl_state = + max_t(enum drbd_conns, C_WF_REPORT_PARAMS, device->state.conn); + info->peer_disk_state = device->state.pdsk; + info->peer_resync_susp_user = device->state.user_isp; + info->peer_resync_susp_peer = device->state.peer_isp; + info->peer_resync_susp_dependency = device->state.aftr_isp; +} + int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) { + struct connection_info connection_info; + enum drbd_notification_type flags; + unsigned int peer_devices = 0; struct drbd_config_context adm_ctx; struct drbd_peer_device *peer_device; struct net_conf *old_net_conf, *new_net_conf = NULL; @@ -2347,6 +2465,22 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) connection->peer_addr_len = nla_len(adm_ctx.peer_addr); memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len); + idr_for_each_entry(&connection->peer_devices, peer_device, i) { + peer_devices++; + } + + connection_to_info(&connection_info, connection); + flags = (peer_devices--) ? NOTIFY_CONTINUES : 0; + mutex_lock(¬ification_mutex); + notify_connection_state(NULL, 0, connection, &connection_info, NOTIFY_CREATE | flags); + idr_for_each_entry(&connection->peer_devices, peer_device, i) { + struct peer_device_info peer_device_info; + + peer_device_to_info(&peer_device_info, peer_device); + flags = (peer_devices--) ? NOTIFY_CONTINUES : 0; + notify_peer_device_state(NULL, 0, peer_device, &peer_device_info, NOTIFY_CREATE | flags); + } + mutex_unlock(¬ification_mutex); mutex_unlock(&adm_ctx.resource->conf_update); rcu_read_lock(); @@ -2428,6 +2562,8 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection drbd_err(connection, "unexpected rv2=%d in conn_try_disconnect()\n", rv2); + /* Unlike in DRBD 9, the state engine has generated + * NOTIFY_DESTROY events before clearing connection->net_conf. */ } return rv; } @@ -2585,6 +2721,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) mutex_unlock(&device->resource->conf_update); synchronize_rcu(); kfree(old_disk_conf); + new_disk_conf = NULL; } ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); @@ -2618,6 +2755,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) fail_ldev: put_ldev(device); + kfree(new_disk_conf); goto fail; } @@ -2855,7 +2993,30 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; if (test_bit(NEW_CUR_UUID, &device->flags)) { - drbd_uuid_new_current(device); + if (get_ldev_if_state(device, D_ATTACHING)) { + drbd_uuid_new_current(device); + put_ldev(device); + } else { + /* This is effectively a multi-stage "forced down". + * The NEW_CUR_UUID bit is supposedly only set, if we + * lost the replication connection, and are configured + * to freeze IO and wait for some fence-peer handler. + * So we still don't have a replication connection. + * And now we don't have a local disk either. After + * resume, we will fail all pending and new IO, because + * we don't have any data anymore. Which means we will + * eventually be able to terminate all users of this + * device, and then take it down. By bumping the + * "effective" data uuid, we make sure that you really + * need to tear down before you reconfigure, we will + * the refuse to re-connect or re-attach (because no + * matching real data uuid exists). + */ + u64 val; + get_random_bytes(&val, sizeof(u64)); + drbd_set_ed_uuid(device, val); + drbd_warn(device, "Resumed without access to data; please tear down before attempting to re-configure.\n"); + } clear_bit(NEW_CUR_UUID, &device->flags); } drbd_suspend_io(device); @@ -2910,6 +3071,486 @@ nla_put_failure: } /* + * The generic netlink dump callbacks are called outside the genl_lock(), so + * they cannot use the simple attribute parsing code which uses global + * attribute tables. + */ +static struct nlattr *find_cfg_context_attr(const struct nlmsghdr *nlh, int attr) +{ + const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ; + const int maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1; + struct nlattr *nla; + + nla = nla_find(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), + DRBD_NLA_CFG_CONTEXT); + if (!nla) + return NULL; + return drbd_nla_find_nested(maxtype, nla, __nla_type(attr)); +} + +static void resource_to_info(struct resource_info *, struct drbd_resource *); + +int drbd_adm_dump_resources(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct drbd_genlmsghdr *dh; + struct drbd_resource *resource; + struct resource_info resource_info; + struct resource_statistics resource_statistics; + int err; + + rcu_read_lock(); + if (cb->args[0]) { + for_each_resource_rcu(resource, &drbd_resources) + if (resource == (struct drbd_resource *)cb->args[0]) + goto found_resource; + err = 0; /* resource was probably deleted */ + goto out; + } + resource = list_entry(&drbd_resources, + struct drbd_resource, resources); + +found_resource: + list_for_each_entry_continue_rcu(resource, &drbd_resources, resources) { + goto put_result; + } + err = 0; + goto out; + +put_result: + dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_RESOURCES); + err = -ENOMEM; + if (!dh) + goto out; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + err = nla_put_drbd_cfg_context(skb, resource, NULL, NULL); + if (err) + goto out; + err = res_opts_to_skb(skb, &resource->res_opts, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + resource_to_info(&resource_info, resource); + err = resource_info_to_skb(skb, &resource_info, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + resource_statistics.res_stat_write_ordering = resource->write_ordering; + err = resource_statistics_to_skb(skb, &resource_statistics, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + cb->args[0] = (long)resource; + genlmsg_end(skb, dh); + err = 0; + +out: + rcu_read_unlock(); + if (err) + return err; + return skb->len; +} + +static void device_to_statistics(struct device_statistics *s, + struct drbd_device *device) +{ + memset(s, 0, sizeof(*s)); + s->dev_upper_blocked = !may_inc_ap_bio(device); + if (get_ldev(device)) { + struct drbd_md *md = &device->ldev->md; + u64 *history_uuids = (u64 *)s->history_uuids; + struct request_queue *q; + int n; + + spin_lock_irq(&md->uuid_lock); + s->dev_current_uuid = md->uuid[UI_CURRENT]; + BUILD_BUG_ON(sizeof(s->history_uuids) < UI_HISTORY_END - UI_HISTORY_START + 1); + for (n = 0; n < UI_HISTORY_END - UI_HISTORY_START + 1; n++) + history_uuids[n] = md->uuid[UI_HISTORY_START + n]; + for (; n < HISTORY_UUIDS; n++) + history_uuids[n] = 0; + s->history_uuids_len = HISTORY_UUIDS; + spin_unlock_irq(&md->uuid_lock); + + s->dev_disk_flags = md->flags; + q = bdev_get_queue(device->ldev->backing_bdev); + s->dev_lower_blocked = + bdi_congested(&q->backing_dev_info, + (1 << WB_async_congested) | + (1 << WB_sync_congested)); + put_ldev(device); + } + s->dev_size = drbd_get_capacity(device->this_bdev); + s->dev_read = device->read_cnt; + s->dev_write = device->writ_cnt; + s->dev_al_writes = device->al_writ_cnt; + s->dev_bm_writes = device->bm_writ_cnt; + s->dev_upper_pending = atomic_read(&device->ap_bio_cnt); + s->dev_lower_pending = atomic_read(&device->local_cnt); + s->dev_al_suspended = test_bit(AL_SUSPENDED, &device->flags); + s->dev_exposed_data_uuid = device->ed_uuid; +} + +static int put_resource_in_arg0(struct netlink_callback *cb, int holder_nr) +{ + if (cb->args[0]) { + struct drbd_resource *resource = + (struct drbd_resource *)cb->args[0]; + kref_put(&resource->kref, drbd_destroy_resource); + } + + return 0; +} + +int drbd_adm_dump_devices_done(struct netlink_callback *cb) { + return put_resource_in_arg0(cb, 7); +} + +static void device_to_info(struct device_info *, struct drbd_device *); + +int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *resource_filter; + struct drbd_resource *resource; + struct drbd_device *uninitialized_var(device); + int minor, err, retcode; + struct drbd_genlmsghdr *dh; + struct device_info device_info; + struct device_statistics device_statistics; + struct idr *idr_to_search; + + resource = (struct drbd_resource *)cb->args[0]; + if (!cb->args[0] && !cb->args[1]) { + resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name); + if (resource_filter) { + retcode = ERR_RES_NOT_KNOWN; + resource = drbd_find_resource(nla_data(resource_filter)); + if (!resource) + goto put_result; + cb->args[0] = (long)resource; + } + } + + rcu_read_lock(); + minor = cb->args[1]; + idr_to_search = resource ? &resource->devices : &drbd_devices; + device = idr_get_next(idr_to_search, &minor); + if (!device) { + err = 0; + goto out; + } + idr_for_each_entry_continue(idr_to_search, device, minor) { + retcode = NO_ERROR; + goto put_result; /* only one iteration */ + } + err = 0; + goto out; /* no more devices */ + +put_result: + dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_DEVICES); + err = -ENOMEM; + if (!dh) + goto out; + dh->ret_code = retcode; + dh->minor = -1U; + if (retcode == NO_ERROR) { + dh->minor = device->minor; + err = nla_put_drbd_cfg_context(skb, device->resource, NULL, device); + if (err) + goto out; + if (get_ldev(device)) { + struct disk_conf *disk_conf = + rcu_dereference(device->ldev->disk_conf); + + err = disk_conf_to_skb(skb, disk_conf, !capable(CAP_SYS_ADMIN)); + put_ldev(device); + if (err) + goto out; + } + device_to_info(&device_info, device); + err = device_info_to_skb(skb, &device_info, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + + device_to_statistics(&device_statistics, device); + err = device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + cb->args[1] = minor + 1; + } + genlmsg_end(skb, dh); + err = 0; + +out: + rcu_read_unlock(); + if (err) + return err; + return skb->len; +} + +int drbd_adm_dump_connections_done(struct netlink_callback *cb) +{ + return put_resource_in_arg0(cb, 6); +} + +enum { SINGLE_RESOURCE, ITERATE_RESOURCES }; + +int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *resource_filter; + struct drbd_resource *resource = NULL, *next_resource; + struct drbd_connection *uninitialized_var(connection); + int err = 0, retcode; + struct drbd_genlmsghdr *dh; + struct connection_info connection_info; + struct connection_statistics connection_statistics; + + rcu_read_lock(); + resource = (struct drbd_resource *)cb->args[0]; + if (!cb->args[0]) { + resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name); + if (resource_filter) { + retcode = ERR_RES_NOT_KNOWN; + resource = drbd_find_resource(nla_data(resource_filter)); + if (!resource) + goto put_result; + cb->args[0] = (long)resource; + cb->args[1] = SINGLE_RESOURCE; + } + } + if (!resource) { + if (list_empty(&drbd_resources)) + goto out; + resource = list_first_entry(&drbd_resources, struct drbd_resource, resources); + kref_get(&resource->kref); + cb->args[0] = (long)resource; + cb->args[1] = ITERATE_RESOURCES; + } + + next_resource: + rcu_read_unlock(); + mutex_lock(&resource->conf_update); + rcu_read_lock(); + if (cb->args[2]) { + for_each_connection_rcu(connection, resource) + if (connection == (struct drbd_connection *)cb->args[2]) + goto found_connection; + /* connection was probably deleted */ + goto no_more_connections; + } + connection = list_entry(&resource->connections, struct drbd_connection, connections); + +found_connection: + list_for_each_entry_continue_rcu(connection, &resource->connections, connections) { + if (!has_net_conf(connection)) + continue; + retcode = NO_ERROR; + goto put_result; /* only one iteration */ + } + +no_more_connections: + if (cb->args[1] == ITERATE_RESOURCES) { + for_each_resource_rcu(next_resource, &drbd_resources) { + if (next_resource == resource) + goto found_resource; + } + /* resource was probably deleted */ + } + goto out; + +found_resource: + list_for_each_entry_continue_rcu(next_resource, &drbd_resources, resources) { + mutex_unlock(&resource->conf_update); + kref_put(&resource->kref, drbd_destroy_resource); + resource = next_resource; + kref_get(&resource->kref); + cb->args[0] = (long)resource; + cb->args[2] = 0; + goto next_resource; + } + goto out; /* no more resources */ + +put_result: + dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_CONNECTIONS); + err = -ENOMEM; + if (!dh) + goto out; + dh->ret_code = retcode; + dh->minor = -1U; + if (retcode == NO_ERROR) { + struct net_conf *net_conf; + + err = nla_put_drbd_cfg_context(skb, resource, connection, NULL); + if (err) + goto out; + net_conf = rcu_dereference(connection->net_conf); + if (net_conf) { + err = net_conf_to_skb(skb, net_conf, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + } + connection_to_info(&connection_info, connection); + err = connection_info_to_skb(skb, &connection_info, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + connection_statistics.conn_congested = test_bit(NET_CONGESTED, &connection->flags); + err = connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + cb->args[2] = (long)connection; + } + genlmsg_end(skb, dh); + err = 0; + +out: + rcu_read_unlock(); + if (resource) + mutex_unlock(&resource->conf_update); + if (err) + return err; + return skb->len; +} + +enum mdf_peer_flag { + MDF_PEER_CONNECTED = 1 << 0, + MDF_PEER_OUTDATED = 1 << 1, + MDF_PEER_FENCING = 1 << 2, + MDF_PEER_FULL_SYNC = 1 << 3, +}; + +static void peer_device_to_statistics(struct peer_device_statistics *s, + struct drbd_peer_device *peer_device) +{ + struct drbd_device *device = peer_device->device; + + memset(s, 0, sizeof(*s)); + s->peer_dev_received = device->recv_cnt; + s->peer_dev_sent = device->send_cnt; + s->peer_dev_pending = atomic_read(&device->ap_pending_cnt) + + atomic_read(&device->rs_pending_cnt); + s->peer_dev_unacked = atomic_read(&device->unacked_cnt); + s->peer_dev_out_of_sync = drbd_bm_total_weight(device) << (BM_BLOCK_SHIFT - 9); + s->peer_dev_resync_failed = device->rs_failed << (BM_BLOCK_SHIFT - 9); + if (get_ldev(device)) { + struct drbd_md *md = &device->ldev->md; + + spin_lock_irq(&md->uuid_lock); + s->peer_dev_bitmap_uuid = md->uuid[UI_BITMAP]; + spin_unlock_irq(&md->uuid_lock); + s->peer_dev_flags = + (drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND) ? + MDF_PEER_CONNECTED : 0) + + (drbd_md_test_flag(device->ldev, MDF_CONSISTENT) && + !drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE) ? + MDF_PEER_OUTDATED : 0) + + /* FIXME: MDF_PEER_FENCING? */ + (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ? + MDF_PEER_FULL_SYNC : 0); + put_ldev(device); + } +} + +int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb) +{ + return put_resource_in_arg0(cb, 9); +} + +int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *resource_filter; + struct drbd_resource *resource; + struct drbd_device *uninitialized_var(device); + struct drbd_peer_device *peer_device = NULL; + int minor, err, retcode; + struct drbd_genlmsghdr *dh; + struct idr *idr_to_search; + + resource = (struct drbd_resource *)cb->args[0]; + if (!cb->args[0] && !cb->args[1]) { + resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name); + if (resource_filter) { + retcode = ERR_RES_NOT_KNOWN; + resource = drbd_find_resource(nla_data(resource_filter)); + if (!resource) + goto put_result; + } + cb->args[0] = (long)resource; + } + + rcu_read_lock(); + minor = cb->args[1]; + idr_to_search = resource ? &resource->devices : &drbd_devices; + device = idr_find(idr_to_search, minor); + if (!device) { +next_device: + minor++; + cb->args[2] = 0; + device = idr_get_next(idr_to_search, &minor); + if (!device) { + err = 0; + goto out; + } + } + if (cb->args[2]) { + for_each_peer_device(peer_device, device) + if (peer_device == (struct drbd_peer_device *)cb->args[2]) + goto found_peer_device; + /* peer device was probably deleted */ + goto next_device; + } + /* Make peer_device point to the list head (not the first entry). */ + peer_device = list_entry(&device->peer_devices, struct drbd_peer_device, peer_devices); + +found_peer_device: + list_for_each_entry_continue_rcu(peer_device, &device->peer_devices, peer_devices) { + if (!has_net_conf(peer_device->connection)) + continue; + retcode = NO_ERROR; + goto put_result; /* only one iteration */ + } + goto next_device; + +put_result: + dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_PEER_DEVICES); + err = -ENOMEM; + if (!dh) + goto out; + dh->ret_code = retcode; + dh->minor = -1U; + if (retcode == NO_ERROR) { + struct peer_device_info peer_device_info; + struct peer_device_statistics peer_device_statistics; + + dh->minor = minor; + err = nla_put_drbd_cfg_context(skb, device->resource, peer_device->connection, device); + if (err) + goto out; + peer_device_to_info(&peer_device_info, peer_device); + err = peer_device_info_to_skb(skb, &peer_device_info, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + peer_device_to_statistics(&peer_device_statistics, peer_device); + err = peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + cb->args[1] = minor; + cb->args[2] = (long)peer_device; + } + genlmsg_end(skb, dh); + err = 0; + +out: + rcu_read_unlock(); + if (err) + return err; + return skb->len; +} +/* * Return the connection of @resource if @resource has exactly one connection. */ static struct drbd_connection *the_only_connection(struct drbd_resource *resource) @@ -3414,8 +4055,18 @@ drbd_check_resource_name(struct drbd_config_context *adm_ctx) return NO_ERROR; } +static void resource_to_info(struct resource_info *info, + struct drbd_resource *resource) +{ + info->res_role = conn_highest_role(first_connection(resource)); + info->res_susp = resource->susp; + info->res_susp_nod = resource->susp_nod; + info->res_susp_fen = resource->susp_fen; +} + int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) { + struct drbd_connection *connection; struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct res_opts res_opts; @@ -3449,13 +4100,33 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) } /* not yet safe for genl_family.parallel_ops */ - if (!conn_create(adm_ctx.resource_name, &res_opts)) + mutex_lock(&resources_mutex); + connection = conn_create(adm_ctx.resource_name, &res_opts); + mutex_unlock(&resources_mutex); + + if (connection) { + struct resource_info resource_info; + + mutex_lock(¬ification_mutex); + resource_to_info(&resource_info, connection->resource); + notify_resource_state(NULL, 0, connection->resource, + &resource_info, NOTIFY_CREATE); + mutex_unlock(¬ification_mutex); + } else retcode = ERR_NOMEM; + out: drbd_adm_finish(&adm_ctx, info, retcode); return 0; } +static void device_to_info(struct device_info *info, + struct drbd_device *device) +{ + info->dev_disk_state = device->state.disk; +} + + int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; @@ -3490,6 +4161,36 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) mutex_lock(&adm_ctx.resource->adm_mutex); retcode = drbd_create_device(&adm_ctx, dh->minor); + if (retcode == NO_ERROR) { + struct drbd_device *device; + struct drbd_peer_device *peer_device; + struct device_info info; + unsigned int peer_devices = 0; + enum drbd_notification_type flags; + + device = minor_to_device(dh->minor); + for_each_peer_device(peer_device, device) { + if (!has_net_conf(peer_device->connection)) + continue; + peer_devices++; + } + + device_to_info(&info, device); + mutex_lock(¬ification_mutex); + flags = (peer_devices--) ? NOTIFY_CONTINUES : 0; + notify_device_state(NULL, 0, device, &info, NOTIFY_CREATE | flags); + for_each_peer_device(peer_device, device) { + struct peer_device_info peer_device_info; + + if (!has_net_conf(peer_device->connection)) + continue; + peer_device_to_info(&peer_device_info, peer_device); + flags = (peer_devices--) ? NOTIFY_CONTINUES : 0; + notify_peer_device_state(NULL, 0, peer_device, &peer_device_info, + NOTIFY_CREATE | flags); + } + mutex_unlock(¬ification_mutex); + } mutex_unlock(&adm_ctx.resource->adm_mutex); out: drbd_adm_finish(&adm_ctx, info, retcode); @@ -3498,13 +4199,35 @@ out: static enum drbd_ret_code adm_del_minor(struct drbd_device *device) { + struct drbd_peer_device *peer_device; + if (device->state.disk == D_DISKLESS && /* no need to be device->state.conn == C_STANDALONE && * we may want to delete a minor from a live replication group. */ device->state.role == R_SECONDARY) { + struct drbd_connection *connection = + first_connection(device->resource); + _drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE + CS_WAIT_COMPLETE); + + /* If the state engine hasn't stopped the sender thread yet, we + * need to flush the sender work queue before generating the + * DESTROY events here. */ + if (get_t_state(&connection->worker) == RUNNING) + drbd_flush_workqueue(&connection->sender_work); + + mutex_lock(¬ification_mutex); + for_each_peer_device(peer_device, device) { + if (!has_net_conf(peer_device->connection)) + continue; + notify_peer_device_state(NULL, 0, peer_device, NULL, + NOTIFY_DESTROY | NOTIFY_CONTINUES); + } + notify_device_state(NULL, 0, device, NULL, NOTIFY_DESTROY); + mutex_unlock(¬ification_mutex); + drbd_delete_device(device); return NO_ERROR; } else @@ -3541,7 +4264,16 @@ static int adm_del_resource(struct drbd_resource *resource) if (!idr_is_empty(&resource->devices)) return ERR_RES_IN_USE; + /* The state engine has stopped the sender thread, so we don't + * need to flush the sender work queue before generating the + * DESTROY event here. */ + mutex_lock(¬ification_mutex); + notify_resource_state(NULL, 0, resource, NULL, NOTIFY_DESTROY); + mutex_unlock(¬ification_mutex); + + mutex_lock(&resources_mutex); list_del_rcu(&resource->resources); + mutex_unlock(&resources_mutex); /* Make sure all threads have actually stopped: state handling only * does drbd_thread_stop_nowait(). */ list_for_each_entry(connection, &resource->connections, connections) @@ -3637,7 +4369,6 @@ finish: void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib) { - static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */ struct sk_buff *msg; struct drbd_genlmsghdr *d_out; unsigned seq; @@ -3658,7 +4389,7 @@ void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib) if (nla_put_status_info(msg, device, sib)) goto nla_put_failure; genlmsg_end(msg, d_out); - err = drbd_genl_multicast_events(msg, 0); + err = drbd_genl_multicast_events(msg, GFP_NOWAIT); /* msg has been consumed or freed in netlink_broadcast() */ if (err && err != -ESRCH) goto failed; @@ -3672,3 +4403,405 @@ failed: "Event seq:%u sib_reason:%u\n", err, seq, sib->sib_reason); } + +static int nla_put_notification_header(struct sk_buff *msg, + enum drbd_notification_type type) +{ + struct drbd_notification_header nh = { + .nh_type = type, + }; + + return drbd_notification_header_to_skb(msg, &nh, true); +} + +void notify_resource_state(struct sk_buff *skb, + unsigned int seq, + struct drbd_resource *resource, + struct resource_info *resource_info, + enum drbd_notification_type type) +{ + struct resource_statistics resource_statistics; + struct drbd_genlmsghdr *dh; + bool multicast = false; + int err; + + if (!skb) { + seq = atomic_inc_return(¬ify_genl_seq); + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + err = -ENOMEM; + if (!skb) + goto failed; + multicast = true; + } + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_RESOURCE_STATE); + if (!dh) + goto nla_put_failure; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + if (nla_put_drbd_cfg_context(skb, resource, NULL, NULL) || + nla_put_notification_header(skb, type) || + ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY && + resource_info_to_skb(skb, resource_info, true))) + goto nla_put_failure; + resource_statistics.res_stat_write_ordering = resource->write_ordering; + err = resource_statistics_to_skb(skb, &resource_statistics, !capable(CAP_SYS_ADMIN)); + if (err) + goto nla_put_failure; + genlmsg_end(skb, dh); + if (multicast) { + err = drbd_genl_multicast_events(skb, GFP_NOWAIT); + /* skb has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto failed; + } + return; + +nla_put_failure: + nlmsg_free(skb); +failed: + drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n", + err, seq); +} + +void notify_device_state(struct sk_buff *skb, + unsigned int seq, + struct drbd_device *device, + struct device_info *device_info, + enum drbd_notification_type type) +{ + struct device_statistics device_statistics; + struct drbd_genlmsghdr *dh; + bool multicast = false; + int err; + + if (!skb) { + seq = atomic_inc_return(¬ify_genl_seq); + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + err = -ENOMEM; + if (!skb) + goto failed; + multicast = true; + } + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_DEVICE_STATE); + if (!dh) + goto nla_put_failure; + dh->minor = device->minor; + dh->ret_code = NO_ERROR; + if (nla_put_drbd_cfg_context(skb, device->resource, NULL, device) || + nla_put_notification_header(skb, type) || + ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY && + device_info_to_skb(skb, device_info, true))) + goto nla_put_failure; + device_to_statistics(&device_statistics, device); + device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN)); + genlmsg_end(skb, dh); + if (multicast) { + err = drbd_genl_multicast_events(skb, GFP_NOWAIT); + /* skb has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto failed; + } + return; + +nla_put_failure: + nlmsg_free(skb); +failed: + drbd_err(device, "Error %d while broadcasting event. Event seq:%u\n", + err, seq); +} + +void notify_connection_state(struct sk_buff *skb, + unsigned int seq, + struct drbd_connection *connection, + struct connection_info *connection_info, + enum drbd_notification_type type) +{ + struct connection_statistics connection_statistics; + struct drbd_genlmsghdr *dh; + bool multicast = false; + int err; + + if (!skb) { + seq = atomic_inc_return(¬ify_genl_seq); + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + err = -ENOMEM; + if (!skb) + goto failed; + multicast = true; + } + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_CONNECTION_STATE); + if (!dh) + goto nla_put_failure; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + if (nla_put_drbd_cfg_context(skb, connection->resource, connection, NULL) || + nla_put_notification_header(skb, type) || + ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY && + connection_info_to_skb(skb, connection_info, true))) + goto nla_put_failure; + connection_statistics.conn_congested = test_bit(NET_CONGESTED, &connection->flags); + connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN)); + genlmsg_end(skb, dh); + if (multicast) { + err = drbd_genl_multicast_events(skb, GFP_NOWAIT); + /* skb has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto failed; + } + return; + +nla_put_failure: + nlmsg_free(skb); +failed: + drbd_err(connection, "Error %d while broadcasting event. Event seq:%u\n", + err, seq); +} + +void notify_peer_device_state(struct sk_buff *skb, + unsigned int seq, + struct drbd_peer_device *peer_device, + struct peer_device_info *peer_device_info, + enum drbd_notification_type type) +{ + struct peer_device_statistics peer_device_statistics; + struct drbd_resource *resource = peer_device->device->resource; + struct drbd_genlmsghdr *dh; + bool multicast = false; + int err; + + if (!skb) { + seq = atomic_inc_return(¬ify_genl_seq); + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + err = -ENOMEM; + if (!skb) + goto failed; + multicast = true; + } + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_PEER_DEVICE_STATE); + if (!dh) + goto nla_put_failure; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + if (nla_put_drbd_cfg_context(skb, resource, peer_device->connection, peer_device->device) || + nla_put_notification_header(skb, type) || + ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY && + peer_device_info_to_skb(skb, peer_device_info, true))) + goto nla_put_failure; + peer_device_to_statistics(&peer_device_statistics, peer_device); + peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN)); + genlmsg_end(skb, dh); + if (multicast) { + err = drbd_genl_multicast_events(skb, GFP_NOWAIT); + /* skb has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto failed; + } + return; + +nla_put_failure: + nlmsg_free(skb); +failed: + drbd_err(peer_device, "Error %d while broadcasting event. Event seq:%u\n", + err, seq); +} + +void notify_helper(enum drbd_notification_type type, + struct drbd_device *device, struct drbd_connection *connection, + const char *name, int status) +{ + struct drbd_resource *resource = device ? device->resource : connection->resource; + struct drbd_helper_info helper_info; + unsigned int seq = atomic_inc_return(¬ify_genl_seq); + struct sk_buff *skb = NULL; + struct drbd_genlmsghdr *dh; + int err; + + strlcpy(helper_info.helper_name, name, sizeof(helper_info.helper_name)); + helper_info.helper_name_len = min(strlen(name), sizeof(helper_info.helper_name)); + helper_info.helper_status = status; + + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + err = -ENOMEM; + if (!skb) + goto fail; + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_HELPER); + if (!dh) + goto fail; + dh->minor = device ? device->minor : -1; + dh->ret_code = NO_ERROR; + mutex_lock(¬ification_mutex); + if (nla_put_drbd_cfg_context(skb, resource, connection, device) || + nla_put_notification_header(skb, type) || + drbd_helper_info_to_skb(skb, &helper_info, true)) + goto unlock_fail; + genlmsg_end(skb, dh); + err = drbd_genl_multicast_events(skb, GFP_NOWAIT); + skb = NULL; + /* skb has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto unlock_fail; + mutex_unlock(¬ification_mutex); + return; + +unlock_fail: + mutex_unlock(¬ification_mutex); +fail: + nlmsg_free(skb); + drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n", + err, seq); +} + +static void notify_initial_state_done(struct sk_buff *skb, unsigned int seq) +{ + struct drbd_genlmsghdr *dh; + int err; + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_INITIAL_STATE_DONE); + if (!dh) + goto nla_put_failure; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + if (nla_put_notification_header(skb, NOTIFY_EXISTS)) + goto nla_put_failure; + genlmsg_end(skb, dh); + return; + +nla_put_failure: + nlmsg_free(skb); + pr_err("Error %d sending event. Event seq:%u\n", err, seq); +} + +static void free_state_changes(struct list_head *list) +{ + while (!list_empty(list)) { + struct drbd_state_change *state_change = + list_first_entry(list, struct drbd_state_change, list); + list_del(&state_change->list); + forget_state_change(state_change); + } +} + +static unsigned int notifications_for_state_change(struct drbd_state_change *state_change) +{ + return 1 + + state_change->n_connections + + state_change->n_devices + + state_change->n_devices * state_change->n_connections; +} + +static int get_initial_state(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct drbd_state_change *state_change = (struct drbd_state_change *)cb->args[0]; + unsigned int seq = cb->args[2]; + unsigned int n; + enum drbd_notification_type flags = 0; + + /* There is no need for taking notification_mutex here: it doesn't + matter if the initial state events mix with later state chage + events; we can always tell the events apart by the NOTIFY_EXISTS + flag. */ + + cb->args[5]--; + if (cb->args[5] == 1) { + notify_initial_state_done(skb, seq); + goto out; + } + n = cb->args[4]++; + if (cb->args[4] < cb->args[3]) + flags |= NOTIFY_CONTINUES; + if (n < 1) { + notify_resource_state_change(skb, seq, state_change->resource, + NOTIFY_EXISTS | flags); + goto next; + } + n--; + if (n < state_change->n_connections) { + notify_connection_state_change(skb, seq, &state_change->connections[n], + NOTIFY_EXISTS | flags); + goto next; + } + n -= state_change->n_connections; + if (n < state_change->n_devices) { + notify_device_state_change(skb, seq, &state_change->devices[n], + NOTIFY_EXISTS | flags); + goto next; + } + n -= state_change->n_devices; + if (n < state_change->n_devices * state_change->n_connections) { + notify_peer_device_state_change(skb, seq, &state_change->peer_devices[n], + NOTIFY_EXISTS | flags); + goto next; + } + +next: + if (cb->args[4] == cb->args[3]) { + struct drbd_state_change *next_state_change = + list_entry(state_change->list.next, + struct drbd_state_change, list); + cb->args[0] = (long)next_state_change; + cb->args[3] = notifications_for_state_change(next_state_change); + cb->args[4] = 0; + } +out: + return skb->len; +} + +int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct drbd_resource *resource; + LIST_HEAD(head); + + if (cb->args[5] >= 1) { + if (cb->args[5] > 1) + return get_initial_state(skb, cb); + if (cb->args[0]) { + struct drbd_state_change *state_change = + (struct drbd_state_change *)cb->args[0]; + + /* connect list to head */ + list_add(&head, &state_change->list); + free_state_changes(&head); + } + return 0; + } + + cb->args[5] = 2; /* number of iterations */ + mutex_lock(&resources_mutex); + for_each_resource(resource, &drbd_resources) { + struct drbd_state_change *state_change; + + state_change = remember_old_state(resource, GFP_KERNEL); + if (!state_change) { + if (!list_empty(&head)) + free_state_changes(&head); + mutex_unlock(&resources_mutex); + return -ENOMEM; + } + copy_old_to_new_state_change(state_change); + list_add_tail(&state_change->list, &head); + cb->args[5] += notifications_for_state_change(state_change); + } + mutex_unlock(&resources_mutex); + + if (!list_empty(&head)) { + struct drbd_state_change *state_change = + list_entry(head.next, struct drbd_state_change, list); + cb->args[0] = (long)state_change; + cb->args[3] = notifications_for_state_change(state_change); + list_del(&head); /* detach list from head */ + } + + cb->args[2] = cb->nlh->nlmsg_seq; + return get_initial_state(skb, cb); +} diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 3b10fa6..6537b25 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -245,9 +245,9 @@ static int drbd_seq_show(struct seq_file *seq, void *v) char wp; static char write_ordering_chars[] = { - [WO_none] = 'n', - [WO_drain_io] = 'd', - [WO_bdev_flush] = 'f', + [WO_NONE] = 'n', + [WO_DRAIN_IO] = 'd', + [WO_BDEV_FLUSH] = 'f', }; seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h index 2da9104a..ef92453 100644 --- a/drivers/block/drbd/drbd_protocol.h +++ b/drivers/block/drbd/drbd_protocol.h @@ -23,7 +23,7 @@ enum drbd_packet { P_AUTH_RESPONSE = 0x11, P_STATE_CHG_REQ = 0x12, - /* asender (meta socket */ + /* (meta socket) */ P_PING = 0x13, P_PING_ACK = 0x14, P_RECV_ACK = 0x15, /* Used in protocol B */ diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index b4b5680..1957fe8 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -215,7 +215,7 @@ static void reclaim_finished_net_peer_reqs(struct drbd_device *device, } } -static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) +static void drbd_reclaim_net_peer_reqs(struct drbd_device *device) { LIST_HEAD(reclaimed); struct drbd_peer_request *peer_req, *t; @@ -223,11 +223,30 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) spin_lock_irq(&device->resource->req_lock); reclaim_finished_net_peer_reqs(device, &reclaimed); spin_unlock_irq(&device->resource->req_lock); - list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) drbd_free_net_peer_req(device, peer_req); } +static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (!atomic_read(&device->pp_in_use_by_net)) + continue; + + kref_get(&device->kref); + rcu_read_unlock(); + drbd_reclaim_net_peer_reqs(device); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); +} + /** * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) * @device: DRBD device. @@ -265,10 +284,15 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int if (atomic_read(&device->pp_in_use) < mxb) page = __drbd_alloc_pages(device, number); + /* Try to keep the fast path fast, but occasionally we need + * to reclaim the pages we lended to the network stack. */ + if (page && atomic_read(&device->pp_in_use_by_net) > 512) + drbd_reclaim_net_peer_reqs(device); + while (page == NULL) { prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); - drbd_kick_lo_and_reclaim_net(device); + drbd_reclaim_net_peer_reqs(device); if (atomic_read(&device->pp_in_use) < mxb) { page = __drbd_alloc_pages(device, number); @@ -1099,7 +1123,15 @@ randomize: return 0; } - drbd_thread_start(&connection->asender); + drbd_thread_start(&connection->ack_receiver); + /* opencoded create_singlethread_workqueue(), + * to be able to use format string arguments */ + connection->ack_sender = + alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, connection->resource->name); + if (!connection->ack_sender) { + drbd_err(connection, "Failed to create workqueue ack_sender\n"); + return 0; + } mutex_lock(&connection->resource->conf_update); /* The discard_my_data flag is a single-shot modifier to the next @@ -1178,7 +1210,7 @@ static void drbd_flush(struct drbd_connection *connection) struct drbd_peer_device *peer_device; int vnr; - if (connection->resource->write_ordering >= WO_bdev_flush) { + if (connection->resource->write_ordering >= WO_BDEV_FLUSH) { rcu_read_lock(); idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { struct drbd_device *device = peer_device->device; @@ -1203,7 +1235,7 @@ static void drbd_flush(struct drbd_connection *connection) /* would rather check on EOPNOTSUPP, but that is not reliable. * don't try again for ANY return value != 0 * if (rv == -EOPNOTSUPP) */ - drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io); + drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO); } put_ldev(device); kref_put(&device->kref, drbd_destroy_device); @@ -1299,10 +1331,10 @@ max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo) dc = rcu_dereference(bdev->disk_conf); - if (wo == WO_bdev_flush && !dc->disk_flushes) - wo = WO_drain_io; - if (wo == WO_drain_io && !dc->disk_drain) - wo = WO_none; + if (wo == WO_BDEV_FLUSH && !dc->disk_flushes) + wo = WO_DRAIN_IO; + if (wo == WO_DRAIN_IO && !dc->disk_drain) + wo = WO_NONE; return wo; } @@ -1319,13 +1351,13 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin enum write_ordering_e pwo; int vnr; static char *write_ordering_str[] = { - [WO_none] = "none", - [WO_drain_io] = "drain", - [WO_bdev_flush] = "flush", + [WO_NONE] = "none", + [WO_DRAIN_IO] = "drain", + [WO_BDEV_FLUSH] = "flush", }; pwo = resource->write_ordering; - if (wo != WO_bdev_flush) + if (wo != WO_BDEV_FLUSH) wo = min(pwo, wo); rcu_read_lock(); idr_for_each_entry(&resource->devices, device, vnr) { @@ -1343,7 +1375,7 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin rcu_read_unlock(); resource->write_ordering = wo; - if (pwo != resource->write_ordering || wo == WO_bdev_flush) + if (pwo != resource->write_ordering || wo == WO_BDEV_FLUSH) drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); } @@ -1380,7 +1412,7 @@ int drbd_submit_peer_request(struct drbd_device *device, if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) { /* wait for all pending IO completions, before we start * zeroing things out. */ - conn_wait_active_ee_empty(first_peer_device(device)->connection); + conn_wait_active_ee_empty(peer_req->peer_device->connection); /* add it to the active list now, * so we can find it to present it in debugfs */ peer_req->submit_jif = jiffies; @@ -1508,12 +1540,6 @@ static void conn_wait_active_ee_empty(struct drbd_connection *connection) rcu_read_unlock(); } -static struct drbd_peer_device * -conn_peer_device(struct drbd_connection *connection, int volume_number) -{ - return idr_find(&connection->peer_devices, volume_number); -} - static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi) { int rv; @@ -1533,7 +1559,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf * Therefore we must send the barrier_ack after the barrier request was * completed. */ switch (connection->resource->write_ordering) { - case WO_none: + case WO_NONE: if (rv == FE_RECYCLED) return 0; @@ -1546,8 +1572,8 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf drbd_warn(connection, "Allocation of an epoch failed, slowing down\n"); /* Fall through */ - case WO_bdev_flush: - case WO_drain_io: + case WO_BDEV_FLUSH: + case WO_DRAIN_IO: conn_wait_active_ee_empty(connection); drbd_flush(connection); @@ -1752,7 +1778,7 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req } /* - * e_end_resync_block() is called in asender context via + * e_end_resync_block() is called in ack_sender context via * drbd_finish_peer_reqs(). */ static int e_end_resync_block(struct drbd_work *w, int unused) @@ -1926,7 +1952,7 @@ static void restart_conflicting_writes(struct drbd_device *device, } /* - * e_end_block() is called in asender context via drbd_finish_peer_reqs(). + * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs(). */ static int e_end_block(struct drbd_work *w, int cancel) { @@ -1966,7 +1992,7 @@ static int e_end_block(struct drbd_work *w, int cancel) } else D_ASSERT(device, drbd_interval_empty(&peer_req->i)); - drbd_may_finish_epoch(first_peer_device(device)->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); + drbd_may_finish_epoch(peer_device->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); return err; } @@ -2098,7 +2124,7 @@ static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, co } rcu_read_lock(); - tp = rcu_dereference(first_peer_device(device)->connection->net_conf)->two_primaries; + tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries; rcu_read_unlock(); if (!tp) @@ -2217,7 +2243,7 @@ static int handle_write_conflicts(struct drbd_device *device, peer_req->w.cb = superseded ? e_send_superseded : e_send_retry_write; list_add_tail(&peer_req->w.list, &device->done_ee); - wake_asender(connection); + queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work); err = -ENOENT; goto out; @@ -2364,7 +2390,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * if (dp_flags & DP_SEND_RECEIVE_ACK) { /* I really don't like it that the receiver thread * sends on the msock, but anyways */ - drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req); + drbd_send_ack(peer_device, P_RECV_ACK, peer_req); } if (tp) { @@ -4056,7 +4082,7 @@ static int receive_state(struct drbd_connection *connection, struct packet_info os = ns = drbd_read_state(device); spin_unlock_irq(&device->resource->req_lock); - /* If some other part of the code (asender thread, timeout) + /* If some other part of the code (ack_receiver thread, timeout) * already decided to close the connection again, * we must not "re-establish" it here. */ if (os.conn <= C_TEAR_DOWN) @@ -4661,8 +4687,12 @@ static void conn_disconnect(struct drbd_connection *connection) */ conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); - /* asender does not clean up anything. it must not interfere, either */ - drbd_thread_stop(&connection->asender); + /* ack_receiver does not clean up anything. it must not interfere, either */ + drbd_thread_stop(&connection->ack_receiver); + if (connection->ack_sender) { + destroy_workqueue(connection->ack_sender); + connection->ack_sender = NULL; + } drbd_free_sock(connection); rcu_read_lock(); @@ -5431,49 +5461,39 @@ static int got_skip(struct drbd_connection *connection, struct packet_info *pi) return 0; } -static int connection_finish_peer_reqs(struct drbd_connection *connection) +struct meta_sock_cmd { + size_t pkt_size; + int (*fn)(struct drbd_connection *connection, struct packet_info *); +}; + +static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout) { - struct drbd_peer_device *peer_device; - int vnr, not_empty = 0; + long t; + struct net_conf *nc; - do { - clear_bit(SIGNAL_ASENDER, &connection->flags); - flush_signals(current); + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + t = ping_timeout ? nc->ping_timeo : nc->ping_int; + rcu_read_unlock(); - rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - kref_get(&device->kref); - rcu_read_unlock(); - if (drbd_finish_peer_reqs(device)) { - kref_put(&device->kref, drbd_destroy_device); - return 1; - } - kref_put(&device->kref, drbd_destroy_device); - rcu_read_lock(); - } - set_bit(SIGNAL_ASENDER, &connection->flags); + t *= HZ; + if (ping_timeout) + t /= 10; - spin_lock_irq(&connection->resource->req_lock); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - not_empty = !list_empty(&device->done_ee); - if (not_empty) - break; - } - spin_unlock_irq(&connection->resource->req_lock); - rcu_read_unlock(); - } while (not_empty); + connection->meta.socket->sk->sk_rcvtimeo = t; +} - return 0; +static void set_ping_timeout(struct drbd_connection *connection) +{ + set_rcvtimeo(connection, 1); } -struct asender_cmd { - size_t pkt_size; - int (*fn)(struct drbd_connection *connection, struct packet_info *); -}; +static void set_idle_timeout(struct drbd_connection *connection) +{ + set_rcvtimeo(connection, 0); +} -static struct asender_cmd asender_tbl[] = { +static struct meta_sock_cmd ack_receiver_tbl[] = { [P_PING] = { 0, got_Ping }, [P_PING_ACK] = { 0, got_PingAck }, [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, @@ -5493,64 +5513,40 @@ static struct asender_cmd asender_tbl[] = { [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck }, }; -int drbd_asender(struct drbd_thread *thi) +int drbd_ack_receiver(struct drbd_thread *thi) { struct drbd_connection *connection = thi->connection; - struct asender_cmd *cmd = NULL; + struct meta_sock_cmd *cmd = NULL; struct packet_info pi; + unsigned long pre_recv_jif; int rv; void *buf = connection->meta.rbuf; int received = 0; unsigned int header_size = drbd_header_size(connection); int expect = header_size; bool ping_timeout_active = false; - struct net_conf *nc; - int ping_timeo, tcp_cork, ping_int; struct sched_param param = { .sched_priority = 2 }; rv = sched_setscheduler(current, SCHED_RR, ¶m); if (rv < 0) - drbd_err(connection, "drbd_asender: ERROR set priority, ret=%d\n", rv); + drbd_err(connection, "drbd_ack_receiver: ERROR set priority, ret=%d\n", rv); while (get_t_state(thi) == RUNNING) { drbd_thread_current_set_cpu(thi); - rcu_read_lock(); - nc = rcu_dereference(connection->net_conf); - ping_timeo = nc->ping_timeo; - tcp_cork = nc->tcp_cork; - ping_int = nc->ping_int; - rcu_read_unlock(); + conn_reclaim_net_peer_reqs(connection); if (test_and_clear_bit(SEND_PING, &connection->flags)) { if (drbd_send_ping(connection)) { drbd_err(connection, "drbd_send_ping has failed\n"); goto reconnect; } - connection->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10; + set_ping_timeout(connection); ping_timeout_active = true; } - /* TODO: conditionally cork; it may hurt latency if we cork without - much to send */ - if (tcp_cork) - drbd_tcp_cork(connection->meta.socket); - if (connection_finish_peer_reqs(connection)) { - drbd_err(connection, "connection_finish_peer_reqs() failed\n"); - goto reconnect; - } - /* but unconditionally uncork unless disabled */ - if (tcp_cork) - drbd_tcp_uncork(connection->meta.socket); - - /* short circuit, recv_msg would return EINTR anyways. */ - if (signal_pending(current)) - continue; - + pre_recv_jif = jiffies; rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0); - clear_bit(SIGNAL_ASENDER, &connection->flags); - - flush_signals(current); /* Note: * -EINTR (on meta) we got a signal @@ -5562,7 +5558,6 @@ int drbd_asender(struct drbd_thread *thi) * rv < expected: "woken" by signal during receive * rv == 0 : "connection shut down by peer" */ -received_more: if (likely(rv > 0)) { received += rv; buf += rv; @@ -5584,8 +5579,7 @@ received_more: } else if (rv == -EAGAIN) { /* If the data socket received something meanwhile, * that is good enough: peer is still alive. */ - if (time_after(connection->last_received, - jiffies - connection->meta.socket->sk->sk_rcvtimeo)) + if (time_after(connection->last_received, pre_recv_jif)) continue; if (ping_timeout_active) { drbd_err(connection, "PingAck did not arrive in time.\n"); @@ -5594,6 +5588,10 @@ received_more: set_bit(SEND_PING, &connection->flags); continue; } else if (rv == -EINTR) { + /* maybe drbd_thread_stop(): the while condition will notice. + * maybe woken for send_ping: we'll send a ping above, + * and change the rcvtimeo */ + flush_signals(current); continue; } else { drbd_err(connection, "sock_recvmsg returned %d\n", rv); @@ -5603,8 +5601,8 @@ received_more: if (received == expect && cmd == NULL) { if (decode_header(connection, connection->meta.rbuf, &pi)) goto reconnect; - cmd = &asender_tbl[pi.cmd]; - if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) { + cmd = &ack_receiver_tbl[pi.cmd]; + if (pi.cmd >= ARRAY_SIZE(ack_receiver_tbl) || !cmd->fn) { drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n", cmdname(pi.cmd), pi.cmd); goto disconnect; @@ -5627,9 +5625,8 @@ received_more: connection->last_received = jiffies; - if (cmd == &asender_tbl[P_PING_ACK]) { - /* restore idle timeout */ - connection->meta.socket->sk->sk_rcvtimeo = ping_int * HZ; + if (cmd == &ack_receiver_tbl[P_PING_ACK]) { + set_idle_timeout(connection); ping_timeout_active = false; } @@ -5638,11 +5635,6 @@ received_more: expect = header_size; cmd = NULL; } - if (test_bit(SEND_PING, &connection->flags)) - continue; - rv = drbd_recv_short(connection->meta.socket, buf, expect-received, MSG_DONTWAIT); - if (rv > 0) - goto received_more; } if (0) { @@ -5654,9 +5646,41 @@ reconnect: disconnect: conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); } - clear_bit(SIGNAL_ASENDER, &connection->flags); - drbd_info(connection, "asender terminated\n"); + drbd_info(connection, "ack_receiver terminated\n"); return 0; } + +void drbd_send_acks_wf(struct work_struct *ws) +{ + struct drbd_peer_device *peer_device = + container_of(ws, struct drbd_peer_device, send_acks_work); + struct drbd_connection *connection = peer_device->connection; + struct drbd_device *device = peer_device->device; + struct net_conf *nc; + int tcp_cork, err; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + tcp_cork = nc->tcp_cork; + rcu_read_unlock(); + + if (tcp_cork) + drbd_tcp_cork(connection->meta.socket); + + err = drbd_finish_peer_reqs(device); + kref_put(&device->kref, drbd_destroy_device); + /* get is in drbd_endio_write_sec_final(). That is necessary to keep the + struct work_struct send_acks_work alive, which is in the peer_device object */ + + if (err) { + conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); + return; + } + + if (tcp_cork) + drbd_tcp_uncork(connection->meta.socket); + + return; +} diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3ae2c00..2255dcf 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -453,12 +453,12 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, kref_get(&req->kref); /* wait for the DONE */ if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) { - /* potentially already completed in the asender thread */ + /* potentially already completed in the ack_receiver thread */ if (!(s & RQ_NET_DONE)) { atomic_add(req->i.size >> 9, &device->ap_in_flight); set_if_null_req_not_net_done(peer_device, req); } - if (s & RQ_NET_PENDING) + if (req->rq_state & RQ_NET_PENDING) set_if_null_req_ack_pending(peer_device, req); } @@ -1095,6 +1095,24 @@ static bool do_remote_read(struct drbd_request *req) return false; } +bool drbd_should_do_remote(union drbd_dev_state s) +{ + return s.pdsk == D_UP_TO_DATE || + (s.pdsk >= D_INCONSISTENT && + s.conn >= C_WF_BITMAP_T && + s.conn < C_AHEAD); + /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T. + That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* + states. */ +} + +static bool drbd_should_send_out_of_sync(union drbd_dev_state s) +{ + return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S; + /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary + since we enter state C_AHEAD only if proto >= 96 */ +} + /* returns number of connections (== 1, for drbd 8.4) * expected to actually write this data, * which does NOT include those that we are L_AHEAD for. */ @@ -1149,7 +1167,6 @@ drbd_submit_req_private_bio(struct drbd_request *req) * stable storage, and this is a WRITE, we may not even submit * this bio. */ if (get_ldev(device)) { - req->pre_submit_jif = jiffies; if (drbd_insert_fault(device, rw == WRITE ? DRBD_FAULT_DT_WR : rw == READ ? DRBD_FAULT_DT_RD @@ -1293,6 +1310,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request &device->pending_master_completion[rw == WRITE]); if (req->private_bio) { /* needs to be marked within the same spinlock */ + req->pre_submit_jif = jiffies; list_add_tail(&req->req_pending_local, &device->pending_completion[rw == WRITE]); _req_mod(req, TO_BE_SUBMITTED); @@ -1513,6 +1531,78 @@ blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } +static bool net_timeout_reached(struct drbd_request *net_req, + struct drbd_connection *connection, + unsigned long now, unsigned long ent, + unsigned int ko_count, unsigned int timeout) +{ + struct drbd_device *device = net_req->device; + + if (!time_after(now, net_req->pre_send_jif + ent)) + return false; + + if (time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) + return false; + + if (net_req->rq_state & RQ_NET_PENDING) { + drbd_warn(device, "Remote failed to finish a request within %ums > ko-count (%u) * timeout (%u * 0.1s)\n", + jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout); + return true; + } + + /* We received an ACK already (or are using protocol A), + * but are waiting for the epoch closing barrier ack. + * Check if we sent the barrier already. We should not blame the peer + * for being unresponsive, if we did not even ask it yet. */ + if (net_req->epoch == connection->send.current_epoch_nr) { + drbd_warn(device, + "We did not send a P_BARRIER for %ums > ko-count (%u) * timeout (%u * 0.1s); drbd kernel thread blocked?\n", + jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout); + return false; + } + + /* Worst case: we may have been blocked for whatever reason, then + * suddenly are able to send a lot of requests (and epoch separating + * barriers) in quick succession. + * The timestamp of the net_req may be much too old and not correspond + * to the sending time of the relevant unack'ed barrier packet, so + * would trigger a spurious timeout. The latest barrier packet may + * have a too recent timestamp to trigger the timeout, potentially miss + * a timeout. Right now we don't have a place to conveniently store + * these timestamps. + * But in this particular situation, the application requests are still + * completed to upper layers, DRBD should still "feel" responsive. + * No need yet to kill this connection, it may still recover. + * If not, eventually we will have queued enough into the network for + * us to block. From that point of view, the timestamp of the last sent + * barrier packet is relevant enough. + */ + if (time_after(now, connection->send.last_sent_barrier_jif + ent)) { + drbd_warn(device, "Remote failed to answer a P_BARRIER (sent at %lu jif; now=%lu jif) within %ums > ko-count (%u) * timeout (%u * 0.1s)\n", + connection->send.last_sent_barrier_jif, now, + jiffies_to_msecs(now - connection->send.last_sent_barrier_jif), ko_count, timeout); + return true; + } + return false; +} + +/* A request is considered timed out, if + * - we have some effective timeout from the configuration, + * with some state restrictions applied, + * - the oldest request is waiting for a response from the network + * resp. the local disk, + * - the oldest request is in fact older than the effective timeout, + * - the connection was established (resp. disk was attached) + * for longer than the timeout already. + * Note that for 32bit jiffies and very stable connections/disks, + * we may have a wrap around, which is catched by + * !time_in_range(now, last_..._jif, last_..._jif + timeout). + * + * Side effect: once per 32bit wrap-around interval, which means every + * ~198 days with 250 HZ, we have a window where the timeout would need + * to expire twice (worst case) to become effective. Good enough. + */ + void request_timer_fn(unsigned long data) { struct drbd_device *device = (struct drbd_device *) data; @@ -1522,11 +1612,14 @@ void request_timer_fn(unsigned long data) unsigned long oldest_submit_jif; unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ unsigned long now; + unsigned int ko_count = 0, timeout = 0; rcu_read_lock(); nc = rcu_dereference(connection->net_conf); - if (nc && device->state.conn >= C_WF_REPORT_PARAMS) - ent = nc->timeout * HZ/10 * nc->ko_count; + if (nc && device->state.conn >= C_WF_REPORT_PARAMS) { + ko_count = nc->ko_count; + timeout = nc->timeout; + } if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */ dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10; @@ -1534,6 +1627,8 @@ void request_timer_fn(unsigned long data) } rcu_read_unlock(); + + ent = timeout * HZ/10 * ko_count; et = min_not_zero(dt, ent); if (!et) @@ -1545,11 +1640,22 @@ void request_timer_fn(unsigned long data) spin_lock_irq(&device->resource->req_lock); req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local); req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local); - req_peer = connection->req_not_net_done; + /* maybe the oldest request waiting for the peer is in fact still - * blocking in tcp sendmsg */ - if (!req_peer && connection->req_next && connection->req_next->pre_send_jif) - req_peer = connection->req_next; + * blocking in tcp sendmsg. That's ok, though, that's handled via the + * socket send timeout, requesting a ping, and bumping ko-count in + * we_should_drop_the_connection(). + */ + + /* check the oldest request we did successfully sent, + * but which is still waiting for an ACK. */ + req_peer = connection->req_ack_pending; + + /* if we don't have such request (e.g. protocoll A) + * check the oldest requests which is still waiting on its epoch + * closing barrier ack. */ + if (!req_peer) + req_peer = connection->req_not_net_done; /* evaluate the oldest peer request only in one timer! */ if (req_peer && req_peer->device != device) @@ -1566,28 +1672,9 @@ void request_timer_fn(unsigned long data) : req_write ? req_write->pre_submit_jif : req_read ? req_read->pre_submit_jif : now; - /* The request is considered timed out, if - * - we have some effective timeout from the configuration, - * with above state restrictions applied, - * - the oldest request is waiting for a response from the network - * resp. the local disk, - * - the oldest request is in fact older than the effective timeout, - * - the connection was established (resp. disk was attached) - * for longer than the timeout already. - * Note that for 32bit jiffies and very stable connections/disks, - * we may have a wrap around, which is catched by - * !time_in_range(now, last_..._jif, last_..._jif + timeout). - * - * Side effect: once per 32bit wrap-around interval, which means every - * ~198 days with 250 HZ, we have a window where the timeout would need - * to expire twice (worst case) to become effective. Good enough. - */ - if (ent && req_peer && - time_after(now, req_peer->pre_send_jif + ent) && - !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) { - drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n"); + if (ent && req_peer && net_timeout_reached(req_peer, connection, now, ent, ko_count, timeout)) _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD); - } + if (dt && oldest_submit_jif != now && time_after(now, oldest_submit_jif + dt) && !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) { diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 9f6a040..bb2ef78 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -331,21 +331,6 @@ static inline int req_mod(struct drbd_request *req, return rv; } -static inline bool drbd_should_do_remote(union drbd_dev_state s) -{ - return s.pdsk == D_UP_TO_DATE || - (s.pdsk >= D_INCONSISTENT && - s.conn >= C_WF_BITMAP_T && - s.conn < C_AHEAD); - /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T. - That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* - states. */ -} -static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s) -{ - return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S; - /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary - since we enter state C_AHEAD only if proto >= 96 */ -} +extern bool drbd_should_do_remote(union drbd_dev_state); #endif diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 2d7dd26..5a7ef78 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -29,6 +29,7 @@ #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" +#include "drbd_state_change.h" struct after_state_chg_work { struct drbd_work w; @@ -37,6 +38,7 @@ struct after_state_chg_work { union drbd_state ns; enum chg_state_flags flags; struct completion *done; + struct drbd_state_change *state_change; }; enum sanitize_state_warnings { @@ -48,9 +50,248 @@ enum sanitize_state_warnings { IMPLICITLY_UPGRADED_PDSK, }; +static void count_objects(struct drbd_resource *resource, + unsigned int *n_devices, + unsigned int *n_connections) +{ + struct drbd_device *device; + struct drbd_connection *connection; + int vnr; + + *n_devices = 0; + *n_connections = 0; + + idr_for_each_entry(&resource->devices, device, vnr) + (*n_devices)++; + for_each_connection(connection, resource) + (*n_connections)++; +} + +static struct drbd_state_change *alloc_state_change(unsigned int n_devices, unsigned int n_connections, gfp_t gfp) +{ + struct drbd_state_change *state_change; + unsigned int size, n; + + size = sizeof(struct drbd_state_change) + + n_devices * sizeof(struct drbd_device_state_change) + + n_connections * sizeof(struct drbd_connection_state_change) + + n_devices * n_connections * sizeof(struct drbd_peer_device_state_change); + state_change = kmalloc(size, gfp); + if (!state_change) + return NULL; + state_change->n_devices = n_devices; + state_change->n_connections = n_connections; + state_change->devices = (void *)(state_change + 1); + state_change->connections = (void *)&state_change->devices[n_devices]; + state_change->peer_devices = (void *)&state_change->connections[n_connections]; + state_change->resource->resource = NULL; + for (n = 0; n < n_devices; n++) + state_change->devices[n].device = NULL; + for (n = 0; n < n_connections; n++) + state_change->connections[n].connection = NULL; + return state_change; +} + +struct drbd_state_change *remember_old_state(struct drbd_resource *resource, gfp_t gfp) +{ + struct drbd_state_change *state_change; + struct drbd_device *device; + unsigned int n_devices; + struct drbd_connection *connection; + unsigned int n_connections; + int vnr; + + struct drbd_device_state_change *device_state_change; + struct drbd_peer_device_state_change *peer_device_state_change; + struct drbd_connection_state_change *connection_state_change; + + /* Caller holds req_lock spinlock. + * No state, no device IDR, no connections lists can change. */ + count_objects(resource, &n_devices, &n_connections); + state_change = alloc_state_change(n_devices, n_connections, gfp); + if (!state_change) + return NULL; + + kref_get(&resource->kref); + state_change->resource->resource = resource; + state_change->resource->role[OLD] = + conn_highest_role(first_connection(resource)); + state_change->resource->susp[OLD] = resource->susp; + state_change->resource->susp_nod[OLD] = resource->susp_nod; + state_change->resource->susp_fen[OLD] = resource->susp_fen; + + connection_state_change = state_change->connections; + for_each_connection(connection, resource) { + kref_get(&connection->kref); + connection_state_change->connection = connection; + connection_state_change->cstate[OLD] = + connection->cstate; + connection_state_change->peer_role[OLD] = + conn_highest_peer(connection); + connection_state_change++; + } + + device_state_change = state_change->devices; + peer_device_state_change = state_change->peer_devices; + idr_for_each_entry(&resource->devices, device, vnr) { + kref_get(&device->kref); + device_state_change->device = device; + device_state_change->disk_state[OLD] = device->state.disk; + + /* The peer_devices for each device have to be enumerated in + the order of the connections. We may not use for_each_peer_device() here. */ + for_each_connection(connection, resource) { + struct drbd_peer_device *peer_device; + + peer_device = conn_peer_device(connection, device->vnr); + peer_device_state_change->peer_device = peer_device; + peer_device_state_change->disk_state[OLD] = + device->state.pdsk; + peer_device_state_change->repl_state[OLD] = + max_t(enum drbd_conns, + C_WF_REPORT_PARAMS, device->state.conn); + peer_device_state_change->resync_susp_user[OLD] = + device->state.user_isp; + peer_device_state_change->resync_susp_peer[OLD] = + device->state.peer_isp; + peer_device_state_change->resync_susp_dependency[OLD] = + device->state.aftr_isp; + peer_device_state_change++; + } + device_state_change++; + } + + return state_change; +} + +static void remember_new_state(struct drbd_state_change *state_change) +{ + struct drbd_resource_state_change *resource_state_change; + struct drbd_resource *resource; + unsigned int n; + + if (!state_change) + return; + + resource_state_change = &state_change->resource[0]; + resource = resource_state_change->resource; + + resource_state_change->role[NEW] = + conn_highest_role(first_connection(resource)); + resource_state_change->susp[NEW] = resource->susp; + resource_state_change->susp_nod[NEW] = resource->susp_nod; + resource_state_change->susp_fen[NEW] = resource->susp_fen; + + for (n = 0; n < state_change->n_devices; n++) { + struct drbd_device_state_change *device_state_change = + &state_change->devices[n]; + struct drbd_device *device = device_state_change->device; + + device_state_change->disk_state[NEW] = device->state.disk; + } + + for (n = 0; n < state_change->n_connections; n++) { + struct drbd_connection_state_change *connection_state_change = + &state_change->connections[n]; + struct drbd_connection *connection = + connection_state_change->connection; + + connection_state_change->cstate[NEW] = connection->cstate; + connection_state_change->peer_role[NEW] = + conn_highest_peer(connection); + } + + for (n = 0; n < state_change->n_devices * state_change->n_connections; n++) { + struct drbd_peer_device_state_change *peer_device_state_change = + &state_change->peer_devices[n]; + struct drbd_device *device = + peer_device_state_change->peer_device->device; + union drbd_dev_state state = device->state; + + peer_device_state_change->disk_state[NEW] = state.pdsk; + peer_device_state_change->repl_state[NEW] = + max_t(enum drbd_conns, C_WF_REPORT_PARAMS, state.conn); + peer_device_state_change->resync_susp_user[NEW] = + state.user_isp; + peer_device_state_change->resync_susp_peer[NEW] = + state.peer_isp; + peer_device_state_change->resync_susp_dependency[NEW] = + state.aftr_isp; + } +} + +void copy_old_to_new_state_change(struct drbd_state_change *state_change) +{ + struct drbd_resource_state_change *resource_state_change = &state_change->resource[0]; + unsigned int n_device, n_connection, n_peer_device, n_peer_devices; + +#define OLD_TO_NEW(x) \ + (x[NEW] = x[OLD]) + + OLD_TO_NEW(resource_state_change->role); + OLD_TO_NEW(resource_state_change->susp); + OLD_TO_NEW(resource_state_change->susp_nod); + OLD_TO_NEW(resource_state_change->susp_fen); + + for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) { + struct drbd_connection_state_change *connection_state_change = + &state_change->connections[n_connection]; + + OLD_TO_NEW(connection_state_change->peer_role); + OLD_TO_NEW(connection_state_change->cstate); + } + + for (n_device = 0; n_device < state_change->n_devices; n_device++) { + struct drbd_device_state_change *device_state_change = + &state_change->devices[n_device]; + + OLD_TO_NEW(device_state_change->disk_state); + } + + n_peer_devices = state_change->n_devices * state_change->n_connections; + for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) { + struct drbd_peer_device_state_change *p = + &state_change->peer_devices[n_peer_device]; + + OLD_TO_NEW(p->disk_state); + OLD_TO_NEW(p->repl_state); + OLD_TO_NEW(p->resync_susp_user); + OLD_TO_NEW(p->resync_susp_peer); + OLD_TO_NEW(p->resync_susp_dependency); + } + +#undef OLD_TO_NEW +} + +void forget_state_change(struct drbd_state_change *state_change) +{ + unsigned int n; + + if (!state_change) + return; + + if (state_change->resource->resource) + kref_put(&state_change->resource->resource->kref, drbd_destroy_resource); + for (n = 0; n < state_change->n_devices; n++) { + struct drbd_device *device = state_change->devices[n].device; + + if (device) + kref_put(&device->kref, drbd_destroy_device); + } + for (n = 0; n < state_change->n_connections; n++) { + struct drbd_connection *connection = + state_change->connections[n].connection; + + if (connection) + kref_put(&connection->kref, drbd_destroy_connection); + } + kfree(state_change); +} + static int w_after_state_ch(struct drbd_work *w, int unused); static void after_state_ch(struct drbd_device *device, union drbd_state os, - union drbd_state ns, enum chg_state_flags flags); + union drbd_state ns, enum chg_state_flags flags, + struct drbd_state_change *); static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state); static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *); static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); @@ -93,6 +334,7 @@ static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2) return R_SECONDARY; return R_UNKNOWN; } + static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2) { if (role1 == R_UNKNOWN || role2 == R_UNKNOWN) @@ -937,7 +1179,7 @@ void drbd_resume_al(struct drbd_device *device) drbd_info(device, "Resumed AL updates\n"); } -/* helper for __drbd_set_state */ +/* helper for _drbd_set_state */ static void set_ov_position(struct drbd_device *device, enum drbd_conns cs) { if (first_peer_device(device)->connection->agreed_pro_version < 90) @@ -965,17 +1207,17 @@ static void set_ov_position(struct drbd_device *device, enum drbd_conns cs) } /** - * __drbd_set_state() - Set a new DRBD state + * _drbd_set_state() - Set a new DRBD state * @device: DRBD device. * @ns: new state. * @flags: Flags * @done: Optional completion, that will get completed after the after_state_ch() finished * - * Caller needs to hold req_lock, and global_state_lock. Do not call directly. + * Caller needs to hold req_lock. Do not call directly. */ enum drbd_state_rv -__drbd_set_state(struct drbd_device *device, union drbd_state ns, - enum chg_state_flags flags, struct completion *done) +_drbd_set_state(struct drbd_device *device, union drbd_state ns, + enum chg_state_flags flags, struct completion *done) { struct drbd_peer_device *peer_device = first_peer_device(device); struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; @@ -983,6 +1225,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, enum drbd_state_rv rv = SS_SUCCESS; enum sanitize_state_warnings ssw; struct after_state_chg_work *ascw; + struct drbd_state_change *state_change; os = drbd_read_state(device); @@ -1037,6 +1280,9 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, if (!is_sync_state(os.conn) && is_sync_state(ns.conn)) clear_bit(RS_DONE, &device->flags); + /* FIXME: Have any flags been set earlier in this function already? */ + state_change = remember_old_state(device->resource, GFP_ATOMIC); + /* changes to local_cnt and device flags should be visible before * changes to state, which again should be visible before anything else * depending on that change happens. */ @@ -1047,6 +1293,8 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, device->resource->susp_fen = ns.susp_fen; smp_wmb(); + remember_new_state(state_change); + /* put replicated vs not-replicated requests in seperate epochs */ if (drbd_should_do_remote((union drbd_dev_state)os.i) != drbd_should_do_remote((union drbd_dev_state)ns.i)) @@ -1184,6 +1432,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, ascw->w.cb = w_after_state_ch; ascw->device = device; ascw->done = done; + ascw->state_change = state_change; drbd_queue_work(&connection->sender_work, &ascw->w); } else { @@ -1199,7 +1448,8 @@ static int w_after_state_ch(struct drbd_work *w, int unused) container_of(w, struct after_state_chg_work, w); struct drbd_device *device = ascw->device; - after_state_ch(device, ascw->os, ascw->ns, ascw->flags); + after_state_ch(device, ascw->os, ascw->ns, ascw->flags, ascw->state_change); + forget_state_change(ascw->state_change); if (ascw->flags & CS_WAIT_COMPLETE) complete(ascw->done); kfree(ascw); @@ -1234,7 +1484,7 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, D_ASSERT(device, current == first_peer_device(device)->connection->worker.task); /* open coded non-blocking drbd_suspend_io(device); */ - set_bit(SUSPEND_IO, &device->flags); + atomic_inc(&device->suspend_cnt); drbd_bm_lock(device, why, flags); rv = io_fn(device); @@ -1245,6 +1495,139 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, return rv; } +void notify_resource_state_change(struct sk_buff *skb, + unsigned int seq, + struct drbd_resource_state_change *resource_state_change, + enum drbd_notification_type type) +{ + struct drbd_resource *resource = resource_state_change->resource; + struct resource_info resource_info = { + .res_role = resource_state_change->role[NEW], + .res_susp = resource_state_change->susp[NEW], + .res_susp_nod = resource_state_change->susp_nod[NEW], + .res_susp_fen = resource_state_change->susp_fen[NEW], + }; + + notify_resource_state(skb, seq, resource, &resource_info, type); +} + +void notify_connection_state_change(struct sk_buff *skb, + unsigned int seq, + struct drbd_connection_state_change *connection_state_change, + enum drbd_notification_type type) +{ + struct drbd_connection *connection = connection_state_change->connection; + struct connection_info connection_info = { + .conn_connection_state = connection_state_change->cstate[NEW], + .conn_role = connection_state_change->peer_role[NEW], + }; + + notify_connection_state(skb, seq, connection, &connection_info, type); +} + +void notify_device_state_change(struct sk_buff *skb, + unsigned int seq, + struct drbd_device_state_change *device_state_change, + enum drbd_notification_type type) +{ + struct drbd_device *device = device_state_change->device; + struct device_info device_info = { + .dev_disk_state = device_state_change->disk_state[NEW], + }; + + notify_device_state(skb, seq, device, &device_info, type); +} + +void notify_peer_device_state_change(struct sk_buff *skb, + unsigned int seq, + struct drbd_peer_device_state_change *p, + enum drbd_notification_type type) +{ + struct drbd_peer_device *peer_device = p->peer_device; + struct peer_device_info peer_device_info = { + .peer_repl_state = p->repl_state[NEW], + .peer_disk_state = p->disk_state[NEW], + .peer_resync_susp_user = p->resync_susp_user[NEW], + .peer_resync_susp_peer = p->resync_susp_peer[NEW], + .peer_resync_susp_dependency = p->resync_susp_dependency[NEW], + }; + + notify_peer_device_state(skb, seq, peer_device, &peer_device_info, type); +} + +static void broadcast_state_change(struct drbd_state_change *state_change) +{ + struct drbd_resource_state_change *resource_state_change = &state_change->resource[0]; + bool resource_state_has_changed; + unsigned int n_device, n_connection, n_peer_device, n_peer_devices; + void (*last_func)(struct sk_buff *, unsigned int, void *, + enum drbd_notification_type) = NULL; + void *uninitialized_var(last_arg); + +#define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW]) +#define FINAL_STATE_CHANGE(type) \ + ({ if (last_func) \ + last_func(NULL, 0, last_arg, type); \ + }) +#define REMEMBER_STATE_CHANGE(func, arg, type) \ + ({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \ + last_func = (typeof(last_func))func; \ + last_arg = arg; \ + }) + + mutex_lock(¬ification_mutex); + + resource_state_has_changed = + HAS_CHANGED(resource_state_change->role) || + HAS_CHANGED(resource_state_change->susp) || + HAS_CHANGED(resource_state_change->susp_nod) || + HAS_CHANGED(resource_state_change->susp_fen); + + if (resource_state_has_changed) + REMEMBER_STATE_CHANGE(notify_resource_state_change, + resource_state_change, NOTIFY_CHANGE); + + for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) { + struct drbd_connection_state_change *connection_state_change = + &state_change->connections[n_connection]; + + if (HAS_CHANGED(connection_state_change->peer_role) || + HAS_CHANGED(connection_state_change->cstate)) + REMEMBER_STATE_CHANGE(notify_connection_state_change, + connection_state_change, NOTIFY_CHANGE); + } + + for (n_device = 0; n_device < state_change->n_devices; n_device++) { + struct drbd_device_state_change *device_state_change = + &state_change->devices[n_device]; + + if (HAS_CHANGED(device_state_change->disk_state)) + REMEMBER_STATE_CHANGE(notify_device_state_change, + device_state_change, NOTIFY_CHANGE); + } + + n_peer_devices = state_change->n_devices * state_change->n_connections; + for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) { + struct drbd_peer_device_state_change *p = + &state_change->peer_devices[n_peer_device]; + + if (HAS_CHANGED(p->disk_state) || + HAS_CHANGED(p->repl_state) || + HAS_CHANGED(p->resync_susp_user) || + HAS_CHANGED(p->resync_susp_peer) || + HAS_CHANGED(p->resync_susp_dependency)) + REMEMBER_STATE_CHANGE(notify_peer_device_state_change, + p, NOTIFY_CHANGE); + } + + FINAL_STATE_CHANGE(NOTIFY_CHANGE); + mutex_unlock(¬ification_mutex); + +#undef HAS_CHANGED +#undef FINAL_STATE_CHANGE +#undef REMEMBER_STATE_CHANGE +} + /** * after_state_ch() - Perform after state change actions that may sleep * @device: DRBD device. @@ -1253,13 +1636,16 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, * @flags: Flags */ static void after_state_ch(struct drbd_device *device, union drbd_state os, - union drbd_state ns, enum chg_state_flags flags) + union drbd_state ns, enum chg_state_flags flags, + struct drbd_state_change *state_change) { struct drbd_resource *resource = device->resource; struct drbd_peer_device *peer_device = first_peer_device(device); struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; struct sib_info sib; + broadcast_state_change(state_change); + sib.sib_reason = SIB_STATE_CHANGE; sib.os = os; sib.ns = ns; @@ -1377,7 +1763,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, } if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) { - if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && + if (os.peer != R_PRIMARY && ns.peer == R_PRIMARY && device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { drbd_uuid_new_current(device); drbd_send_uuids(peer_device); @@ -1444,7 +1830,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, if (os.disk != D_FAILED && ns.disk == D_FAILED) { enum drbd_io_error_p eh = EP_PASS_ON; int was_io_error = 0; - /* corresponding get_ldev was in __drbd_set_state, to serialize + /* corresponding get_ldev was in _drbd_set_state, to serialize * our cleanup here with the transition to D_DISKLESS. * But is is still not save to dreference ldev here, since * we might come from an failed Attach before ldev was set. */ @@ -1455,6 +1841,10 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags); + /* Intentionally call this handler first, before drbd_send_state(). + * See: 2932204 drbd: call local-io-error handler early + * People may chose to hard-reset the box from this handler. + * It is useful if this looks like a "regular node crash". */ if (was_io_error && eh == EP_CALL_HELPER) drbd_khelper(device, "local-io-error"); @@ -1572,6 +1962,7 @@ struct after_conn_state_chg_work { union drbd_state ns_max; /* new, max state, over all devices */ enum chg_state_flags flags; struct drbd_connection *connection; + struct drbd_state_change *state_change; }; static int w_after_conn_state_ch(struct drbd_work *w, int unused) @@ -1584,6 +1975,8 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused) struct drbd_peer_device *peer_device; int vnr; + broadcast_state_change(acscw->state_change); + forget_state_change(acscw->state_change); kfree(acscw); /* Upon network configuration, we need to start the receiver */ @@ -1593,6 +1986,13 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused) if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) { struct net_conf *old_conf; + mutex_lock(¬ification_mutex); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + notify_peer_device_state(NULL, 0, peer_device, NULL, + NOTIFY_DESTROY | NOTIFY_CONTINUES); + notify_connection_state(NULL, 0, connection, NULL, NOTIFY_DESTROY); + mutex_unlock(¬ification_mutex); + mutex_lock(&connection->resource->conf_update); old_conf = connection->net_conf; connection->my_addr_len = 0; @@ -1759,7 +2159,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) ns.disk = os.disk; - rv = __drbd_set_state(device, ns, flags, NULL); + rv = _drbd_set_state(device, ns, flags, NULL); if (rv < SS_SUCCESS) BUG(); @@ -1823,6 +2223,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u enum drbd_conns oc = connection->cstate; union drbd_state ns_max, ns_min, os; bool have_mutex = false; + struct drbd_state_change *state_change; if (mask.conn) { rv = is_valid_conn_transition(oc, val.conn); @@ -1868,10 +2269,12 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u goto abort; } + state_change = remember_old_state(connection->resource, GFP_ATOMIC); conn_old_common_state(connection, &os, &flags); flags |= CS_DC_SUSP; conn_set_state(connection, mask, val, &ns_min, &ns_max, flags); conn_pr_state_change(connection, os, ns_max, flags); + remember_new_state(state_change); acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC); if (acscw) { @@ -1882,6 +2285,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u acscw->w.cb = w_after_conn_state_ch; kref_get(&connection->kref); acscw->connection = connection; + acscw->state_change = state_change; drbd_queue_work(&connection->sender_work, &acscw->w); } else { drbd_err(connection, "Could not kmalloc an acscw\n"); diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h index 7f53c40..bd98953 100644 --- a/drivers/block/drbd/drbd_state.h +++ b/drivers/block/drbd/drbd_state.h @@ -122,9 +122,9 @@ extern enum drbd_state_rv _drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state, union drbd_state, enum chg_state_flags); -extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state, - enum chg_state_flags, - struct completion *done); +extern enum drbd_state_rv _drbd_set_state(struct drbd_device *, union drbd_state, + enum chg_state_flags, + struct completion *done); extern void print_st_err(struct drbd_device *, union drbd_state, union drbd_state, int); diff --git a/drivers/block/drbd/drbd_state_change.h b/drivers/block/drbd/drbd_state_change.h new file mode 100644 index 0000000..9e503a1 --- /dev/null +++ b/drivers/block/drbd/drbd_state_change.h @@ -0,0 +1,63 @@ +#ifndef DRBD_STATE_CHANGE_H +#define DRBD_STATE_CHANGE_H + +struct drbd_resource_state_change { + struct drbd_resource *resource; + enum drbd_role role[2]; + bool susp[2]; + bool susp_nod[2]; + bool susp_fen[2]; +}; + +struct drbd_device_state_change { + struct drbd_device *device; + enum drbd_disk_state disk_state[2]; +}; + +struct drbd_connection_state_change { + struct drbd_connection *connection; + enum drbd_conns cstate[2]; /* drbd9: enum drbd_conn_state */ + enum drbd_role peer_role[2]; +}; + +struct drbd_peer_device_state_change { + struct drbd_peer_device *peer_device; + enum drbd_disk_state disk_state[2]; + enum drbd_conns repl_state[2]; /* drbd9: enum drbd_repl_state */ + bool resync_susp_user[2]; + bool resync_susp_peer[2]; + bool resync_susp_dependency[2]; +}; + +struct drbd_state_change { + struct list_head list; + unsigned int n_devices; + unsigned int n_connections; + struct drbd_resource_state_change resource[1]; + struct drbd_device_state_change *devices; + struct drbd_connection_state_change *connections; + struct drbd_peer_device_state_change *peer_devices; +}; + +extern struct drbd_state_change *remember_old_state(struct drbd_resource *, gfp_t); +extern void copy_old_to_new_state_change(struct drbd_state_change *); +extern void forget_state_change(struct drbd_state_change *); + +extern void notify_resource_state_change(struct sk_buff *, + unsigned int, + struct drbd_resource_state_change *, + enum drbd_notification_type type); +extern void notify_connection_state_change(struct sk_buff *, + unsigned int, + struct drbd_connection_state_change *, + enum drbd_notification_type type); +extern void notify_device_state_change(struct sk_buff *, + unsigned int, + struct drbd_device_state_change *, + enum drbd_notification_type type); +extern void notify_peer_device_state_change(struct sk_buff *, + unsigned int, + struct drbd_peer_device_state_change *, + enum drbd_notification_type type); + +#endif /* DRBD_STATE_CHANGE_H */ diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 5578c14..eff716c 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -55,13 +55,6 @@ static int make_resync_request(struct drbd_device *, int); * */ - -/* About the global_state_lock - Each state transition on an device holds a read lock. In case we have - to evaluate the resync after dependencies, we grab a write lock, because - we need stable states on all devices for that. */ -rwlock_t global_state_lock; - /* used for synchronous meta data and bitmap IO * submitted by drbd_md_sync_page_io() */ @@ -120,6 +113,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l unsigned long flags = 0; struct drbd_peer_device *peer_device = peer_req->peer_device; struct drbd_device *device = peer_device->device; + struct drbd_connection *connection = peer_device->connection; struct drbd_interval i; int do_wake; u64 block_id; @@ -152,6 +146,12 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */ if (peer_req->flags & EE_WAS_ERROR) __drbd_chk_io_error(device, DRBD_WRITE_ERROR); + + if (connection->cstate >= C_WF_REPORT_PARAMS) { + kref_get(&device->kref); /* put is in drbd_send_acks_wf() */ + if (!queue_work(connection->ack_sender, &peer_device->send_acks_work)) + kref_put(&device->kref, drbd_destroy_device); + } spin_unlock_irqrestore(&device->resource->req_lock, flags); if (block_id == ID_SYNCER) @@ -163,7 +163,6 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l if (do_al_complete_io) drbd_al_complete_io(device, &i); - wake_asender(peer_device->connection); put_ldev(device); } @@ -195,6 +194,12 @@ void drbd_peer_request_endio(struct bio *bio) } } +void drbd_panic_after_delayed_completion_of_aborted_request(struct drbd_device *device) +{ + panic("drbd%u %s/%u potential random memory corruption caused by delayed completion of aborted local request\n", + device->minor, device->resource->name, device->vnr); +} + /* read, readA or write requests on R_PRIMARY coming from drbd_make_request */ void drbd_request_endio(struct bio *bio) @@ -238,7 +243,7 @@ void drbd_request_endio(struct bio *bio) drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n"); if (!bio->bi_error) - panic("possible random memory corruption caused by delayed completion of aborted local request\n"); + drbd_panic_after_delayed_completion_of_aborted_request(device); } /* to avoid recursion in __req_mod */ @@ -1291,6 +1296,7 @@ static int drbd_send_barrier(struct drbd_connection *connection) p->barrier = connection->send.current_epoch_nr; p->pad = 0; connection->send.current_epoch_writes = 0; + connection->send.last_sent_barrier_jif = jiffies; return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0); } @@ -1315,6 +1321,7 @@ static void re_init_if_first_write(struct drbd_connection *connection, unsigned connection->send.seen_any_write_yet = true; connection->send.current_epoch_nr = epoch; connection->send.current_epoch_writes = 0; + connection->send.last_sent_barrier_jif = jiffies; } } @@ -1456,70 +1463,73 @@ static int _drbd_may_sync_now(struct drbd_device *device) } /** - * _drbd_pause_after() - Pause resync on all devices that may not resync now + * drbd_pause_after() - Pause resync on all devices that may not resync now * @device: DRBD device. * * Called from process context only (admin command and after_state_ch). */ -static int _drbd_pause_after(struct drbd_device *device) +static bool drbd_pause_after(struct drbd_device *device) { + bool changed = false; struct drbd_device *odev; - int i, rv = 0; + int i; rcu_read_lock(); idr_for_each_entry(&drbd_devices, odev, i) { if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) continue; - if (!_drbd_may_sync_now(odev)) - rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) - != SS_NOTHING_TO_DO); + if (!_drbd_may_sync_now(odev) && + _drbd_set_state(_NS(odev, aftr_isp, 1), + CS_HARD, NULL) != SS_NOTHING_TO_DO) + changed = true; } rcu_read_unlock(); - return rv; + return changed; } /** - * _drbd_resume_next() - Resume resync on all devices that may resync now + * drbd_resume_next() - Resume resync on all devices that may resync now * @device: DRBD device. * * Called from process context only (admin command and worker). */ -static int _drbd_resume_next(struct drbd_device *device) +static bool drbd_resume_next(struct drbd_device *device) { + bool changed = false; struct drbd_device *odev; - int i, rv = 0; + int i; rcu_read_lock(); idr_for_each_entry(&drbd_devices, odev, i) { if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) continue; if (odev->state.aftr_isp) { - if (_drbd_may_sync_now(odev)) - rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0), - CS_HARD, NULL) - != SS_NOTHING_TO_DO) ; + if (_drbd_may_sync_now(odev) && + _drbd_set_state(_NS(odev, aftr_isp, 0), + CS_HARD, NULL) != SS_NOTHING_TO_DO) + changed = true; } } rcu_read_unlock(); - return rv; + return changed; } void resume_next_sg(struct drbd_device *device) { - write_lock_irq(&global_state_lock); - _drbd_resume_next(device); - write_unlock_irq(&global_state_lock); + lock_all_resources(); + drbd_resume_next(device); + unlock_all_resources(); } void suspend_other_sg(struct drbd_device *device) { - write_lock_irq(&global_state_lock); - _drbd_pause_after(device); - write_unlock_irq(&global_state_lock); + lock_all_resources(); + drbd_pause_after(device); + unlock_all_resources(); } -/* caller must hold global_state_lock */ +/* caller must lock_all_resources() */ enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor) { struct drbd_device *odev; @@ -1557,15 +1567,15 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_min } } -/* caller must hold global_state_lock */ +/* caller must lock_all_resources() */ void drbd_resync_after_changed(struct drbd_device *device) { - int changes; + int changed; do { - changes = _drbd_pause_after(device); - changes |= _drbd_resume_next(device); - } while (changes); + changed = drbd_pause_after(device); + changed |= drbd_resume_next(device); + } while (changed); } void drbd_rs_controller_reset(struct drbd_device *device) @@ -1685,19 +1695,14 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) } else { mutex_lock(device->state_mutex); } - clear_bit(B_RS_H_DONE, &device->flags); - /* req_lock: serialize with drbd_send_and_submit() and others - * global_state_lock: for stable sync-after dependencies */ - spin_lock_irq(&device->resource->req_lock); - write_lock(&global_state_lock); + lock_all_resources(); + clear_bit(B_RS_H_DONE, &device->flags); /* Did some connection breakage or IO error race with us? */ if (device->state.conn < C_CONNECTED || !get_ldev_if_state(device, D_NEGOTIATING)) { - write_unlock(&global_state_lock); - spin_unlock_irq(&device->resource->req_lock); - mutex_unlock(device->state_mutex); - return; + unlock_all_resources(); + goto out; } ns = drbd_read_state(device); @@ -1711,7 +1716,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) else /* side == C_SYNC_SOURCE */ ns.pdsk = D_INCONSISTENT; - r = __drbd_set_state(device, ns, CS_VERBOSE, NULL); + r = _drbd_set_state(device, ns, CS_VERBOSE, NULL); ns = drbd_read_state(device); if (ns.conn < C_CONNECTED) @@ -1732,7 +1737,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) device->rs_mark_left[i] = tw; device->rs_mark_time[i] = now; } - _drbd_pause_after(device); + drbd_pause_after(device); /* Forget potentially stale cached per resync extent bit-counts. * Open coded drbd_rs_cancel_all(device), we already have IRQs * disabled, and know the disk state is ok. */ @@ -1742,8 +1747,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) device->resync_wenr = LC_FREE; spin_unlock(&device->al_lock); } - write_unlock(&global_state_lock); - spin_unlock_irq(&device->resource->req_lock); + unlock_all_resources(); if (r == SS_SUCCESS) { wake_up(&device->al_wait); /* for lc_reset() above */ @@ -1807,6 +1811,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) drbd_md_sync(device); } put_ldev(device); +out: mutex_unlock(device->state_mutex); } @@ -1836,7 +1841,7 @@ static void drbd_ldev_destroy(struct drbd_device *device) device->act_log = NULL; __acquire(local); - drbd_free_ldev(device->ldev); + drbd_backing_dev_free(device, device->ldev); device->ldev = NULL; __release(local); diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 15bec40..9b180db 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -104,9 +104,9 @@ /* Device instance number, incremented each time a device is probed. */ static int instance; -struct list_head online_list; -struct list_head removing_list; -spinlock_t dev_lock; +static struct list_head online_list; +static struct list_head removing_list; +static spinlock_t dev_lock; /* * Global variable used to hold the major block device number diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 95dff91..6f95871 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -495,17 +495,17 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) id->ppaf.ch_offset = 56; id->ppaf.ch_len = 8; - do_div(size, bs); /* convert size to pages */ - do_div(size, 256); /* concert size to pgs pr blk */ + sector_div(size, bs); /* convert size to pages */ + size >>= 8; /* concert size to pgs pr blk */ grp = &id->groups[0]; grp->mtype = 0; grp->fmtype = 0; grp->num_ch = 1; grp->num_pg = 256; blksize = size; - do_div(size, (1 << 16)); + size >>= 16; grp->num_lun = size + 1; - do_div(blksize, grp->num_lun); + sector_div(blksize, grp->num_lun); grp->num_blk = blksize; grp->num_pln = 1; diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 59c91d4..ba4bfe9 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -23,7 +23,7 @@ #include <linux/workqueue.h> #include <linux/bitops.h> #include <linux/delay.h> -#include <linux/time.h> +#include <linux/ktime.h> #include <linux/hdreg.h> #include <linux/dma-mapping.h> #include <linux/completion.h> @@ -671,16 +671,15 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func) static unsigned int carm_fill_sync_time(struct carm_host *host, unsigned int idx, void *mem) { - struct timeval tv; struct carm_msg_sync_time *st = mem; - do_gettimeofday(&tv); + time64_t tv = ktime_get_real_seconds(); memset(st, 0, sizeof(*st)); st->type = CARM_MSG_MISC; st->subtype = MISC_SET_TIME; st->handle = cpu_to_le32(TAG_ENCODE(idx)); - st->timestamp = cpu_to_le32(tv.tv_sec); + st->timestamp = cpu_to_le32(tv); return sizeof(struct carm_msg_sync_time); } diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 41fb1a9..4809c15 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -84,6 +84,16 @@ MODULE_PARM_DESC(max_persistent_grants, "Maximum number of grants to map persistently"); /* + * Maximum number of rings/queues blkback supports, allow as many queues as there + * are CPUs if user has not specified a value. + */ +unsigned int xenblk_max_queues; +module_param_named(max_queues, xenblk_max_queues, uint, 0644); +MODULE_PARM_DESC(max_queues, + "Maximum number of hardware queues per virtual disk." \ + "By default it is the number of online CPUs."); + +/* * Maximum order of pages to be used for the shared ring between front and * backend, 4KB page granularity is used. */ @@ -113,71 +123,71 @@ module_param(log_stats, int, 0644); /* Number of free pages to remove on each call to gnttab_free_pages */ #define NUM_BATCH_FREE_PAGES 10 -static inline int get_free_page(struct xen_blkif *blkif, struct page **page) +static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page) { unsigned long flags; - spin_lock_irqsave(&blkif->free_pages_lock, flags); - if (list_empty(&blkif->free_pages)) { - BUG_ON(blkif->free_pages_num != 0); - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + spin_lock_irqsave(&ring->free_pages_lock, flags); + if (list_empty(&ring->free_pages)) { + BUG_ON(ring->free_pages_num != 0); + spin_unlock_irqrestore(&ring->free_pages_lock, flags); return gnttab_alloc_pages(1, page); } - BUG_ON(blkif->free_pages_num == 0); - page[0] = list_first_entry(&blkif->free_pages, struct page, lru); + BUG_ON(ring->free_pages_num == 0); + page[0] = list_first_entry(&ring->free_pages, struct page, lru); list_del(&page[0]->lru); - blkif->free_pages_num--; - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + ring->free_pages_num--; + spin_unlock_irqrestore(&ring->free_pages_lock, flags); return 0; } -static inline void put_free_pages(struct xen_blkif *blkif, struct page **page, +static inline void put_free_pages(struct xen_blkif_ring *ring, struct page **page, int num) { unsigned long flags; int i; - spin_lock_irqsave(&blkif->free_pages_lock, flags); + spin_lock_irqsave(&ring->free_pages_lock, flags); for (i = 0; i < num; i++) - list_add(&page[i]->lru, &blkif->free_pages); - blkif->free_pages_num += num; - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + list_add(&page[i]->lru, &ring->free_pages); + ring->free_pages_num += num; + spin_unlock_irqrestore(&ring->free_pages_lock, flags); } -static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num) +static inline void shrink_free_pagepool(struct xen_blkif_ring *ring, int num) { /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */ struct page *page[NUM_BATCH_FREE_PAGES]; unsigned int num_pages = 0; unsigned long flags; - spin_lock_irqsave(&blkif->free_pages_lock, flags); - while (blkif->free_pages_num > num) { - BUG_ON(list_empty(&blkif->free_pages)); - page[num_pages] = list_first_entry(&blkif->free_pages, + spin_lock_irqsave(&ring->free_pages_lock, flags); + while (ring->free_pages_num > num) { + BUG_ON(list_empty(&ring->free_pages)); + page[num_pages] = list_first_entry(&ring->free_pages, struct page, lru); list_del(&page[num_pages]->lru); - blkif->free_pages_num--; + ring->free_pages_num--; if (++num_pages == NUM_BATCH_FREE_PAGES) { - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + spin_unlock_irqrestore(&ring->free_pages_lock, flags); gnttab_free_pages(num_pages, page); - spin_lock_irqsave(&blkif->free_pages_lock, flags); + spin_lock_irqsave(&ring->free_pages_lock, flags); num_pages = 0; } } - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + spin_unlock_irqrestore(&ring->free_pages_lock, flags); if (num_pages != 0) gnttab_free_pages(num_pages, page); } #define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page))) -static int do_block_io_op(struct xen_blkif *blkif); -static int dispatch_rw_block_io(struct xen_blkif *blkif, +static int do_block_io_op(struct xen_blkif_ring *ring); +static int dispatch_rw_block_io(struct xen_blkif_ring *ring, struct blkif_request *req, struct pending_req *pending_req); -static void make_response(struct xen_blkif *blkif, u64 id, +static void make_response(struct xen_blkif_ring *ring, u64 id, unsigned short op, int st); #define foreach_grant_safe(pos, n, rbtree, node) \ @@ -190,7 +200,7 @@ static void make_response(struct xen_blkif *blkif, u64 id, /* * We don't need locking around the persistent grant helpers - * because blkback uses a single-thread for each backed, so we + * because blkback uses a single-thread for each backend, so we * can be sure that this functions will never be called recursively. * * The only exception to that is put_persistent_grant, that can be called @@ -198,19 +208,20 @@ static void make_response(struct xen_blkif *blkif, u64 id, * bit operations to modify the flags of a persistent grant and to count * the number of used grants. */ -static int add_persistent_gnt(struct xen_blkif *blkif, +static int add_persistent_gnt(struct xen_blkif_ring *ring, struct persistent_gnt *persistent_gnt) { struct rb_node **new = NULL, *parent = NULL; struct persistent_gnt *this; + struct xen_blkif *blkif = ring->blkif; - if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) { + if (ring->persistent_gnt_c >= xen_blkif_max_pgrants) { if (!blkif->vbd.overflow_max_grants) blkif->vbd.overflow_max_grants = 1; return -EBUSY; } /* Figure out where to put new node */ - new = &blkif->persistent_gnts.rb_node; + new = &ring->persistent_gnts.rb_node; while (*new) { this = container_of(*new, struct persistent_gnt, node); @@ -229,19 +240,19 @@ static int add_persistent_gnt(struct xen_blkif *blkif, set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); /* Add new node and rebalance tree. */ rb_link_node(&(persistent_gnt->node), parent, new); - rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts); - blkif->persistent_gnt_c++; - atomic_inc(&blkif->persistent_gnt_in_use); + rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts); + ring->persistent_gnt_c++; + atomic_inc(&ring->persistent_gnt_in_use); return 0; } -static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, +static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring, grant_ref_t gref) { struct persistent_gnt *data; struct rb_node *node = NULL; - node = blkif->persistent_gnts.rb_node; + node = ring->persistent_gnts.rb_node; while (node) { data = container_of(node, struct persistent_gnt, node); @@ -255,24 +266,24 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, return NULL; } set_bit(PERSISTENT_GNT_ACTIVE, data->flags); - atomic_inc(&blkif->persistent_gnt_in_use); + atomic_inc(&ring->persistent_gnt_in_use); return data; } } return NULL; } -static void put_persistent_gnt(struct xen_blkif *blkif, +static void put_persistent_gnt(struct xen_blkif_ring *ring, struct persistent_gnt *persistent_gnt) { if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) pr_alert_ratelimited("freeing a grant already unused\n"); set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); - atomic_dec(&blkif->persistent_gnt_in_use); + atomic_dec(&ring->persistent_gnt_in_use); } -static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, +static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root, unsigned int num) { struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; @@ -303,7 +314,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, unmap_data.count = segs_to_unmap; BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); - put_free_pages(blkif, pages, segs_to_unmap); + put_free_pages(ring, pages, segs_to_unmap); segs_to_unmap = 0; } @@ -320,15 +331,15 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct persistent_gnt *persistent_gnt; int segs_to_unmap = 0; - struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work); + struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work); struct gntab_unmap_queue_data unmap_data; unmap_data.pages = pages; unmap_data.unmap_ops = unmap; unmap_data.kunmap_ops = NULL; - while(!list_empty(&blkif->persistent_purge_list)) { - persistent_gnt = list_first_entry(&blkif->persistent_purge_list, + while(!list_empty(&ring->persistent_purge_list)) { + persistent_gnt = list_first_entry(&ring->persistent_purge_list, struct persistent_gnt, remove_node); list_del(&persistent_gnt->remove_node); @@ -343,7 +354,7 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) { unmap_data.count = segs_to_unmap; BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); - put_free_pages(blkif, pages, segs_to_unmap); + put_free_pages(ring, pages, segs_to_unmap); segs_to_unmap = 0; } kfree(persistent_gnt); @@ -351,11 +362,11 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) if (segs_to_unmap > 0) { unmap_data.count = segs_to_unmap; BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); - put_free_pages(blkif, pages, segs_to_unmap); + put_free_pages(ring, pages, segs_to_unmap); } } -static void purge_persistent_gnt(struct xen_blkif *blkif) +static void purge_persistent_gnt(struct xen_blkif_ring *ring) { struct persistent_gnt *persistent_gnt; struct rb_node *n; @@ -363,23 +374,23 @@ static void purge_persistent_gnt(struct xen_blkif *blkif) bool scan_used = false, clean_used = false; struct rb_root *root; - if (blkif->persistent_gnt_c < xen_blkif_max_pgrants || - (blkif->persistent_gnt_c == xen_blkif_max_pgrants && - !blkif->vbd.overflow_max_grants)) { - return; + if (ring->persistent_gnt_c < xen_blkif_max_pgrants || + (ring->persistent_gnt_c == xen_blkif_max_pgrants && + !ring->blkif->vbd.overflow_max_grants)) { + goto out; } - if (work_busy(&blkif->persistent_purge_work)) { + if (work_busy(&ring->persistent_purge_work)) { pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n"); - return; + goto out; } num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN; - num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; - num_clean = min(blkif->persistent_gnt_c, num_clean); + num_clean = ring->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; + num_clean = min(ring->persistent_gnt_c, num_clean); if ((num_clean == 0) || - (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use)))) - return; + (num_clean > (ring->persistent_gnt_c - atomic_read(&ring->persistent_gnt_in_use)))) + goto out; /* * At this point, we can assure that there will be no calls @@ -394,8 +405,8 @@ static void purge_persistent_gnt(struct xen_blkif *blkif) pr_debug("Going to purge %u persistent grants\n", num_clean); - BUG_ON(!list_empty(&blkif->persistent_purge_list)); - root = &blkif->persistent_gnts; + BUG_ON(!list_empty(&ring->persistent_purge_list)); + root = &ring->persistent_gnts; purge_list: foreach_grant_safe(persistent_gnt, n, root, node) { BUG_ON(persistent_gnt->handle == @@ -414,7 +425,7 @@ purge_list: rb_erase(&persistent_gnt->node, root); list_add(&persistent_gnt->remove_node, - &blkif->persistent_purge_list); + &ring->persistent_purge_list); if (--num_clean == 0) goto finished; } @@ -435,30 +446,32 @@ finished: goto purge_list; } - blkif->persistent_gnt_c -= (total - num_clean); - blkif->vbd.overflow_max_grants = 0; + ring->persistent_gnt_c -= (total - num_clean); + ring->blkif->vbd.overflow_max_grants = 0; /* We can defer this work */ - schedule_work(&blkif->persistent_purge_work); + schedule_work(&ring->persistent_purge_work); pr_debug("Purged %u/%u\n", (total - num_clean), total); + +out: return; } /* * Retrieve from the 'pending_reqs' a free pending_req structure to be used. */ -static struct pending_req *alloc_req(struct xen_blkif *blkif) +static struct pending_req *alloc_req(struct xen_blkif_ring *ring) { struct pending_req *req = NULL; unsigned long flags; - spin_lock_irqsave(&blkif->pending_free_lock, flags); - if (!list_empty(&blkif->pending_free)) { - req = list_entry(blkif->pending_free.next, struct pending_req, + spin_lock_irqsave(&ring->pending_free_lock, flags); + if (!list_empty(&ring->pending_free)) { + req = list_entry(ring->pending_free.next, struct pending_req, free_list); list_del(&req->free_list); } - spin_unlock_irqrestore(&blkif->pending_free_lock, flags); + spin_unlock_irqrestore(&ring->pending_free_lock, flags); return req; } @@ -466,17 +479,17 @@ static struct pending_req *alloc_req(struct xen_blkif *blkif) * Return the 'pending_req' structure back to the freepool. We also * wake up the thread if it was waiting for a free page. */ -static void free_req(struct xen_blkif *blkif, struct pending_req *req) +static void free_req(struct xen_blkif_ring *ring, struct pending_req *req) { unsigned long flags; int was_empty; - spin_lock_irqsave(&blkif->pending_free_lock, flags); - was_empty = list_empty(&blkif->pending_free); - list_add(&req->free_list, &blkif->pending_free); - spin_unlock_irqrestore(&blkif->pending_free_lock, flags); + spin_lock_irqsave(&ring->pending_free_lock, flags); + was_empty = list_empty(&ring->pending_free); + list_add(&req->free_list, &ring->pending_free); + spin_unlock_irqrestore(&ring->pending_free_lock, flags); if (was_empty) - wake_up(&blkif->pending_free_wq); + wake_up(&ring->pending_free_wq); } /* @@ -556,10 +569,10 @@ abort: /* * Notification from the guest OS. */ -static void blkif_notify_work(struct xen_blkif *blkif) +static void blkif_notify_work(struct xen_blkif_ring *ring) { - blkif->waiting_reqs = 1; - wake_up(&blkif->wq); + ring->waiting_reqs = 1; + wake_up(&ring->wq); } irqreturn_t xen_blkif_be_int(int irq, void *dev_id) @@ -572,31 +585,33 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id) * SCHEDULER FUNCTIONS */ -static void print_stats(struct xen_blkif *blkif) +static void print_stats(struct xen_blkif_ring *ring) { pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" " | ds %4llu | pg: %4u/%4d\n", - current->comm, blkif->st_oo_req, - blkif->st_rd_req, blkif->st_wr_req, - blkif->st_f_req, blkif->st_ds_req, - blkif->persistent_gnt_c, + current->comm, ring->st_oo_req, + ring->st_rd_req, ring->st_wr_req, + ring->st_f_req, ring->st_ds_req, + ring->persistent_gnt_c, xen_blkif_max_pgrants); - blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); - blkif->st_rd_req = 0; - blkif->st_wr_req = 0; - blkif->st_oo_req = 0; - blkif->st_ds_req = 0; + ring->st_print = jiffies + msecs_to_jiffies(10 * 1000); + ring->st_rd_req = 0; + ring->st_wr_req = 0; + ring->st_oo_req = 0; + ring->st_ds_req = 0; } int xen_blkif_schedule(void *arg) { - struct xen_blkif *blkif = arg; + struct xen_blkif_ring *ring = arg; + struct xen_blkif *blkif = ring->blkif; struct xen_vbd *vbd = &blkif->vbd; unsigned long timeout; int ret; xen_blkif_get(blkif); + set_freezable(); while (!kthread_should_stop()) { if (try_to_freeze()) continue; @@ -606,50 +621,50 @@ int xen_blkif_schedule(void *arg) timeout = msecs_to_jiffies(LRU_INTERVAL); timeout = wait_event_interruptible_timeout( - blkif->wq, - blkif->waiting_reqs || kthread_should_stop(), + ring->wq, + ring->waiting_reqs || kthread_should_stop(), timeout); if (timeout == 0) goto purge_gnt_list; timeout = wait_event_interruptible_timeout( - blkif->pending_free_wq, - !list_empty(&blkif->pending_free) || + ring->pending_free_wq, + !list_empty(&ring->pending_free) || kthread_should_stop(), timeout); if (timeout == 0) goto purge_gnt_list; - blkif->waiting_reqs = 0; + ring->waiting_reqs = 0; smp_mb(); /* clear flag *before* checking for work */ - ret = do_block_io_op(blkif); + ret = do_block_io_op(ring); if (ret > 0) - blkif->waiting_reqs = 1; + ring->waiting_reqs = 1; if (ret == -EACCES) - wait_event_interruptible(blkif->shutdown_wq, + wait_event_interruptible(ring->shutdown_wq, kthread_should_stop()); purge_gnt_list: if (blkif->vbd.feature_gnt_persistent && - time_after(jiffies, blkif->next_lru)) { - purge_persistent_gnt(blkif); - blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); + time_after(jiffies, ring->next_lru)) { + purge_persistent_gnt(ring); + ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); } /* Shrink if we have more than xen_blkif_max_buffer_pages */ - shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages); + shrink_free_pagepool(ring, xen_blkif_max_buffer_pages); - if (log_stats && time_after(jiffies, blkif->st_print)) - print_stats(blkif); + if (log_stats && time_after(jiffies, ring->st_print)) + print_stats(ring); } /* Drain pending purge work */ - flush_work(&blkif->persistent_purge_work); + flush_work(&ring->persistent_purge_work); if (log_stats) - print_stats(blkif); + print_stats(ring); - blkif->xenblkd = NULL; + ring->xenblkd = NULL; xen_blkif_put(blkif); return 0; @@ -658,22 +673,22 @@ purge_gnt_list: /* * Remove persistent grants and empty the pool of free pages */ -void xen_blkbk_free_caches(struct xen_blkif *blkif) +void xen_blkbk_free_caches(struct xen_blkif_ring *ring) { /* Free all persistent grant pages */ - if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) - free_persistent_gnts(blkif, &blkif->persistent_gnts, - blkif->persistent_gnt_c); + if (!RB_EMPTY_ROOT(&ring->persistent_gnts)) + free_persistent_gnts(ring, &ring->persistent_gnts, + ring->persistent_gnt_c); - BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); - blkif->persistent_gnt_c = 0; + BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts)); + ring->persistent_gnt_c = 0; /* Since we are shutting down remove all pages from the buffer */ - shrink_free_pagepool(blkif, 0 /* All */); + shrink_free_pagepool(ring, 0 /* All */); } static unsigned int xen_blkbk_unmap_prepare( - struct xen_blkif *blkif, + struct xen_blkif_ring *ring, struct grant_page **pages, unsigned int num, struct gnttab_unmap_grant_ref *unmap_ops, @@ -683,7 +698,7 @@ static unsigned int xen_blkbk_unmap_prepare( for (i = 0; i < num; i++) { if (pages[i]->persistent_gnt != NULL) { - put_persistent_gnt(blkif, pages[i]->persistent_gnt); + put_persistent_gnt(ring, pages[i]->persistent_gnt); continue; } if (pages[i]->handle == BLKBACK_INVALID_HANDLE) @@ -700,17 +715,18 @@ static unsigned int xen_blkbk_unmap_prepare( static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data) { - struct pending_req* pending_req = (struct pending_req*) (data->data); - struct xen_blkif *blkif = pending_req->blkif; + struct pending_req *pending_req = (struct pending_req *)(data->data); + struct xen_blkif_ring *ring = pending_req->ring; + struct xen_blkif *blkif = ring->blkif; /* BUG_ON used to reproduce existing behaviour, but is this the best way to deal with this? */ BUG_ON(result); - put_free_pages(blkif, data->pages, data->count); - make_response(blkif, pending_req->id, + put_free_pages(ring, data->pages, data->count); + make_response(ring, pending_req->id, pending_req->operation, pending_req->status); - free_req(blkif, pending_req); + free_req(ring, pending_req); /* * Make sure the request is freed before releasing blkif, * or there could be a race between free_req and the @@ -723,7 +739,7 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_ * pending_free_wq if there's a drain going on, but it has * to be taken into account if the current model is changed. */ - if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) { + if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) { complete(&blkif->drain_complete); } xen_blkif_put(blkif); @@ -732,11 +748,11 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_ static void xen_blkbk_unmap_and_respond(struct pending_req *req) { struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data; - struct xen_blkif *blkif = req->blkif; + struct xen_blkif_ring *ring = req->ring; struct grant_page **pages = req->segments; unsigned int invcount; - invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_segs, + invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs, req->unmap, req->unmap_pages); work->data = req; @@ -757,7 +773,7 @@ static void xen_blkbk_unmap_and_respond(struct pending_req *req) * of hypercalls, but since this is only used in error paths there's * no real need. */ -static void xen_blkbk_unmap(struct xen_blkif *blkif, +static void xen_blkbk_unmap(struct xen_blkif_ring *ring, struct grant_page *pages[], int num) { @@ -768,20 +784,20 @@ static void xen_blkbk_unmap(struct xen_blkif *blkif, while (num) { unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST); - - invcount = xen_blkbk_unmap_prepare(blkif, pages, batch, + + invcount = xen_blkbk_unmap_prepare(ring, pages, batch, unmap, unmap_pages); if (invcount) { ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount); BUG_ON(ret); - put_free_pages(blkif, unmap_pages, invcount); + put_free_pages(ring, unmap_pages, invcount); } pages += batch; num -= batch; } } -static int xen_blkbk_map(struct xen_blkif *blkif, +static int xen_blkbk_map(struct xen_blkif_ring *ring, struct grant_page *pages[], int num, bool ro) { @@ -794,6 +810,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif, int ret = 0; int last_map = 0, map_until = 0; int use_persistent_gnts; + struct xen_blkif *blkif = ring->blkif; use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); @@ -806,10 +823,11 @@ again: for (i = map_until; i < num; i++) { uint32_t flags; - if (use_persistent_gnts) + if (use_persistent_gnts) { persistent_gnt = get_persistent_gnt( - blkif, + ring, pages[i]->gref); + } if (persistent_gnt) { /* @@ -819,7 +837,7 @@ again: pages[i]->page = persistent_gnt->page; pages[i]->persistent_gnt = persistent_gnt; } else { - if (get_free_page(blkif, &pages[i]->page)) + if (get_free_page(ring, &pages[i]->page)) goto out_of_memory; addr = vaddr(pages[i]->page); pages_to_gnt[segs_to_map] = pages[i]->page; @@ -852,7 +870,7 @@ again: BUG_ON(new_map_idx >= segs_to_map); if (unlikely(map[new_map_idx].status != 0)) { pr_debug("invalid buffer -- could not remap it\n"); - put_free_pages(blkif, &pages[seg_idx]->page, 1); + put_free_pages(ring, &pages[seg_idx]->page, 1); pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE; ret |= 1; goto next; @@ -862,7 +880,7 @@ again: continue; } if (use_persistent_gnts && - blkif->persistent_gnt_c < xen_blkif_max_pgrants) { + ring->persistent_gnt_c < xen_blkif_max_pgrants) { /* * We are using persistent grants, the grant is * not mapped but we might have room for it. @@ -880,7 +898,7 @@ again: persistent_gnt->gnt = map[new_map_idx].ref; persistent_gnt->handle = map[new_map_idx].handle; persistent_gnt->page = pages[seg_idx]->page; - if (add_persistent_gnt(blkif, + if (add_persistent_gnt(ring, persistent_gnt)) { kfree(persistent_gnt); persistent_gnt = NULL; @@ -888,7 +906,7 @@ again: } pages[seg_idx]->persistent_gnt = persistent_gnt; pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n", - persistent_gnt->gnt, blkif->persistent_gnt_c, + persistent_gnt->gnt, ring->persistent_gnt_c, xen_blkif_max_pgrants); goto next; } @@ -913,7 +931,7 @@ next: out_of_memory: pr_alert("%s: out of memory\n", __func__); - put_free_pages(blkif, pages_to_gnt, segs_to_map); + put_free_pages(ring, pages_to_gnt, segs_to_map); return -ENOMEM; } @@ -921,7 +939,7 @@ static int xen_blkbk_map_seg(struct pending_req *pending_req) { int rc; - rc = xen_blkbk_map(pending_req->blkif, pending_req->segments, + rc = xen_blkbk_map(pending_req->ring, pending_req->segments, pending_req->nr_segs, (pending_req->operation != BLKIF_OP_READ)); @@ -934,7 +952,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, struct phys_req *preq) { struct grant_page **pages = pending_req->indirect_pages; - struct xen_blkif *blkif = pending_req->blkif; + struct xen_blkif_ring *ring = pending_req->ring; int indirect_grefs, rc, n, nseg, i; struct blkif_request_segment *segments = NULL; @@ -945,7 +963,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, for (i = 0; i < indirect_grefs; i++) pages[i]->gref = req->u.indirect.indirect_grefs[i]; - rc = xen_blkbk_map(blkif, pages, indirect_grefs, true); + rc = xen_blkbk_map(ring, pages, indirect_grefs, true); if (rc) goto unmap; @@ -977,15 +995,16 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, unmap: if (segments) kunmap_atomic(segments); - xen_blkbk_unmap(blkif, pages, indirect_grefs); + xen_blkbk_unmap(ring, pages, indirect_grefs); return rc; } -static int dispatch_discard_io(struct xen_blkif *blkif, +static int dispatch_discard_io(struct xen_blkif_ring *ring, struct blkif_request *req) { int err = 0; int status = BLKIF_RSP_OKAY; + struct xen_blkif *blkif = ring->blkif; struct block_device *bdev = blkif->vbd.bdev; unsigned long secure; struct phys_req preq; @@ -1002,7 +1021,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif, preq.sector_number + preq.nr_sects, blkif->vbd.pdevice); goto fail_response; } - blkif->st_ds_req++; + ring->st_ds_req++; secure = (blkif->vbd.discard_secure && (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? @@ -1018,26 +1037,28 @@ fail_response: } else if (err) status = BLKIF_RSP_ERROR; - make_response(blkif, req->u.discard.id, req->operation, status); + make_response(ring, req->u.discard.id, req->operation, status); xen_blkif_put(blkif); return err; } -static int dispatch_other_io(struct xen_blkif *blkif, +static int dispatch_other_io(struct xen_blkif_ring *ring, struct blkif_request *req, struct pending_req *pending_req) { - free_req(blkif, pending_req); - make_response(blkif, req->u.other.id, req->operation, + free_req(ring, pending_req); + make_response(ring, req->u.other.id, req->operation, BLKIF_RSP_EOPNOTSUPP); return -EIO; } -static void xen_blk_drain_io(struct xen_blkif *blkif) +static void xen_blk_drain_io(struct xen_blkif_ring *ring) { + struct xen_blkif *blkif = ring->blkif; + atomic_set(&blkif->drain, 1); do { - if (atomic_read(&blkif->inflight) == 0) + if (atomic_read(&ring->inflight) == 0) break; wait_for_completion_interruptible_timeout( &blkif->drain_complete, HZ); @@ -1058,12 +1079,12 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) && (error == -EOPNOTSUPP)) { pr_debug("flush diskcache op failed, not supported\n"); - xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0); + xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && (error == -EOPNOTSUPP)) { pr_debug("write barrier op failed, not supported\n"); - xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0); + xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; } else if (error) { pr_debug("Buffer not up-to-date at end of operation," @@ -1097,9 +1118,9 @@ static void end_block_io_op(struct bio *bio) * and transmute it to the block API to hand it over to the proper block disk. */ static int -__do_block_io_op(struct xen_blkif *blkif) +__do_block_io_op(struct xen_blkif_ring *ring) { - union blkif_back_rings *blk_rings = &blkif->blk_rings; + union blkif_back_rings *blk_rings = &ring->blk_rings; struct blkif_request req; struct pending_req *pending_req; RING_IDX rc, rp; @@ -1112,7 +1133,7 @@ __do_block_io_op(struct xen_blkif *blkif) if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) { rc = blk_rings->common.rsp_prod_pvt; pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n", - rp, rc, rp - rc, blkif->vbd.pdevice); + rp, rc, rp - rc, ring->blkif->vbd.pdevice); return -EACCES; } while (rc != rp) { @@ -1125,14 +1146,14 @@ __do_block_io_op(struct xen_blkif *blkif) break; } - pending_req = alloc_req(blkif); + pending_req = alloc_req(ring); if (NULL == pending_req) { - blkif->st_oo_req++; + ring->st_oo_req++; more_to_do = 1; break; } - switch (blkif->blk_protocol) { + switch (ring->blkif->blk_protocol) { case BLKIF_PROTOCOL_NATIVE: memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req)); break; @@ -1156,16 +1177,16 @@ __do_block_io_op(struct xen_blkif *blkif) case BLKIF_OP_WRITE_BARRIER: case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_INDIRECT: - if (dispatch_rw_block_io(blkif, &req, pending_req)) + if (dispatch_rw_block_io(ring, &req, pending_req)) goto done; break; case BLKIF_OP_DISCARD: - free_req(blkif, pending_req); - if (dispatch_discard_io(blkif, &req)) + free_req(ring, pending_req); + if (dispatch_discard_io(ring, &req)) goto done; break; default: - if (dispatch_other_io(blkif, &req, pending_req)) + if (dispatch_other_io(ring, &req, pending_req)) goto done; break; } @@ -1178,13 +1199,13 @@ done: } static int -do_block_io_op(struct xen_blkif *blkif) +do_block_io_op(struct xen_blkif_ring *ring) { - union blkif_back_rings *blk_rings = &blkif->blk_rings; + union blkif_back_rings *blk_rings = &ring->blk_rings; int more_to_do; do { - more_to_do = __do_block_io_op(blkif); + more_to_do = __do_block_io_op(ring); if (more_to_do) break; @@ -1197,7 +1218,7 @@ do_block_io_op(struct xen_blkif *blkif) * Transmutation of the 'struct blkif_request' to a proper 'struct bio' * and call the 'submit_bio' to pass it to the underlying storage. */ -static int dispatch_rw_block_io(struct xen_blkif *blkif, +static int dispatch_rw_block_io(struct xen_blkif_ring *ring, struct blkif_request *req, struct pending_req *pending_req) { @@ -1225,17 +1246,17 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, switch (req_operation) { case BLKIF_OP_READ: - blkif->st_rd_req++; + ring->st_rd_req++; operation = READ; break; case BLKIF_OP_WRITE: - blkif->st_wr_req++; + ring->st_wr_req++; operation = WRITE_ODIRECT; break; case BLKIF_OP_WRITE_BARRIER: drain = true; case BLKIF_OP_FLUSH_DISKCACHE: - blkif->st_f_req++; + ring->st_f_req++; operation = WRITE_FLUSH; break; default: @@ -1260,7 +1281,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, preq.nr_sects = 0; - pending_req->blkif = blkif; + pending_req->ring = ring; pending_req->id = req->u.rw.id; pending_req->operation = req_operation; pending_req->status = BLKIF_RSP_OKAY; @@ -1287,12 +1308,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, goto fail_response; } - if (xen_vbd_translate(&preq, blkif, operation) != 0) { + if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) { pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n", operation == READ ? "read" : "write", preq.sector_number, preq.sector_number + preq.nr_sects, - blkif->vbd.pdevice); + ring->blkif->vbd.pdevice); goto fail_response; } @@ -1304,7 +1325,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, if (((int)preq.sector_number|(int)seg[i].nsec) & ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { pr_debug("Misaligned I/O request from domain %d\n", - blkif->domid); + ring->blkif->domid); goto fail_response; } } @@ -1313,7 +1334,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, * issue the WRITE_FLUSH. */ if (drain) - xen_blk_drain_io(pending_req->blkif); + xen_blk_drain_io(pending_req->ring); /* * If we have failed at this point, we need to undo the M2P override, @@ -1328,8 +1349,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, * This corresponding xen_blkif_put is done in __end_block_io_op, or * below (in "!bio") if we are handling a BLKIF_OP_DISCARD. */ - xen_blkif_get(blkif); - atomic_inc(&blkif->inflight); + xen_blkif_get(ring->blkif); + atomic_inc(&ring->inflight); for (i = 0; i < nseg; i++) { while ((bio == NULL) || @@ -1377,19 +1398,19 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, blk_finish_plug(&plug); if (operation == READ) - blkif->st_rd_sect += preq.nr_sects; + ring->st_rd_sect += preq.nr_sects; else if (operation & WRITE) - blkif->st_wr_sect += preq.nr_sects; + ring->st_wr_sect += preq.nr_sects; return 0; fail_flush: - xen_blkbk_unmap(blkif, pending_req->segments, + xen_blkbk_unmap(ring, pending_req->segments, pending_req->nr_segs); fail_response: /* Haven't submitted any bio's yet. */ - make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); - free_req(blkif, pending_req); + make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); + free_req(ring, pending_req); msleep(1); /* back off a bit */ return -EIO; @@ -1407,21 +1428,22 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, /* * Put a response on the ring on how the operation fared. */ -static void make_response(struct xen_blkif *blkif, u64 id, +static void make_response(struct xen_blkif_ring *ring, u64 id, unsigned short op, int st) { struct blkif_response resp; unsigned long flags; - union blkif_back_rings *blk_rings = &blkif->blk_rings; + union blkif_back_rings *blk_rings; int notify; resp.id = id; resp.operation = op; resp.status = st; - spin_lock_irqsave(&blkif->blk_ring_lock, flags); + spin_lock_irqsave(&ring->blk_ring_lock, flags); + blk_rings = &ring->blk_rings; /* Place on the response ring for the relevant domain. */ - switch (blkif->blk_protocol) { + switch (ring->blkif->blk_protocol) { case BLKIF_PROTOCOL_NATIVE: memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt), &resp, sizeof(resp)); @@ -1439,9 +1461,9 @@ static void make_response(struct xen_blkif *blkif, u64 id, } blk_rings->common.rsp_prod_pvt++; RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); - spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); + spin_unlock_irqrestore(&ring->blk_ring_lock, flags); if (notify) - notify_remote_via_irq(blkif->irq); + notify_remote_via_irq(ring->irq); } static int __init xen_blkif_init(void) @@ -1457,6 +1479,9 @@ static int __init xen_blkif_init(void) xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; } + if (xenblk_max_queues == 0) + xenblk_max_queues = num_online_cpus(); + rc = xen_blkif_interface_init(); if (rc) goto failed_init; diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index c929ae2..dea61f6 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -46,6 +46,7 @@ #include <xen/interface/io/protocols.h> extern unsigned int xen_blkif_max_ring_order; +extern unsigned int xenblk_max_queues; /* * This is the maximum number of segments that would be allowed in indirect * requests. This value will also be passed to the frontend. @@ -269,68 +270,79 @@ struct persistent_gnt { struct list_head remove_node; }; -struct xen_blkif { - /* Unique identifier for this interface. */ - domid_t domid; - unsigned int handle; +/* Per-ring information. */ +struct xen_blkif_ring { /* Physical parameters of the comms window. */ unsigned int irq; - /* Comms information. */ - enum blkif_protocol blk_protocol; union blkif_back_rings blk_rings; void *blk_ring; - /* The VBD attached to this interface. */ - struct xen_vbd vbd; - /* Back pointer to the backend_info. */ - struct backend_info *be; /* Private fields. */ spinlock_t blk_ring_lock; - atomic_t refcnt; wait_queue_head_t wq; - /* for barrier (drain) requests */ - struct completion drain_complete; - atomic_t drain; atomic_t inflight; - /* One thread per one blkif. */ + /* One thread per blkif ring. */ struct task_struct *xenblkd; unsigned int waiting_reqs; - /* tree to store persistent grants */ + /* List of all 'pending_req' available */ + struct list_head pending_free; + /* And its spinlock. */ + spinlock_t pending_free_lock; + wait_queue_head_t pending_free_wq; + + /* Tree to store persistent grants. */ + spinlock_t pers_gnts_lock; struct rb_root persistent_gnts; unsigned int persistent_gnt_c; atomic_t persistent_gnt_in_use; unsigned long next_lru; - /* used by the kworker that offload work from the persistent purge */ + /* Statistics. */ + unsigned long st_print; + unsigned long long st_rd_req; + unsigned long long st_wr_req; + unsigned long long st_oo_req; + unsigned long long st_f_req; + unsigned long long st_ds_req; + unsigned long long st_rd_sect; + unsigned long long st_wr_sect; + + /* Used by the kworker that offload work from the persistent purge. */ struct list_head persistent_purge_list; struct work_struct persistent_purge_work; - /* buffer of free pages to map grant refs */ + /* Buffer of free pages to map grant refs. */ spinlock_t free_pages_lock; int free_pages_num; struct list_head free_pages; - /* List of all 'pending_req' available */ - struct list_head pending_free; - /* And its spinlock. */ - spinlock_t pending_free_lock; - wait_queue_head_t pending_free_wq; - - /* statistics */ - unsigned long st_print; - unsigned long long st_rd_req; - unsigned long long st_wr_req; - unsigned long long st_oo_req; - unsigned long long st_f_req; - unsigned long long st_ds_req; - unsigned long long st_rd_sect; - unsigned long long st_wr_sect; - struct work_struct free_work; /* Thread shutdown wait queue. */ wait_queue_head_t shutdown_wq; - unsigned int nr_ring_pages; + struct xen_blkif *blkif; +}; + +struct xen_blkif { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + /* Comms information. */ + enum blkif_protocol blk_protocol; + /* The VBD attached to this interface. */ + struct xen_vbd vbd; + /* Back pointer to the backend_info. */ + struct backend_info *be; + atomic_t refcnt; + /* for barrier (drain) requests */ + struct completion drain_complete; + atomic_t drain; + + struct work_struct free_work; + unsigned int nr_ring_pages; + /* All rings for this device. */ + struct xen_blkif_ring *rings; + unsigned int nr_rings; }; struct seg_buf { @@ -352,7 +364,7 @@ struct grant_page { * response queued for it, with the saved 'id' passed back. */ struct pending_req { - struct xen_blkif *blkif; + struct xen_blkif_ring *ring; u64 id; int nr_segs; atomic_t pendcnt; @@ -394,7 +406,7 @@ int xen_blkif_xenbus_init(void); irqreturn_t xen_blkif_be_int(int irq, void *dev_id); int xen_blkif_schedule(void *arg); int xen_blkif_purge_persistent(void *arg); -void xen_blkbk_free_caches(struct xen_blkif *blkif); +void xen_blkbk_free_caches(struct xen_blkif_ring *ring); int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, struct backend_info *be, int state); diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index f53cff4..876763f 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -86,9 +86,11 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) { int err; char name[BLKBACK_NAME_LEN]; + struct xen_blkif_ring *ring; + int i; /* Not ready to connect? */ - if (!blkif->irq || !blkif->vbd.bdev) + if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev) return; /* Already connected? */ @@ -113,13 +115,55 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) } invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping); - blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, "%s", name); - if (IS_ERR(blkif->xenblkd)) { - err = PTR_ERR(blkif->xenblkd); - blkif->xenblkd = NULL; - xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); - return; + for (i = 0; i < blkif->nr_rings; i++) { + ring = &blkif->rings[i]; + ring->xenblkd = kthread_run(xen_blkif_schedule, ring, "%s-%d", name, i); + if (IS_ERR(ring->xenblkd)) { + err = PTR_ERR(ring->xenblkd); + ring->xenblkd = NULL; + xenbus_dev_fatal(blkif->be->dev, err, + "start %s-%d xenblkd", name, i); + goto out; + } + } + return; + +out: + while (--i >= 0) { + ring = &blkif->rings[i]; + kthread_stop(ring->xenblkd); + } + return; +} + +static int xen_blkif_alloc_rings(struct xen_blkif *blkif) +{ + unsigned int r; + + blkif->rings = kzalloc(blkif->nr_rings * sizeof(struct xen_blkif_ring), GFP_KERNEL); + if (!blkif->rings) + return -ENOMEM; + + for (r = 0; r < blkif->nr_rings; r++) { + struct xen_blkif_ring *ring = &blkif->rings[r]; + + spin_lock_init(&ring->blk_ring_lock); + init_waitqueue_head(&ring->wq); + INIT_LIST_HEAD(&ring->pending_free); + INIT_LIST_HEAD(&ring->persistent_purge_list); + INIT_WORK(&ring->persistent_purge_work, xen_blkbk_unmap_purged_grants); + spin_lock_init(&ring->free_pages_lock); + INIT_LIST_HEAD(&ring->free_pages); + + spin_lock_init(&ring->pending_free_lock); + init_waitqueue_head(&ring->pending_free_wq); + init_waitqueue_head(&ring->shutdown_wq); + ring->blkif = blkif; + ring->st_print = jiffies; + xen_blkif_get(blkif); } + + return 0; } static struct xen_blkif *xen_blkif_alloc(domid_t domid) @@ -133,41 +177,25 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) return ERR_PTR(-ENOMEM); blkif->domid = domid; - spin_lock_init(&blkif->blk_ring_lock); atomic_set(&blkif->refcnt, 1); - init_waitqueue_head(&blkif->wq); init_completion(&blkif->drain_complete); - atomic_set(&blkif->drain, 0); - blkif->st_print = jiffies; - blkif->persistent_gnts.rb_node = NULL; - spin_lock_init(&blkif->free_pages_lock); - INIT_LIST_HEAD(&blkif->free_pages); - INIT_LIST_HEAD(&blkif->persistent_purge_list); - blkif->free_pages_num = 0; - atomic_set(&blkif->persistent_gnt_in_use, 0); - atomic_set(&blkif->inflight, 0); - INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants); - - INIT_LIST_HEAD(&blkif->pending_free); INIT_WORK(&blkif->free_work, xen_blkif_deferred_free); - spin_lock_init(&blkif->pending_free_lock); - init_waitqueue_head(&blkif->pending_free_wq); - init_waitqueue_head(&blkif->shutdown_wq); return blkif; } -static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, +static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref, unsigned int nr_grefs, unsigned int evtchn) { int err; + struct xen_blkif *blkif = ring->blkif; /* Already connected through? */ - if (blkif->irq) + if (ring->irq) return 0; err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs, - &blkif->blk_ring); + &ring->blk_ring); if (err < 0) return err; @@ -175,24 +203,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, case BLKIF_PROTOCOL_NATIVE: { struct blkif_sring *sring; - sring = (struct blkif_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.native, sring, + sring = (struct blkif_sring *)ring->blk_ring; + BACK_RING_INIT(&ring->blk_rings.native, sring, XEN_PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_32: { struct blkif_x86_32_sring *sring_x86_32; - sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, + sring_x86_32 = (struct blkif_x86_32_sring *)ring->blk_ring; + BACK_RING_INIT(&ring->blk_rings.x86_32, sring_x86_32, XEN_PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_64: { struct blkif_x86_64_sring *sring_x86_64; - sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, + sring_x86_64 = (struct blkif_x86_64_sring *)ring->blk_ring; + BACK_RING_INIT(&ring->blk_rings.x86_64, sring_x86_64, XEN_PAGE_SIZE * nr_grefs); break; } @@ -202,13 +230,13 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn, xen_blkif_be_int, 0, - "blkif-backend", blkif); + "blkif-backend", ring); if (err < 0) { - xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); - blkif->blk_rings.common.sring = NULL; + xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring); + ring->blk_rings.common.sring = NULL; return err; } - blkif->irq = err; + ring->irq = err; return 0; } @@ -216,50 +244,69 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, static int xen_blkif_disconnect(struct xen_blkif *blkif) { struct pending_req *req, *n; - int i = 0, j; + unsigned int j, r; - if (blkif->xenblkd) { - kthread_stop(blkif->xenblkd); - wake_up(&blkif->shutdown_wq); - blkif->xenblkd = NULL; - } + for (r = 0; r < blkif->nr_rings; r++) { + struct xen_blkif_ring *ring = &blkif->rings[r]; + unsigned int i = 0; - /* The above kthread_stop() guarantees that at this point we - * don't have any discard_io or other_io requests. So, checking - * for inflight IO is enough. - */ - if (atomic_read(&blkif->inflight) > 0) - return -EBUSY; + if (ring->xenblkd) { + kthread_stop(ring->xenblkd); + wake_up(&ring->shutdown_wq); + ring->xenblkd = NULL; + } - if (blkif->irq) { - unbind_from_irqhandler(blkif->irq, blkif); - blkif->irq = 0; - } + /* The above kthread_stop() guarantees that at this point we + * don't have any discard_io or other_io requests. So, checking + * for inflight IO is enough. + */ + if (atomic_read(&ring->inflight) > 0) + return -EBUSY; - if (blkif->blk_rings.common.sring) { - xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); - blkif->blk_rings.common.sring = NULL; - } + if (ring->irq) { + unbind_from_irqhandler(ring->irq, ring); + ring->irq = 0; + } - /* Remove all persistent grants and the cache of ballooned pages. */ - xen_blkbk_free_caches(blkif); + if (ring->blk_rings.common.sring) { + xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring); + ring->blk_rings.common.sring = NULL; + } - /* Check that there is no request in use */ - list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { - list_del(&req->free_list); + /* Remove all persistent grants and the cache of ballooned pages. */ + xen_blkbk_free_caches(ring); - for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) - kfree(req->segments[j]); + /* Check that there is no request in use */ + list_for_each_entry_safe(req, n, &ring->pending_free, free_list) { + list_del(&req->free_list); - for (j = 0; j < MAX_INDIRECT_PAGES; j++) - kfree(req->indirect_pages[j]); + for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) + kfree(req->segments[j]); - kfree(req); - i++; - } + for (j = 0; j < MAX_INDIRECT_PAGES; j++) + kfree(req->indirect_pages[j]); + + kfree(req); + i++; + } - WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages)); + BUG_ON(atomic_read(&ring->persistent_gnt_in_use) != 0); + BUG_ON(!list_empty(&ring->persistent_purge_list)); + BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts)); + BUG_ON(!list_empty(&ring->free_pages)); + BUG_ON(ring->free_pages_num != 0); + BUG_ON(ring->persistent_gnt_c != 0); + WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages)); + xen_blkif_put(blkif); + } blkif->nr_ring_pages = 0; + /* + * blkif->rings was allocated in connect_ring, so we should free it in + * here. + */ + kfree(blkif->rings); + blkif->rings = NULL; + blkif->nr_rings = 0; return 0; } @@ -271,13 +318,6 @@ static void xen_blkif_free(struct xen_blkif *blkif) xen_vbd_free(&blkif->vbd); /* Make sure everything is drained before shutting down */ - BUG_ON(blkif->persistent_gnt_c != 0); - BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0); - BUG_ON(blkif->free_pages_num != 0); - BUG_ON(!list_empty(&blkif->persistent_purge_list)); - BUG_ON(!list_empty(&blkif->free_pages)); - BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); - kmem_cache_free(xen_blkif_cachep, blkif); } @@ -296,25 +336,38 @@ int __init xen_blkif_interface_init(void) * sysfs interface for VBD I/O requests */ -#define VBD_SHOW(name, format, args...) \ +#define VBD_SHOW_ALLRING(name, format) \ static ssize_t show_##name(struct device *_dev, \ struct device_attribute *attr, \ char *buf) \ { \ struct xenbus_device *dev = to_xenbus_device(_dev); \ struct backend_info *be = dev_get_drvdata(&dev->dev); \ + struct xen_blkif *blkif = be->blkif; \ + unsigned int i; \ + unsigned long long result = 0; \ \ - return sprintf(buf, format, ##args); \ + if (!blkif->rings) \ + goto out; \ + \ + for (i = 0; i < blkif->nr_rings; i++) { \ + struct xen_blkif_ring *ring = &blkif->rings[i]; \ + \ + result += ring->st_##name; \ + } \ + \ +out: \ + return sprintf(buf, format, result); \ } \ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) -VBD_SHOW(oo_req, "%llu\n", be->blkif->st_oo_req); -VBD_SHOW(rd_req, "%llu\n", be->blkif->st_rd_req); -VBD_SHOW(wr_req, "%llu\n", be->blkif->st_wr_req); -VBD_SHOW(f_req, "%llu\n", be->blkif->st_f_req); -VBD_SHOW(ds_req, "%llu\n", be->blkif->st_ds_req); -VBD_SHOW(rd_sect, "%llu\n", be->blkif->st_rd_sect); -VBD_SHOW(wr_sect, "%llu\n", be->blkif->st_wr_sect); +VBD_SHOW_ALLRING(oo_req, "%llu\n"); +VBD_SHOW_ALLRING(rd_req, "%llu\n"); +VBD_SHOW_ALLRING(wr_req, "%llu\n"); +VBD_SHOW_ALLRING(f_req, "%llu\n"); +VBD_SHOW_ALLRING(ds_req, "%llu\n"); +VBD_SHOW_ALLRING(rd_sect, "%llu\n"); +VBD_SHOW_ALLRING(wr_sect, "%llu\n"); static struct attribute *xen_vbdstat_attrs[] = { &dev_attr_oo_req.attr, @@ -332,6 +385,18 @@ static struct attribute_group xen_vbdstat_group = { .attrs = xen_vbdstat_attrs, }; +#define VBD_SHOW(name, format, args...) \ + static ssize_t show_##name(struct device *_dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + struct xenbus_device *dev = to_xenbus_device(_dev); \ + struct backend_info *be = dev_get_drvdata(&dev->dev); \ + \ + return sprintf(buf, format, ##args); \ + } \ + static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); VBD_SHOW(mode, "%s\n", be->mode); @@ -440,11 +505,11 @@ static int xen_blkbk_remove(struct xenbus_device *dev) dev_set_drvdata(&dev->dev, NULL); - if (be->blkif) { + if (be->blkif) xen_blkif_disconnect(be->blkif); - xen_blkif_put(be->blkif); - } + /* Put the reference we set in xen_blkif_alloc(). */ + xen_blkif_put(be->blkif); kfree(be->mode); kfree(be); return 0; @@ -553,6 +618,12 @@ static int xen_blkbk_probe(struct xenbus_device *dev, goto fail; } + /* Multi-queue: advertise how many queues are supported by us.*/ + err = xenbus_printf(XBT_NIL, dev->nodename, + "multi-queue-max-queues", "%u", xenblk_max_queues); + if (err) + pr_warn("Error writing multi-queue-max-queues\n"); + /* setup back pointer */ be->blkif->be = be; @@ -708,8 +779,14 @@ static void frontend_changed(struct xenbus_device *dev, } err = connect_ring(be); - if (err) + if (err) { + /* + * Clean up so that memory resources can be used by + * other devices. connect_ring reported already error. + */ + xen_blkif_disconnect(be->blkif); break; + } xen_update_blkif_status(be->blkif); break; @@ -825,50 +902,43 @@ again: xenbus_transaction_end(xbt, 1); } - -static int connect_ring(struct backend_info *be) +/* + * Each ring may have multi pages, depends on "ring-page-order". + */ +static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir) { - struct xenbus_device *dev = be->dev; unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; - unsigned int evtchn, nr_grefs, ring_page_order; - unsigned int pers_grants; - char protocol[64] = ""; struct pending_req *req, *n; int err, i, j; + struct xen_blkif *blkif = ring->blkif; + struct xenbus_device *dev = blkif->be->dev; + unsigned int ring_page_order, nr_grefs, evtchn; - pr_debug("%s %s\n", __func__, dev->otherend); - - err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u", + err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u", &evtchn); if (err != 1) { err = -EINVAL; - xenbus_dev_fatal(dev, err, "reading %s/event-channel", - dev->otherend); + xenbus_dev_fatal(dev, err, "reading %s/event-channel", dir); return err; } - pr_info("event-channel %u\n", evtchn); err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u", &ring_page_order); if (err != 1) { - err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref", - "%u", &ring_ref[0]); + err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", &ring_ref[0]); if (err != 1) { err = -EINVAL; - xenbus_dev_fatal(dev, err, "reading %s/ring-ref", - dev->otherend); + xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir); return err; } nr_grefs = 1; - pr_info("%s:using single page: ring-ref %d\n", dev->otherend, - ring_ref[0]); } else { unsigned int i; if (ring_page_order > xen_blkif_max_ring_order) { err = -EINVAL; xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d", - dev->otherend, ring_page_order, + dir, ring_page_order, xen_blkif_max_ring_order); return err; } @@ -878,52 +948,23 @@ static int connect_ring(struct backend_info *be) char ring_ref_name[RINGREF_NAME_LEN]; snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); - err = xenbus_scanf(XBT_NIL, dev->otherend, ring_ref_name, + err = xenbus_scanf(XBT_NIL, dir, ring_ref_name, "%u", &ring_ref[i]); if (err != 1) { err = -EINVAL; xenbus_dev_fatal(dev, err, "reading %s/%s", - dev->otherend, ring_ref_name); + dir, ring_ref_name); return err; } - pr_info("ring-ref%u: %u\n", i, ring_ref[i]); } } - - be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; - err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", - "%63s", protocol, NULL); - if (err) - strcpy(protocol, "unspecified, assuming default"); - else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; - else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; - else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; - else { - xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); - return -1; - } - err = xenbus_gather(XBT_NIL, dev->otherend, - "feature-persistent", "%u", - &pers_grants, NULL); - if (err) - pers_grants = 0; - - be->blkif->vbd.feature_gnt_persistent = pers_grants; - be->blkif->vbd.overflow_max_grants = 0; - be->blkif->nr_ring_pages = nr_grefs; - - pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n", - nr_grefs, evtchn, be->blkif->blk_protocol, protocol, - pers_grants ? "persistent grants" : ""); + blkif->nr_ring_pages = nr_grefs; for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) { req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) goto fail; - list_add_tail(&req->free_list, &be->blkif->pending_free); + list_add_tail(&req->free_list, &ring->pending_free); for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL); if (!req->segments[j]) @@ -938,7 +979,7 @@ static int connect_ring(struct backend_info *be) } /* Map the shared frame, irq etc. */ - err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn); + err = xen_blkif_map(ring, ring_ref, nr_grefs, evtchn); if (err) { xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn); return err; @@ -947,7 +988,7 @@ static int connect_ring(struct backend_info *be) return 0; fail: - list_for_each_entry_safe(req, n, &be->blkif->pending_free, free_list) { + list_for_each_entry_safe(req, n, &ring->pending_free, free_list) { list_del(&req->free_list); for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { if (!req->segments[j]) @@ -962,6 +1003,93 @@ fail: kfree(req); } return -ENOMEM; + +} + +static int connect_ring(struct backend_info *be) +{ + struct xenbus_device *dev = be->dev; + unsigned int pers_grants; + char protocol[64] = ""; + int err, i; + char *xspath; + size_t xspathsize; + const size_t xenstore_path_ext_size = 11; /* sufficient for "/queue-NNN" */ + unsigned int requested_num_queues = 0; + + pr_debug("%s %s\n", __func__, dev->otherend); + + be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; + err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", + "%63s", protocol, NULL); + if (err) + strcpy(protocol, "unspecified, assuming default"); + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; + else { + xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); + return -ENOSYS; + } + err = xenbus_gather(XBT_NIL, dev->otherend, + "feature-persistent", "%u", + &pers_grants, NULL); + if (err) + pers_grants = 0; + + be->blkif->vbd.feature_gnt_persistent = pers_grants; + be->blkif->vbd.overflow_max_grants = 0; + + /* + * Read the number of hardware queues from frontend. + */ + err = xenbus_scanf(XBT_NIL, dev->otherend, "multi-queue-num-queues", + "%u", &requested_num_queues); + if (err < 0) { + requested_num_queues = 1; + } else { + if (requested_num_queues > xenblk_max_queues + || requested_num_queues == 0) { + /* Buggy or malicious guest. */ + xenbus_dev_fatal(dev, err, + "guest requested %u queues, exceeding the maximum of %u.", + requested_num_queues, xenblk_max_queues); + return -ENOSYS; + } + } + be->blkif->nr_rings = requested_num_queues; + if (xen_blkif_alloc_rings(be->blkif)) + return -ENOMEM; + + pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename, + be->blkif->nr_rings, be->blkif->blk_protocol, protocol, + pers_grants ? "persistent grants" : ""); + + if (be->blkif->nr_rings == 1) + return read_per_ring_refs(&be->blkif->rings[0], dev->otherend); + else { + xspathsize = strlen(dev->otherend) + xenstore_path_ext_size; + xspath = kmalloc(xspathsize, GFP_KERNEL); + if (!xspath) { + xenbus_dev_fatal(dev, -ENOMEM, "reading ring references"); + return -ENOMEM; + } + + for (i = 0; i < be->blkif->nr_rings; i++) { + memset(xspath, 0, xspathsize); + snprintf(xspath, xspathsize, "%s/queue-%u", dev->otherend, i); + err = read_per_ring_refs(&be->blkif->rings[i], xspath); + if (err) { + kfree(xspath); + return err; + } + } + kfree(xspath); + } + return 0; } static const struct xenbus_device_id xen_blkbk_ids[] = { diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 2fee2ee..8a8dc91 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -60,6 +60,20 @@ #include <asm/xen/hypervisor.h> +/* + * The minimal size of segment supported by the block framework is PAGE_SIZE. + * When Linux is using a different page size than Xen, it may not be possible + * to put all the data in a single segment. + * This can happen when the backend doesn't support indirect descriptor and + * therefore the maximum amount of data that a request can carry is + * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB + * + * Note that we only support one extra request. So the Linux page size + * should be <= ( 2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) = + * 88KB. + */ +#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE) + enum blkif_state { BLKIF_STATE_DISCONNECTED, BLKIF_STATE_CONNECTED, @@ -72,6 +86,13 @@ struct grant { struct list_head node; }; +enum blk_req_status { + REQ_WAITING, + REQ_DONE, + REQ_ERROR, + REQ_EOPNOTSUPP, +}; + struct blk_shadow { struct blkif_request req; struct request *request; @@ -79,6 +100,14 @@ struct blk_shadow { struct grant **indirect_grants; struct scatterlist *sg; unsigned int num_sg; + enum blk_req_status status; + + #define NO_ASSOCIATED_ID ~0UL + /* + * Id of the sibling if we ever need 2 requests when handling a + * block I/O request + */ + unsigned long associated_id; }; struct split_bio { @@ -99,6 +128,10 @@ static unsigned int xen_blkif_max_segments = 32; module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)"); +static unsigned int xen_blkif_max_queues = 4; +module_param_named(max_queues, xen_blkif_max_queues, uint, S_IRUGO); +MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk"); + /* * Maximum order of pages to be used for the shared ring between front and * backend, 4KB page granularity is used. @@ -114,10 +147,35 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS) /* - * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 - * characters are enough. Define to 20 to keep consist with backend. + * ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 + * characters are enough. Define to 20 to keep consistent with backend. */ #define RINGREF_NAME_LEN (20) +/* + * queue-%u would take 7 + 10(UINT_MAX) = 17 characters. + */ +#define QUEUE_NAME_LEN (17) + +/* + * Per-ring info. + * Every blkfront device can associate with one or more blkfront_ring_info, + * depending on how many hardware queues/rings to be used. + */ +struct blkfront_ring_info { + /* Lock to protect data in every ring buffer. */ + spinlock_t ring_lock; + struct blkif_front_ring ring; + unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; + unsigned int evtchn, irq; + struct work_struct work; + struct gnttab_free_callback callback; + struct blk_shadow shadow[BLK_MAX_RING_SIZE]; + struct list_head indirect_pages; + struct list_head grants; + unsigned int persistent_gnts_c; + unsigned long shadow_free; + struct blkfront_info *dev_info; +}; /* * We have one of these per vbd, whether ide, scsi or 'other'. They @@ -126,25 +184,15 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the */ struct blkfront_info { - spinlock_t io_lock; struct mutex mutex; struct xenbus_device *xbdev; struct gendisk *gd; int vdevice; blkif_vdev_t handle; enum blkif_state connected; - int ring_ref[XENBUS_MAX_RING_GRANTS]; + /* Number of pages per ring buffer. */ unsigned int nr_ring_pages; - struct blkif_front_ring ring; - unsigned int evtchn, irq; struct request_queue *rq; - struct work_struct work; - struct gnttab_free_callback callback; - struct blk_shadow shadow[BLK_MAX_RING_SIZE]; - struct list_head grants; - struct list_head indirect_pages; - unsigned int persistent_gnts_c; - unsigned long shadow_free; unsigned int feature_flush; unsigned int feature_discard:1; unsigned int feature_secdiscard:1; @@ -155,6 +203,8 @@ struct blkfront_info unsigned int max_indirect_segments; int is_ready; struct blk_mq_tag_set tag_set; + struct blkfront_ring_info *rinfo; + unsigned int nr_rings; }; static unsigned int nr_minors; @@ -198,38 +248,40 @@ static DEFINE_SPINLOCK(minor_lock); #define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG) -static int blkfront_setup_indirect(struct blkfront_info *info); -static int blkfront_gather_backend_features(struct blkfront_info *info); +static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo); +static void blkfront_gather_backend_features(struct blkfront_info *info); -static int get_id_from_freelist(struct blkfront_info *info) +static int get_id_from_freelist(struct blkfront_ring_info *rinfo) { - unsigned long free = info->shadow_free; - BUG_ON(free >= BLK_RING_SIZE(info)); - info->shadow_free = info->shadow[free].req.u.rw.id; - info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ + unsigned long free = rinfo->shadow_free; + + BUG_ON(free >= BLK_RING_SIZE(rinfo->dev_info)); + rinfo->shadow_free = rinfo->shadow[free].req.u.rw.id; + rinfo->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ return free; } -static int add_id_to_freelist(struct blkfront_info *info, - unsigned long id) +static int add_id_to_freelist(struct blkfront_ring_info *rinfo, + unsigned long id) { - if (info->shadow[id].req.u.rw.id != id) + if (rinfo->shadow[id].req.u.rw.id != id) return -EINVAL; - if (info->shadow[id].request == NULL) + if (rinfo->shadow[id].request == NULL) return -EINVAL; - info->shadow[id].req.u.rw.id = info->shadow_free; - info->shadow[id].request = NULL; - info->shadow_free = id; + rinfo->shadow[id].req.u.rw.id = rinfo->shadow_free; + rinfo->shadow[id].request = NULL; + rinfo->shadow_free = id; return 0; } -static int fill_grant_buffer(struct blkfront_info *info, int num) +static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) { + struct blkfront_info *info = rinfo->dev_info; struct page *granted_page; struct grant *gnt_list_entry, *n; int i = 0; - while(i < num) { + while (i < num) { gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO); if (!gnt_list_entry) goto out_of_memory; @@ -244,7 +296,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num) } gnt_list_entry->gref = GRANT_INVALID_REF; - list_add(&gnt_list_entry->node, &info->grants); + list_add(&gnt_list_entry->node, &rinfo->grants); i++; } @@ -252,7 +304,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num) out_of_memory: list_for_each_entry_safe(gnt_list_entry, n, - &info->grants, node) { + &rinfo->grants, node) { list_del(&gnt_list_entry->node); if (info->feature_persistent) __free_page(gnt_list_entry->page); @@ -263,17 +315,17 @@ out_of_memory: return -ENOMEM; } -static struct grant *get_free_grant(struct blkfront_info *info) +static struct grant *get_free_grant(struct blkfront_ring_info *rinfo) { struct grant *gnt_list_entry; - BUG_ON(list_empty(&info->grants)); - gnt_list_entry = list_first_entry(&info->grants, struct grant, + BUG_ON(list_empty(&rinfo->grants)); + gnt_list_entry = list_first_entry(&rinfo->grants, struct grant, node); list_del(&gnt_list_entry->node); if (gnt_list_entry->gref != GRANT_INVALID_REF) - info->persistent_gnts_c--; + rinfo->persistent_gnts_c--; return gnt_list_entry; } @@ -289,9 +341,10 @@ static inline void grant_foreign_access(const struct grant *gnt_list_entry, static struct grant *get_grant(grant_ref_t *gref_head, unsigned long gfn, - struct blkfront_info *info) + struct blkfront_ring_info *rinfo) { - struct grant *gnt_list_entry = get_free_grant(info); + struct grant *gnt_list_entry = get_free_grant(rinfo); + struct blkfront_info *info = rinfo->dev_info; if (gnt_list_entry->gref != GRANT_INVALID_REF) return gnt_list_entry; @@ -312,9 +365,10 @@ static struct grant *get_grant(grant_ref_t *gref_head, } static struct grant *get_indirect_grant(grant_ref_t *gref_head, - struct blkfront_info *info) + struct blkfront_ring_info *rinfo) { - struct grant *gnt_list_entry = get_free_grant(info); + struct grant *gnt_list_entry = get_free_grant(rinfo); + struct blkfront_info *info = rinfo->dev_info; if (gnt_list_entry->gref != GRANT_INVALID_REF) return gnt_list_entry; @@ -326,8 +380,8 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head, struct page *indirect_page; /* Fetch a pre-allocated page to use for indirect grefs */ - BUG_ON(list_empty(&info->indirect_pages)); - indirect_page = list_first_entry(&info->indirect_pages, + BUG_ON(list_empty(&rinfo->indirect_pages)); + indirect_page = list_first_entry(&rinfo->indirect_pages, struct page, lru); list_del(&indirect_page->lru); gnt_list_entry->page = indirect_page; @@ -403,8 +457,8 @@ static void xlbd_release_minors(unsigned int minor, unsigned int nr) static void blkif_restart_queue_callback(void *arg) { - struct blkfront_info *info = (struct blkfront_info *)arg; - schedule_work(&info->work); + struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)arg; + schedule_work(&rinfo->work); } static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) @@ -456,16 +510,33 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode, return 0; } -static int blkif_queue_discard_req(struct request *req) +static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo, + struct request *req, + struct blkif_request **ring_req) { - struct blkfront_info *info = req->rq_disk->private_data; + unsigned long id; + + *ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt); + rinfo->ring.req_prod_pvt++; + + id = get_id_from_freelist(rinfo); + rinfo->shadow[id].request = req; + rinfo->shadow[id].status = REQ_WAITING; + rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID; + + (*ring_req)->u.rw.id = id; + + return id; +} + +static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo) +{ + struct blkfront_info *info = rinfo->dev_info; struct blkif_request *ring_req; unsigned long id; /* Fill out a communications ring structure. */ - ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); - id = get_id_from_freelist(info); - info->shadow[id].request = req; + id = blkif_ring_get_request(rinfo, req, &ring_req); ring_req->operation = BLKIF_OP_DISCARD; ring_req->u.discard.nr_sectors = blk_rq_sectors(req); @@ -476,10 +547,8 @@ static int blkif_queue_discard_req(struct request *req) else ring_req->u.discard.flag = 0; - info->ring.req_prod_pvt++; - /* Keep a private copy so we can reissue requests when recovering. */ - info->shadow[id].req = *ring_req; + rinfo->shadow[id].req = *ring_req; return 0; } @@ -487,7 +556,7 @@ static int blkif_queue_discard_req(struct request *req) struct setup_rw_req { unsigned int grant_idx; struct blkif_request_segment *segments; - struct blkfront_info *info; + struct blkfront_ring_info *rinfo; struct blkif_request *ring_req; grant_ref_t gref_head; unsigned int id; @@ -495,6 +564,9 @@ struct setup_rw_req { bool need_copy; unsigned int bvec_off; char *bvec_data; + + bool require_extra_req; + struct blkif_request *extra_ring_req; }; static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, @@ -507,8 +579,24 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, /* Convenient aliases */ unsigned int grant_idx = setup->grant_idx; struct blkif_request *ring_req = setup->ring_req; - struct blkfront_info *info = setup->info; - struct blk_shadow *shadow = &info->shadow[setup->id]; + struct blkfront_ring_info *rinfo = setup->rinfo; + /* + * We always use the shadow of the first request to store the list + * of grant associated to the block I/O request. This made the + * completion more easy to handle even if the block I/O request is + * split. + */ + struct blk_shadow *shadow = &rinfo->shadow[setup->id]; + + if (unlikely(setup->require_extra_req && + grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) { + /* + * We are using the second request, setup grant_idx + * to be the index of the segment array. + */ + grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST; + ring_req = setup->extra_ring_req; + } if ((ring_req->operation == BLKIF_OP_INDIRECT) && (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) { @@ -516,15 +604,19 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, kunmap_atomic(setup->segments); n = grant_idx / GRANTS_PER_INDIRECT_FRAME; - gnt_list_entry = get_indirect_grant(&setup->gref_head, info); + gnt_list_entry = get_indirect_grant(&setup->gref_head, rinfo); shadow->indirect_grants[n] = gnt_list_entry; setup->segments = kmap_atomic(gnt_list_entry->page); ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; } - gnt_list_entry = get_grant(&setup->gref_head, gfn, info); + gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo); ref = gnt_list_entry->gref; - shadow->grants_used[grant_idx] = gnt_list_entry; + /* + * All the grants are stored in the shadow of the first + * request. Therefore we have to use the global index. + */ + shadow->grants_used[setup->grant_idx] = gnt_list_entry; if (setup->need_copy) { void *shared_data; @@ -566,16 +658,36 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, (setup->grant_idx)++; } -static int blkif_queue_rw_req(struct request *req) +static void blkif_setup_extra_req(struct blkif_request *first, + struct blkif_request *second) { - struct blkfront_info *info = req->rq_disk->private_data; - struct blkif_request *ring_req; - unsigned long id; + uint16_t nr_segments = first->u.rw.nr_segments; + + /* + * The second request is only present when the first request uses + * all its segments. It's always the continuity of the first one. + */ + first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; + + second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST; + second->u.rw.sector_number = first->u.rw.sector_number + + (BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512; + + second->u.rw.handle = first->u.rw.handle; + second->operation = first->operation; +} + +static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo) +{ + struct blkfront_info *info = rinfo->dev_info; + struct blkif_request *ring_req, *extra_ring_req = NULL; + unsigned long id, extra_id = NO_ASSOCIATED_ID; + bool require_extra_req = false; int i; struct setup_rw_req setup = { .grant_idx = 0, .segments = NULL, - .info = info, + .rinfo = rinfo, .need_copy = rq_data_dir(req) && info->feature_persistent, }; @@ -584,7 +696,6 @@ static int blkif_queue_rw_req(struct request *req) * existing persistent grants, or if we have to get new grants, * as there are not sufficiently many free. */ - bool new_persistent_gnts; struct scatterlist *sg; int num_sg, max_grefs, num_grant; @@ -596,41 +707,36 @@ static int blkif_queue_rw_req(struct request *req) */ max_grefs += INDIRECT_GREFS(max_grefs); - /* Check if we have enough grants to allocate a requests */ - if (info->persistent_gnts_c < max_grefs) { - new_persistent_gnts = 1; - if (gnttab_alloc_grant_references( - max_grefs - info->persistent_gnts_c, - &setup.gref_head) < 0) { + /* + * We have to reserve 'max_grefs' grants because persistent + * grants are shared by all rings. + */ + if (max_grefs > 0) + if (gnttab_alloc_grant_references(max_grefs, &setup.gref_head) < 0) { gnttab_request_free_callback( - &info->callback, + &rinfo->callback, blkif_restart_queue_callback, - info, + rinfo, max_grefs); return 1; } - } else - new_persistent_gnts = 0; /* Fill out a communications ring structure. */ - ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); - id = get_id_from_freelist(info); - info->shadow[id].request = req; - - BUG_ON(info->max_indirect_segments == 0 && - GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST); - BUG_ON(info->max_indirect_segments && - GREFS(req->nr_phys_segments) > info->max_indirect_segments); + id = blkif_ring_get_request(rinfo, req, &ring_req); - num_sg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); + num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg); num_grant = 0; /* Calculate the number of grant used */ - for_each_sg(info->shadow[id].sg, sg, num_sg, i) + for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) num_grant += gnttab_count_grant(sg->offset, sg->length); - ring_req->u.rw.id = id; - info->shadow[id].num_sg = num_sg; - if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) { + require_extra_req = info->max_indirect_segments == 0 && + num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST; + BUG_ON(!HAS_EXTRA_REQ && require_extra_req); + + rinfo->shadow[id].num_sg = num_sg; + if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST && + likely(!require_extra_req)) { /* * The indirect operation can only be a BLKIF_OP_READ or * BLKIF_OP_WRITE @@ -670,11 +776,31 @@ static int blkif_queue_rw_req(struct request *req) } } ring_req->u.rw.nr_segments = num_grant; + if (unlikely(require_extra_req)) { + extra_id = blkif_ring_get_request(rinfo, req, + &extra_ring_req); + /* + * Only the first request contains the scatter-gather + * list. + */ + rinfo->shadow[extra_id].num_sg = 0; + + blkif_setup_extra_req(ring_req, extra_ring_req); + + /* Link the 2 requests together */ + rinfo->shadow[extra_id].associated_id = id; + rinfo->shadow[id].associated_id = extra_id; + } } setup.ring_req = ring_req; setup.id = id; - for_each_sg(info->shadow[id].sg, sg, num_sg, i) { + + setup.require_extra_req = require_extra_req; + if (unlikely(require_extra_req)) + setup.extra_ring_req = extra_ring_req; + + for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) { BUG_ON(sg->offset + sg->length > PAGE_SIZE); if (setup.need_copy) { @@ -694,12 +820,12 @@ static int blkif_queue_rw_req(struct request *req) if (setup.segments) kunmap_atomic(setup.segments); - info->ring.req_prod_pvt++; - /* Keep a private copy so we can reissue requests when recovering. */ - info->shadow[id].req = *ring_req; + rinfo->shadow[id].req = *ring_req; + if (unlikely(require_extra_req)) + rinfo->shadow[extra_id].req = *extra_ring_req; - if (new_persistent_gnts) + if (max_grefs > 0) gnttab_free_grant_references(setup.gref_head); return 0; @@ -711,27 +837,25 @@ static int blkif_queue_rw_req(struct request *req) * * @req: a request struct */ -static int blkif_queue_request(struct request *req) +static int blkif_queue_request(struct request *req, struct blkfront_ring_info *rinfo) { - struct blkfront_info *info = req->rq_disk->private_data; - - if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) + if (unlikely(rinfo->dev_info->connected != BLKIF_STATE_CONNECTED)) return 1; if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) - return blkif_queue_discard_req(req); + return blkif_queue_discard_req(req, rinfo); else - return blkif_queue_rw_req(req); + return blkif_queue_rw_req(req, rinfo); } -static inline void flush_requests(struct blkfront_info *info) +static inline void flush_requests(struct blkfront_ring_info *rinfo) { int notify; - RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify); + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rinfo->ring, notify); if (notify) - notify_remote_via_irq(info->irq); + notify_remote_via_irq(rinfo->irq); } static inline bool blkif_request_flush_invalid(struct request *req, @@ -745,38 +869,50 @@ static inline bool blkif_request_flush_invalid(struct request *req, } static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *qd) + const struct blk_mq_queue_data *qd) { - struct blkfront_info *info = qd->rq->rq_disk->private_data; + unsigned long flags; + struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data; blk_mq_start_request(qd->rq); - spin_lock_irq(&info->io_lock); - if (RING_FULL(&info->ring)) + spin_lock_irqsave(&rinfo->ring_lock, flags); + if (RING_FULL(&rinfo->ring)) goto out_busy; - if (blkif_request_flush_invalid(qd->rq, info)) + if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info)) goto out_err; - if (blkif_queue_request(qd->rq)) + if (blkif_queue_request(qd->rq, rinfo)) goto out_busy; - flush_requests(info); - spin_unlock_irq(&info->io_lock); + flush_requests(rinfo); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_MQ_RQ_QUEUE_OK; out_err: - spin_unlock_irq(&info->io_lock); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_MQ_RQ_QUEUE_ERROR; out_busy: - spin_unlock_irq(&info->io_lock); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); blk_mq_stop_hw_queue(hctx); return BLK_MQ_RQ_QUEUE_BUSY; } +static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int index) +{ + struct blkfront_info *info = (struct blkfront_info *)data; + + BUG_ON(info->nr_rings <= index); + hctx->driver_data = &info->rinfo[index]; + return 0; +} + static struct blk_mq_ops blkfront_mq_ops = { .queue_rq = blkif_queue_rq, .map_queue = blk_mq_map_queue, + .init_hctx = blk_mq_init_hctx, }; static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, @@ -788,19 +924,28 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, memset(&info->tag_set, 0, sizeof(info->tag_set)); info->tag_set.ops = &blkfront_mq_ops; - info->tag_set.nr_hw_queues = 1; - info->tag_set.queue_depth = BLK_RING_SIZE(info); + info->tag_set.nr_hw_queues = info->nr_rings; + if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) { + /* + * When indirect descriptior is not supported, the I/O request + * will be split between multiple request in the ring. + * To avoid problems when sending the request, divide by + * 2 the depth of the queue. + */ + info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2; + } else + info->tag_set.queue_depth = BLK_RING_SIZE(info); info->tag_set.numa_node = NUMA_NO_NODE; info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; info->tag_set.cmd_size = 0; info->tag_set.driver_data = info; if (blk_mq_alloc_tag_set(&info->tag_set)) - return -1; + return -EINVAL; rq = blk_mq_init_queue(&info->tag_set); if (IS_ERR(rq)) { blk_mq_free_tag_set(&info->tag_set); - return -1; + return PTR_ERR(rq); } queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); @@ -1028,7 +1173,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, static void xlvbd_release_gendisk(struct blkfront_info *info) { - unsigned int minor, nr_minors; + unsigned int minor, nr_minors, i; if (info->rq == NULL) return; @@ -1036,11 +1181,15 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) /* No more blkif_request(). */ blk_mq_stop_hw_queues(info->rq); - /* No more gnttab callback work. */ - gnttab_cancel_free_callback(&info->callback); + for (i = 0; i < info->nr_rings; i++) { + struct blkfront_ring_info *rinfo = &info->rinfo[i]; - /* Flush gnttab callback work. Must be done with no locks held. */ - flush_work(&info->work); + /* No more gnttab callback work. */ + gnttab_cancel_free_callback(&rinfo->callback); + + /* Flush gnttab callback work. Must be done with no locks held. */ + flush_work(&rinfo->work); + } del_gendisk(info->gd); @@ -1056,88 +1205,87 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) info->gd = NULL; } -/* Must be called with io_lock holded */ -static void kick_pending_request_queues(struct blkfront_info *info) +/* Already hold rinfo->ring_lock. */ +static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo) { - if (!RING_FULL(&info->ring)) - blk_mq_start_stopped_hw_queues(info->rq, true); + if (!RING_FULL(&rinfo->ring)) + blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true); } -static void blkif_restart_queue(struct work_struct *work) +static void kick_pending_request_queues(struct blkfront_ring_info *rinfo) { - struct blkfront_info *info = container_of(work, struct blkfront_info, work); + unsigned long flags; - spin_lock_irq(&info->io_lock); - if (info->connected == BLKIF_STATE_CONNECTED) - kick_pending_request_queues(info); - spin_unlock_irq(&info->io_lock); + spin_lock_irqsave(&rinfo->ring_lock, flags); + kick_pending_request_queues_locked(rinfo); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); } -static void blkif_free(struct blkfront_info *info, int suspend) +static void blkif_restart_queue(struct work_struct *work) { - struct grant *persistent_gnt; - struct grant *n; - int i, j, segs; + struct blkfront_ring_info *rinfo = container_of(work, struct blkfront_ring_info, work); - /* Prevent new requests being issued until we fix things up. */ - spin_lock_irq(&info->io_lock); - info->connected = suspend ? - BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; - /* No more blkif_request(). */ - if (info->rq) - blk_mq_stop_hw_queues(info->rq); + if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED) + kick_pending_request_queues(rinfo); +} - /* Remove all persistent grants */ - if (!list_empty(&info->grants)) { - list_for_each_entry_safe(persistent_gnt, n, - &info->grants, node) { - list_del(&persistent_gnt->node); - if (persistent_gnt->gref != GRANT_INVALID_REF) { - gnttab_end_foreign_access(persistent_gnt->gref, - 0, 0UL); - info->persistent_gnts_c--; - } - if (info->feature_persistent) - __free_page(persistent_gnt->page); - kfree(persistent_gnt); - } - } - BUG_ON(info->persistent_gnts_c != 0); +static void blkif_free_ring(struct blkfront_ring_info *rinfo) +{ + struct grant *persistent_gnt, *n; + struct blkfront_info *info = rinfo->dev_info; + int i, j, segs; /* * Remove indirect pages, this only happens when using indirect * descriptors but not persistent grants */ - if (!list_empty(&info->indirect_pages)) { + if (!list_empty(&rinfo->indirect_pages)) { struct page *indirect_page, *n; BUG_ON(info->feature_persistent); - list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { + list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) { list_del(&indirect_page->lru); __free_page(indirect_page); } } + /* Remove all persistent grants. */ + if (!list_empty(&rinfo->grants)) { + list_for_each_entry_safe(persistent_gnt, n, + &rinfo->grants, node) { + list_del(&persistent_gnt->node); + if (persistent_gnt->gref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(persistent_gnt->gref, + 0, 0UL); + rinfo->persistent_gnts_c--; + } + if (info->feature_persistent) + __free_page(persistent_gnt->page); + kfree(persistent_gnt); + } + } + BUG_ON(rinfo->persistent_gnts_c != 0); + for (i = 0; i < BLK_RING_SIZE(info); i++) { /* * Clear persistent grants present in requests already * on the shared ring */ - if (!info->shadow[i].request) + if (!rinfo->shadow[i].request) goto free_shadow; - segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ? - info->shadow[i].req.u.indirect.nr_segments : - info->shadow[i].req.u.rw.nr_segments; + segs = rinfo->shadow[i].req.operation == BLKIF_OP_INDIRECT ? + rinfo->shadow[i].req.u.indirect.nr_segments : + rinfo->shadow[i].req.u.rw.nr_segments; for (j = 0; j < segs; j++) { - persistent_gnt = info->shadow[i].grants_used[j]; + persistent_gnt = rinfo->shadow[i].grants_used[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); if (info->feature_persistent) __free_page(persistent_gnt->page); kfree(persistent_gnt); } - if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT) + if (rinfo->shadow[i].req.operation != BLKIF_OP_INDIRECT) /* * If this is not an indirect operation don't try to * free indirect segments @@ -1145,42 +1293,59 @@ static void blkif_free(struct blkfront_info *info, int suspend) goto free_shadow; for (j = 0; j < INDIRECT_GREFS(segs); j++) { - persistent_gnt = info->shadow[i].indirect_grants[j]; + persistent_gnt = rinfo->shadow[i].indirect_grants[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); __free_page(persistent_gnt->page); kfree(persistent_gnt); } free_shadow: - kfree(info->shadow[i].grants_used); - info->shadow[i].grants_used = NULL; - kfree(info->shadow[i].indirect_grants); - info->shadow[i].indirect_grants = NULL; - kfree(info->shadow[i].sg); - info->shadow[i].sg = NULL; + kfree(rinfo->shadow[i].grants_used); + rinfo->shadow[i].grants_used = NULL; + kfree(rinfo->shadow[i].indirect_grants); + rinfo->shadow[i].indirect_grants = NULL; + kfree(rinfo->shadow[i].sg); + rinfo->shadow[i].sg = NULL; } /* No more gnttab callback work. */ - gnttab_cancel_free_callback(&info->callback); - spin_unlock_irq(&info->io_lock); + gnttab_cancel_free_callback(&rinfo->callback); /* Flush gnttab callback work. Must be done with no locks held. */ - flush_work(&info->work); + flush_work(&rinfo->work); /* Free resources associated with old device channel. */ for (i = 0; i < info->nr_ring_pages; i++) { - if (info->ring_ref[i] != GRANT_INVALID_REF) { - gnttab_end_foreign_access(info->ring_ref[i], 0, 0); - info->ring_ref[i] = GRANT_INVALID_REF; + if (rinfo->ring_ref[i] != GRANT_INVALID_REF) { + gnttab_end_foreign_access(rinfo->ring_ref[i], 0, 0); + rinfo->ring_ref[i] = GRANT_INVALID_REF; } } - free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE)); - info->ring.sring = NULL; + free_pages((unsigned long)rinfo->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE)); + rinfo->ring.sring = NULL; - if (info->irq) - unbind_from_irqhandler(info->irq, info); - info->evtchn = info->irq = 0; + if (rinfo->irq) + unbind_from_irqhandler(rinfo->irq, rinfo); + rinfo->evtchn = rinfo->irq = 0; +} +static void blkif_free(struct blkfront_info *info, int suspend) +{ + unsigned int i; + + /* Prevent new requests being issued until we fix things up. */ + info->connected = suspend ? + BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; + /* No more blkif_request(). */ + if (info->rq) + blk_mq_stop_hw_queues(info->rq); + + for (i = 0; i < info->nr_rings; i++) + blkif_free_ring(&info->rinfo[i]); + + kfree(info->rinfo); + info->rinfo = NULL; + info->nr_rings = 0; } struct copy_from_grant { @@ -1209,19 +1374,93 @@ static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset, kunmap_atomic(shared_data); } -static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, +static enum blk_req_status blkif_rsp_to_req_status(int rsp) +{ + switch (rsp) + { + case BLKIF_RSP_OKAY: + return REQ_DONE; + case BLKIF_RSP_EOPNOTSUPP: + return REQ_EOPNOTSUPP; + case BLKIF_RSP_ERROR: + /* Fallthrough. */ + default: + return REQ_ERROR; + } +} + +/* + * Get the final status of the block request based on two ring response + */ +static int blkif_get_final_status(enum blk_req_status s1, + enum blk_req_status s2) +{ + BUG_ON(s1 == REQ_WAITING); + BUG_ON(s2 == REQ_WAITING); + + if (s1 == REQ_ERROR || s2 == REQ_ERROR) + return BLKIF_RSP_ERROR; + else if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP) + return BLKIF_RSP_EOPNOTSUPP; + return BLKIF_RSP_OKAY; +} + +static bool blkif_completion(unsigned long *id, + struct blkfront_ring_info *rinfo, struct blkif_response *bret) { int i = 0; struct scatterlist *sg; int num_sg, num_grant; + struct blkfront_info *info = rinfo->dev_info; + struct blk_shadow *s = &rinfo->shadow[*id]; struct copy_from_grant data = { - .s = s, .grant_idx = 0, }; num_grant = s->req.operation == BLKIF_OP_INDIRECT ? s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; + + /* The I/O request may be split in two. */ + if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) { + struct blk_shadow *s2 = &rinfo->shadow[s->associated_id]; + + /* Keep the status of the current response in shadow. */ + s->status = blkif_rsp_to_req_status(bret->status); + + /* Wait the second response if not yet here. */ + if (s2->status == REQ_WAITING) + return 0; + + bret->status = blkif_get_final_status(s->status, + s2->status); + + /* + * All the grants is stored in the first shadow in order + * to make the completion code simpler. + */ + num_grant += s2->req.u.rw.nr_segments; + + /* + * The two responses may not come in order. Only the + * first request will store the scatter-gather list. + */ + if (s2->num_sg != 0) { + /* Update "id" with the ID of the first response. */ + *id = s->associated_id; + s = s2; + } + + /* + * We don't need anymore the second request, so recycling + * it now. + */ + if (add_id_to_freelist(rinfo, s->associated_id)) + WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n", + info->gd->disk_name, s->associated_id); + } + + data.s = s; num_sg = s->num_sg; if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { @@ -1252,8 +1491,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, if (!info->feature_persistent) pr_alert_ratelimited("backed has not unmapped grant: %u\n", s->grants_used[i]->gref); - list_add(&s->grants_used[i]->node, &info->grants); - info->persistent_gnts_c++; + list_add(&s->grants_used[i]->node, &rinfo->grants); + rinfo->persistent_gnts_c++; } else { /* * If the grant is not mapped by the backend we end the @@ -1263,7 +1502,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, */ gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); s->grants_used[i]->gref = GRANT_INVALID_REF; - list_add_tail(&s->grants_used[i]->node, &info->grants); + list_add_tail(&s->grants_used[i]->node, &rinfo->grants); } } if (s->req.operation == BLKIF_OP_INDIRECT) { @@ -1272,8 +1511,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, if (!info->feature_persistent) pr_alert_ratelimited("backed has not unmapped grant: %u\n", s->indirect_grants[i]->gref); - list_add(&s->indirect_grants[i]->node, &info->grants); - info->persistent_gnts_c++; + list_add(&s->indirect_grants[i]->node, &rinfo->grants); + rinfo->persistent_gnts_c++; } else { struct page *indirect_page; @@ -1284,13 +1523,15 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, */ if (!info->feature_persistent) { indirect_page = s->indirect_grants[i]->page; - list_add(&indirect_page->lru, &info->indirect_pages); + list_add(&indirect_page->lru, &rinfo->indirect_pages); } s->indirect_grants[i]->gref = GRANT_INVALID_REF; - list_add_tail(&s->indirect_grants[i]->node, &info->grants); + list_add_tail(&s->indirect_grants[i]->node, &rinfo->grants); } } } + + return 1; } static irqreturn_t blkif_interrupt(int irq, void *dev_id) @@ -1299,24 +1540,22 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) struct blkif_response *bret; RING_IDX i, rp; unsigned long flags; - struct blkfront_info *info = (struct blkfront_info *)dev_id; + struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id; + struct blkfront_info *info = rinfo->dev_info; int error; - spin_lock_irqsave(&info->io_lock, flags); - - if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { - spin_unlock_irqrestore(&info->io_lock, flags); + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) return IRQ_HANDLED; - } + spin_lock_irqsave(&rinfo->ring_lock, flags); again: - rp = info->ring.sring->rsp_prod; + rp = rinfo->ring.sring->rsp_prod; rmb(); /* Ensure we see queued responses up to 'rp'. */ - for (i = info->ring.rsp_cons; i != rp; i++) { + for (i = rinfo->ring.rsp_cons; i != rp; i++) { unsigned long id; - bret = RING_GET_RESPONSE(&info->ring, i); + bret = RING_GET_RESPONSE(&rinfo->ring, i); id = bret->id; /* * The backend has messed up and given us an id that we would @@ -1330,12 +1569,18 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) * the id is busted. */ continue; } - req = info->shadow[id].request; + req = rinfo->shadow[id].request; - if (bret->operation != BLKIF_OP_DISCARD) - blkif_completion(&info->shadow[id], info, bret); + if (bret->operation != BLKIF_OP_DISCARD) { + /* + * We may need to wait for an extra response if the + * I/O request is split in 2 + */ + if (!blkif_completion(&id, rinfo, bret)) + continue; + } - if (add_id_to_freelist(info, id)) { + if (add_id_to_freelist(rinfo, id)) { WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", info->gd->disk_name, op_name(bret->operation), id); continue; @@ -1364,7 +1609,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) error = -EOPNOTSUPP; } if (unlikely(bret->status == BLKIF_RSP_ERROR && - info->shadow[id].req.u.rw.nr_segments == 0)) { + rinfo->shadow[id].req.u.rw.nr_segments == 0)) { printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", info->gd->disk_name, op_name(bret->operation)); error = -EOPNOTSUPP; @@ -1389,34 +1634,35 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) } } - info->ring.rsp_cons = i; + rinfo->ring.rsp_cons = i; - if (i != info->ring.req_prod_pvt) { + if (i != rinfo->ring.req_prod_pvt) { int more_to_do; - RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); + RING_FINAL_CHECK_FOR_RESPONSES(&rinfo->ring, more_to_do); if (more_to_do) goto again; } else - info->ring.sring->rsp_event = i + 1; + rinfo->ring.sring->rsp_event = i + 1; - kick_pending_request_queues(info); + kick_pending_request_queues_locked(rinfo); - spin_unlock_irqrestore(&info->io_lock, flags); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); return IRQ_HANDLED; } static int setup_blkring(struct xenbus_device *dev, - struct blkfront_info *info) + struct blkfront_ring_info *rinfo) { struct blkif_sring *sring; int err, i; + struct blkfront_info *info = rinfo->dev_info; unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE; grant_ref_t gref[XENBUS_MAX_RING_GRANTS]; for (i = 0; i < info->nr_ring_pages; i++) - info->ring_ref[i] = GRANT_INVALID_REF; + rinfo->ring_ref[i] = GRANT_INVALID_REF; sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, get_order(ring_size)); @@ -1425,29 +1671,29 @@ static int setup_blkring(struct xenbus_device *dev, return -ENOMEM; } SHARED_RING_INIT(sring); - FRONT_RING_INIT(&info->ring, sring, ring_size); + FRONT_RING_INIT(&rinfo->ring, sring, ring_size); - err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref); + err = xenbus_grant_ring(dev, rinfo->ring.sring, info->nr_ring_pages, gref); if (err < 0) { free_pages((unsigned long)sring, get_order(ring_size)); - info->ring.sring = NULL; + rinfo->ring.sring = NULL; goto fail; } for (i = 0; i < info->nr_ring_pages; i++) - info->ring_ref[i] = gref[i]; + rinfo->ring_ref[i] = gref[i]; - err = xenbus_alloc_evtchn(dev, &info->evtchn); + err = xenbus_alloc_evtchn(dev, &rinfo->evtchn); if (err) goto fail; - err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0, - "blkif", info); + err = bind_evtchn_to_irqhandler(rinfo->evtchn, blkif_interrupt, 0, + "blkif", rinfo); if (err <= 0) { xenbus_dev_fatal(dev, err, "bind_evtchn_to_irqhandler failed"); goto fail; } - info->irq = err; + rinfo->irq = err; return 0; fail: @@ -1455,6 +1701,53 @@ fail: return err; } +/* + * Write out per-ring/queue nodes including ring-ref and event-channel, and each + * ring buffer may have multi pages depending on ->nr_ring_pages. + */ +static int write_per_ring_nodes(struct xenbus_transaction xbt, + struct blkfront_ring_info *rinfo, const char *dir) +{ + int err; + unsigned int i; + const char *message = NULL; + struct blkfront_info *info = rinfo->dev_info; + + if (info->nr_ring_pages == 1) { + err = xenbus_printf(xbt, dir, "ring-ref", "%u", rinfo->ring_ref[0]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } else { + for (i = 0; i < info->nr_ring_pages; i++) { + char ring_ref_name[RINGREF_NAME_LEN]; + + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); + err = xenbus_printf(xbt, dir, ring_ref_name, + "%u", rinfo->ring_ref[i]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } + } + + err = xenbus_printf(xbt, dir, "event-channel", "%u", rinfo->evtchn); + if (err) { + message = "writing event-channel"; + goto abort_transaction; + } + + return 0; + +abort_transaction: + xenbus_transaction_end(xbt, 1); + if (message) + xenbus_dev_fatal(info->xbdev, err, "%s", message); + + return err; +} /* Common code used when first setting up, and when resuming. */ static int talk_to_blkback(struct xenbus_device *dev, @@ -1462,8 +1755,8 @@ static int talk_to_blkback(struct xenbus_device *dev, { const char *message = NULL; struct xenbus_transaction xbt; - int err, i; - unsigned int max_page_order = 0; + int err; + unsigned int i, max_page_order = 0; unsigned int ring_page_order = 0; err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, @@ -1475,10 +1768,14 @@ static int talk_to_blkback(struct xenbus_device *dev, info->nr_ring_pages = 1 << ring_page_order; } - /* Create shared ring, alloc event channel. */ - err = setup_blkring(dev, info); - if (err) - goto out; + for (i = 0; i < info->nr_rings; i++) { + struct blkfront_ring_info *rinfo = &info->rinfo[i]; + + /* Create shared ring, alloc event channel. */ + err = setup_blkring(dev, rinfo); + if (err) + goto destroy_blkring; + } again: err = xenbus_transaction_start(&xbt); @@ -1487,38 +1784,49 @@ again: goto destroy_blkring; } - if (info->nr_ring_pages == 1) { - err = xenbus_printf(xbt, dev->nodename, - "ring-ref", "%u", info->ring_ref[0]); + if (info->nr_ring_pages > 1) { + err = xenbus_printf(xbt, dev->nodename, "ring-page-order", "%u", + ring_page_order); if (err) { - message = "writing ring-ref"; + message = "writing ring-page-order"; goto abort_transaction; } + } + + /* We already got the number of queues/rings in _probe */ + if (info->nr_rings == 1) { + err = write_per_ring_nodes(xbt, &info->rinfo[0], dev->nodename); + if (err) + goto destroy_blkring; } else { - err = xenbus_printf(xbt, dev->nodename, - "ring-page-order", "%u", ring_page_order); + char *path; + size_t pathsize; + + err = xenbus_printf(xbt, dev->nodename, "multi-queue-num-queues", "%u", + info->nr_rings); if (err) { - message = "writing ring-page-order"; + message = "writing multi-queue-num-queues"; goto abort_transaction; } - for (i = 0; i < info->nr_ring_pages; i++) { - char ring_ref_name[RINGREF_NAME_LEN]; + pathsize = strlen(dev->nodename) + QUEUE_NAME_LEN; + path = kmalloc(pathsize, GFP_KERNEL); + if (!path) { + err = -ENOMEM; + message = "ENOMEM while writing ring references"; + goto abort_transaction; + } - snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); - err = xenbus_printf(xbt, dev->nodename, ring_ref_name, - "%u", info->ring_ref[i]); + for (i = 0; i < info->nr_rings; i++) { + memset(path, 0, pathsize); + snprintf(path, pathsize, "%s/queue-%u", dev->nodename, i); + err = write_per_ring_nodes(xbt, &info->rinfo[i], path); if (err) { - message = "writing ring-ref"; - goto abort_transaction; + kfree(path); + goto destroy_blkring; } } - } - err = xenbus_printf(xbt, dev->nodename, - "event-channel", "%u", info->evtchn); - if (err) { - message = "writing event-channel"; - goto abort_transaction; + kfree(path); } err = xenbus_printf(xbt, dev->nodename, "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE); @@ -1540,9 +1848,14 @@ again: goto destroy_blkring; } - for (i = 0; i < BLK_RING_SIZE(info); i++) - info->shadow[i].req.u.rw.id = i+1; - info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + for (i = 0; i < info->nr_rings; i++) { + unsigned int j; + struct blkfront_ring_info *rinfo = &info->rinfo[i]; + + for (j = 0; j < BLK_RING_SIZE(info); j++) + rinfo->shadow[j].req.u.rw.id = j + 1; + rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + } xenbus_switch_state(dev, XenbusStateInitialised); return 0; @@ -1553,7 +1866,10 @@ again: xenbus_dev_fatal(dev, err, "%s", message); destroy_blkring: blkif_free(info, 0); - out: + + kfree(info); + dev_set_drvdata(&dev->dev, NULL); + return err; } @@ -1567,7 +1883,9 @@ static int blkfront_probe(struct xenbus_device *dev, const struct xenbus_device_id *id) { int err, vdevice; + unsigned int r_index; struct blkfront_info *info; + unsigned int backend_max_queues = 0; /* FIXME: Use dynamic device id if this is not set. */ err = xenbus_scanf(XBT_NIL, dev->nodename, @@ -1617,15 +1935,39 @@ static int blkfront_probe(struct xenbus_device *dev, return -ENOMEM; } - mutex_init(&info->mutex); - spin_lock_init(&info->io_lock); info->xbdev = dev; + /* Check if backend supports multiple queues. */ + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "multi-queue-max-queues", "%u", &backend_max_queues); + if (err < 0) + backend_max_queues = 1; + + info->nr_rings = min(backend_max_queues, xen_blkif_max_queues); + /* We need at least one ring. */ + if (!info->nr_rings) + info->nr_rings = 1; + + info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL); + if (!info->rinfo) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating ring_info structure"); + kfree(info); + return -ENOMEM; + } + + for (r_index = 0; r_index < info->nr_rings; r_index++) { + struct blkfront_ring_info *rinfo; + + rinfo = &info->rinfo[r_index]; + INIT_LIST_HEAD(&rinfo->indirect_pages); + INIT_LIST_HEAD(&rinfo->grants); + rinfo->dev_info = info; + INIT_WORK(&rinfo->work, blkif_restart_queue); + spin_lock_init(&rinfo->ring_lock); + } + + mutex_init(&info->mutex); info->vdevice = vdevice; - INIT_LIST_HEAD(&info->grants); - INIT_LIST_HEAD(&info->indirect_pages); - info->persistent_gnts_c = 0; info->connected = BLKIF_STATE_DISCONNECTED; - INIT_WORK(&info->work, blkif_restart_queue); /* Front end dir is a number, which is used as the id. */ info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); @@ -1649,7 +1991,7 @@ static void split_bio_end(struct bio *bio) static int blkif_recover(struct blkfront_info *info) { - int i; + unsigned int i, r_index; struct request *req, *n; struct blk_shadow *copy; int rc; @@ -1660,64 +2002,73 @@ static int blkif_recover(struct blkfront_info *info) struct split_bio *split_bio; struct list_head requests; - /* Stage 1: Make a safe copy of the shadow state. */ - copy = kmemdup(info->shadow, sizeof(info->shadow), - GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); - if (!copy) - return -ENOMEM; - - /* Stage 2: Set up free list. */ - memset(&info->shadow, 0, sizeof(info->shadow)); - for (i = 0; i < BLK_RING_SIZE(info); i++) - info->shadow[i].req.u.rw.id = i+1; - info->shadow_free = info->ring.req_prod_pvt; - info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; - - rc = blkfront_gather_backend_features(info); - if (rc) { - kfree(copy); - return rc; - } - + blkfront_gather_backend_features(info); segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; blk_queue_max_segments(info->rq, segs); bio_list_init(&bio_list); INIT_LIST_HEAD(&requests); - for (i = 0; i < BLK_RING_SIZE(info); i++) { - /* Not in use? */ - if (!copy[i].request) - continue; - /* - * Get the bios in the request so we can re-queue them. - */ - if (copy[i].request->cmd_flags & - (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { + for (r_index = 0; r_index < info->nr_rings; r_index++) { + struct blkfront_ring_info *rinfo; + + rinfo = &info->rinfo[r_index]; + /* Stage 1: Make a safe copy of the shadow state. */ + copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow), + GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); + if (!copy) + return -ENOMEM; + + /* Stage 2: Set up free list. */ + memset(&rinfo->shadow, 0, sizeof(rinfo->shadow)); + for (i = 0; i < BLK_RING_SIZE(info); i++) + rinfo->shadow[i].req.u.rw.id = i+1; + rinfo->shadow_free = rinfo->ring.req_prod_pvt; + rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + + rc = blkfront_setup_indirect(rinfo); + if (rc) { + kfree(copy); + return rc; + } + + for (i = 0; i < BLK_RING_SIZE(info); i++) { + /* Not in use? */ + if (!copy[i].request) + continue; + /* - * Flush operations don't contain bios, so - * we need to requeue the whole request + * Get the bios in the request so we can re-queue them. */ - list_add(©[i].request->queuelist, &requests); - continue; + if (copy[i].request->cmd_flags & + (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { + /* + * Flush operations don't contain bios, so + * we need to requeue the whole request + */ + list_add(©[i].request->queuelist, &requests); + continue; + } + merge_bio.head = copy[i].request->bio; + merge_bio.tail = copy[i].request->biotail; + bio_list_merge(&bio_list, &merge_bio); + copy[i].request->bio = NULL; + blk_end_request_all(copy[i].request, 0); } - merge_bio.head = copy[i].request->bio; - merge_bio.tail = copy[i].request->biotail; - bio_list_merge(&bio_list, &merge_bio); - copy[i].request->bio = NULL; - blk_end_request_all(copy[i].request, 0); - } - - kfree(copy); + kfree(copy); + } xenbus_switch_state(info->xbdev, XenbusStateConnected); - spin_lock_irq(&info->io_lock); - /* Now safe for us to use the shared ring */ info->connected = BLKIF_STATE_CONNECTED; - /* Kick any other new requests queued since we resumed */ - kick_pending_request_queues(info); + for (r_index = 0; r_index < info->nr_rings; r_index++) { + struct blkfront_ring_info *rinfo; + + rinfo = &info->rinfo[r_index]; + /* Kick any other new requests queued since we resumed */ + kick_pending_request_queues(rinfo); + } list_for_each_entry_safe(req, n, &requests, queuelist) { /* Requeue pending requests (flush or discard) */ @@ -1725,7 +2076,6 @@ static int blkif_recover(struct blkfront_info *info) BUG_ON(req->nr_phys_segments > segs); blk_mq_requeue_request(req); } - spin_unlock_irq(&info->io_lock); blk_mq_kick_requeue_list(info->rq); while ((bio = bio_list_pop(&bio_list)) != NULL) { @@ -1790,8 +2140,7 @@ static int blkfront_resume(struct xenbus_device *dev) return err; } -static void -blkfront_closing(struct blkfront_info *info) +static void blkfront_closing(struct blkfront_info *info) { struct xenbus_device *xbdev = info->xbdev; struct block_device *bdev = NULL; @@ -1851,18 +2200,29 @@ static void blkfront_setup_discard(struct blkfront_info *info) info->feature_secdiscard = !!discard_secure; } -static int blkfront_setup_indirect(struct blkfront_info *info) +static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo) { unsigned int psegs, grants; int err, i; + struct blkfront_info *info = rinfo->dev_info; - if (info->max_indirect_segments == 0) - grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; + if (info->max_indirect_segments == 0) { + if (!HAS_EXTRA_REQ) + grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; + else { + /* + * When an extra req is required, the maximum + * grants supported is related to the size of the + * Linux block segment. + */ + grants = GRANTS_PER_PSEG; + } + } else grants = info->max_indirect_segments; psegs = grants / GRANTS_PER_PSEG; - err = fill_grant_buffer(info, + err = fill_grant_buffer(rinfo, (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info)); if (err) goto out_of_memory; @@ -1875,31 +2235,31 @@ static int blkfront_setup_indirect(struct blkfront_info *info) */ int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info); - BUG_ON(!list_empty(&info->indirect_pages)); + BUG_ON(!list_empty(&rinfo->indirect_pages)); for (i = 0; i < num; i++) { struct page *indirect_page = alloc_page(GFP_NOIO); if (!indirect_page) goto out_of_memory; - list_add(&indirect_page->lru, &info->indirect_pages); + list_add(&indirect_page->lru, &rinfo->indirect_pages); } } for (i = 0; i < BLK_RING_SIZE(info); i++) { - info->shadow[i].grants_used = kzalloc( - sizeof(info->shadow[i].grants_used[0]) * grants, + rinfo->shadow[i].grants_used = kzalloc( + sizeof(rinfo->shadow[i].grants_used[0]) * grants, GFP_NOIO); - info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * psegs, GFP_NOIO); + rinfo->shadow[i].sg = kzalloc(sizeof(rinfo->shadow[i].sg[0]) * psegs, GFP_NOIO); if (info->max_indirect_segments) - info->shadow[i].indirect_grants = kzalloc( - sizeof(info->shadow[i].indirect_grants[0]) * + rinfo->shadow[i].indirect_grants = kzalloc( + sizeof(rinfo->shadow[i].indirect_grants[0]) * INDIRECT_GREFS(grants), GFP_NOIO); - if ((info->shadow[i].grants_used == NULL) || - (info->shadow[i].sg == NULL) || + if ((rinfo->shadow[i].grants_used == NULL) || + (rinfo->shadow[i].sg == NULL) || (info->max_indirect_segments && - (info->shadow[i].indirect_grants == NULL))) + (rinfo->shadow[i].indirect_grants == NULL))) goto out_of_memory; - sg_init_table(info->shadow[i].sg, psegs); + sg_init_table(rinfo->shadow[i].sg, psegs); } @@ -1907,16 +2267,16 @@ static int blkfront_setup_indirect(struct blkfront_info *info) out_of_memory: for (i = 0; i < BLK_RING_SIZE(info); i++) { - kfree(info->shadow[i].grants_used); - info->shadow[i].grants_used = NULL; - kfree(info->shadow[i].sg); - info->shadow[i].sg = NULL; - kfree(info->shadow[i].indirect_grants); - info->shadow[i].indirect_grants = NULL; - } - if (!list_empty(&info->indirect_pages)) { + kfree(rinfo->shadow[i].grants_used); + rinfo->shadow[i].grants_used = NULL; + kfree(rinfo->shadow[i].sg); + rinfo->shadow[i].sg = NULL; + kfree(rinfo->shadow[i].indirect_grants); + rinfo->shadow[i].indirect_grants = NULL; + } + if (!list_empty(&rinfo->indirect_pages)) { struct page *indirect_page, *n; - list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { + list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) { list_del(&indirect_page->lru); __free_page(indirect_page); } @@ -1927,7 +2287,7 @@ out_of_memory: /* * Gather all backend feature-* */ -static int blkfront_gather_backend_features(struct blkfront_info *info) +static void blkfront_gather_backend_features(struct blkfront_info *info) { int err; int barrier, flush, discard, persistent; @@ -1982,8 +2342,6 @@ static int blkfront_gather_backend_features(struct blkfront_info *info) else info->max_indirect_segments = min(indirect_segments, xen_blkif_max_segments); - - return blkfront_setup_indirect(info); } /* @@ -1996,7 +2354,7 @@ static void blkfront_connect(struct blkfront_info *info) unsigned long sector_size; unsigned int physical_sector_size; unsigned int binfo; - int err; + int err, i; switch (info->connected) { case BLKIF_STATE_CONNECTED: @@ -2053,11 +2411,15 @@ static void blkfront_connect(struct blkfront_info *info) if (err != 1) physical_sector_size = sector_size; - err = blkfront_gather_backend_features(info); - if (err) { - xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", - info->xbdev->otherend); - return; + blkfront_gather_backend_features(info); + for (i = 0; i < info->nr_rings; i++) { + err = blkfront_setup_indirect(&info->rinfo[i]); + if (err) { + xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", + info->xbdev->otherend); + blkif_free(info, 0); + break; + } } err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, @@ -2071,10 +2433,9 @@ static void blkfront_connect(struct blkfront_info *info) xenbus_switch_state(info->xbdev, XenbusStateConnected); /* Kick pending requests. */ - spin_lock_irq(&info->io_lock); info->connected = BLKIF_STATE_CONNECTED; - kick_pending_request_queues(info); - spin_unlock_irq(&info->io_lock); + for (i = 0; i < info->nr_rings; i++) + kick_pending_request_queues(&info->rinfo[i]); add_disk(info->gd); @@ -2095,11 +2456,8 @@ static void blkback_changed(struct xenbus_device *dev, case XenbusStateInitWait: if (dev->state != XenbusStateInitialising) break; - if (talk_to_blkback(dev, info)) { - kfree(info); - dev_set_drvdata(&dev->dev, NULL); + if (talk_to_blkback(dev, info)) break; - } case XenbusStateInitialising: case XenbusStateInitialised: case XenbusStateReconfiguring: @@ -2108,6 +2466,10 @@ static void blkback_changed(struct xenbus_device *dev, break; case XenbusStateConnected: + if (dev->state != XenbusStateInitialised) { + if (talk_to_blkback(dev, info)) + break; + } blkfront_connect(info); break; @@ -2281,6 +2643,7 @@ static struct xenbus_driver blkfront_driver = { static int __init xlblk_init(void) { int ret; + int nr_cpus = num_online_cpus(); if (!xen_domain()) return -ENODEV; @@ -2288,7 +2651,13 @@ static int __init xlblk_init(void) if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) { pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER); - xen_blkif_max_ring_order = 0; + xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; + } + + if (xen_blkif_max_queues > nr_cpus) { + pr_info("Invalid max_queues (%d), will use default max: %d.\n", + xen_blkif_max_queues, nr_cpus); + xen_blkif_max_queues = nr_cpus; } if (!xen_has_pv_disk_devices()) |