From 288dab8a35a0bde426a09870943c8d3ee3a50dab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Jun 2016 16:00:36 +0200 Subject: block: add a separate operation type for secure erase Instead of overloading the discard support with the REQ_SECURE flag. Use the opportunity to rename the queue flag as well, and remove the dead checks for this flag in the RAID 1 and RAID 10 drivers that don't claim support for secure erase. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 5 ++--- include/linux/blkdev.h | 23 ++++------------------- 2 files changed, 6 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 562ab83..efba1f2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -163,7 +163,6 @@ enum rq_flag_bits { __REQ_SYNC, /* request is sync (sync write or read) */ __REQ_META, /* metadata io request */ __REQ_PRIO, /* boost priority in cfq */ - __REQ_SECURE, /* secure discard (used with REQ_OP_DISCARD) */ __REQ_NOIDLE, /* don't anticipate more IO after this one */ __REQ_INTEGRITY, /* I/O includes block integrity payload */ @@ -212,7 +211,7 @@ enum rq_flag_bits { (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) #define REQ_COMMON_MASK \ (REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \ - REQ_PREFLUSH | REQ_FUA | REQ_SECURE | REQ_INTEGRITY | REQ_NOMERGE) + REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE) #define REQ_CLONE_MASK REQ_COMMON_MASK /* This mask is used for both bio and request merge checking */ @@ -239,7 +238,6 @@ enum rq_flag_bits { #define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ) #define REQ_IO_STAT (1ULL << __REQ_IO_STAT) #define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE) -#define REQ_SECURE (1ULL << __REQ_SECURE) #define REQ_PM (1ULL << __REQ_PM) #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) @@ -248,6 +246,7 @@ enum req_op { REQ_OP_READ, REQ_OP_WRITE, REQ_OP_DISCARD, /* request to discard sectors */ + REQ_OP_SECURE_ERASE, /* request to securely erase sectors */ REQ_OP_WRITE_SAME, /* write same block many times */ REQ_OP_FLUSH, /* request for cache flush */ }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0c9f879..53fee61 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -497,7 +497,7 @@ struct request_queue { #define QUEUE_FLAG_DISCARD 14 /* supports DISCARD */ #define QUEUE_FLAG_NOXMERGES 15 /* No extended merges */ #define QUEUE_FLAG_ADD_RANDOM 16 /* Contributes to random pool */ -#define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */ +#define QUEUE_FLAG_SECERASE 17 /* supports secure erase */ #define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ #define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */ #define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */ @@ -593,8 +593,8 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_queue_stackable(q) \ test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) -#define blk_queue_secdiscard(q) (blk_queue_discard(q) && \ - test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags)) +#define blk_queue_secure_erase(q) \ + (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) #define blk_noretry_request(rq) \ ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ @@ -675,21 +675,6 @@ static inline bool rq_mergeable(struct request *rq) return true; } -static inline bool blk_check_merge_flags(unsigned int flags1, unsigned int op1, - unsigned int flags2, unsigned int op2) -{ - if ((op1 == REQ_OP_DISCARD) != (op2 == REQ_OP_DISCARD)) - return false; - - if ((flags1 & REQ_SECURE) != (flags2 & REQ_SECURE)) - return false; - - if ((op1 == REQ_OP_WRITE_SAME) != (op2 == REQ_OP_WRITE_SAME)) - return false; - - return true; -} - static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) { if (bio_data(a) == bio_data(b)) @@ -1158,7 +1143,7 @@ extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, int op_flags, + sector_t nr_sects, gfp_t gfp_mask, int flags, struct bio **biop); extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page); -- cgit v1.1 From 14e974a84e831bf9a44495c7256a6846e7f77630 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jun 2016 23:20:43 +0200 Subject: nvme.h: add RTD3R, RTD3E and OAES fields These have been added in NVMe 1.2 and we'll need at least oaes for the NVMe target driver. Reviewed-by: Sagi Grimberg Reviewed-by: Jay Freyensee Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/nvme.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 7d51b29..ff5ebc3 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -107,7 +107,10 @@ struct nvme_id_ctrl { __u8 mdts; __le16 cntlid; __le32 ver; - __u8 rsvd84[172]; + __le32 rtd3r; + __le32 rtd3e; + __le32 oaes; + __u8 rsvd96[160]; __le16 oacs; __u8 acl; __u8 aerl; -- cgit v1.1 From 725b358836ed038d7d8eafef86330b3a0b3f9c2f Mon Sep 17 00:00:00 2001 From: Armen Baloyan Date: Mon, 6 Jun 2016 23:20:44 +0200 Subject: nvme.h: Add get_log_page command strucure Add get_log_page command structure and a corresponding entry in nvme_command union Signed-off-by: Armen Baloyan Reviewed-by: Jay Freyensee Reviewed--by: Sagi Grimberg Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/nvme.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index ff5ebc3..9925b85 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -519,6 +519,24 @@ struct nvme_format_cmd { __u32 rsvd11[5]; }; +struct nvme_get_log_page_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + __le64 prp1; + __le64 prp2; + __u8 lid; + __u8 rsvd10; + __le16 numdl; + __le16 numdu; + __u16 rsvd11; + __le32 lpol; + __le32 lpou; + __u32 rsvd14[2]; +}; + struct nvme_command { union { struct nvme_common_command common; @@ -532,6 +550,7 @@ struct nvme_command { struct nvme_format_cmd format; struct nvme_dsm_cmd dsm; struct nvme_abort_cmd abort; + struct nvme_get_log_page_command get_log_page; }; }; -- cgit v1.1 From 69cd27e2511616f9b402d1bad4c49f91aa7411a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jun 2016 23:20:45 +0200 Subject: nvme.h: add NVM command set SQE/CQE size defines Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/nvme.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 9925b85..9807d98 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -50,6 +50,13 @@ enum { #define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2) #define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1) +/* + * Submission and Completion Queue Entry Sizes for the NVM command set. + * (In bytes and specified as a power of two (2^n)). + */ +#define NVME_NVM_IOSQES 6 +#define NVME_NVM_IOCQES 4 + enum { NVME_CC_ENABLE = 1 << 0, NVME_CC_CSS_NVM = 0 << 4, @@ -61,8 +68,8 @@ enum { NVME_CC_SHN_NORMAL = 1 << 14, NVME_CC_SHN_ABRUPT = 2 << 14, NVME_CC_SHN_MASK = 3 << 14, - NVME_CC_IOSQES = 6 << 16, - NVME_CC_IOCQES = 4 << 20, + NVME_CC_IOSQES = NVME_NVM_IOSQES << 16, + NVME_CC_IOCQES = NVME_NVM_IOCQES << 20, NVME_CSTS_RDY = 1 << 0, NVME_CSTS_CFS = 1 << 1, NVME_CSTS_NSSRO = 1 << 4, -- cgit v1.1 From 79f370eac63796a8933b210bca02f006ba32d22e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jun 2016 23:20:46 +0200 Subject: nvme.h: add AER constants Reviewed-by: Jay Freyensee Reviewed-by: Sagi Grimberg Reviewed-by: Ming Lin Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/nvme.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 9807d98..a9b8c7b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -284,6 +284,12 @@ struct nvme_reservation_status { } regctl_ds[]; }; +enum nvme_async_event_type { + NVME_AER_TYPE_ERROR = 0, + NVME_AER_TYPE_SMART = 1, + NVME_AER_TYPE_NOTICE = 2, +}; + /* I/O commands */ enum nvme_opcode { -- cgit v1.1 From 3972be23bd2d2bcfaa44595a260a371cd9218872 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 6 Jun 2016 23:20:47 +0200 Subject: nvme.h: add constants for PSDT and FUSE values Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/nvme.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index a9b8c7b..2b82f05 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -306,6 +306,29 @@ enum nvme_opcode { nvme_cmd_resv_release = 0x15, }; +/* + * Lowest two bits of our flags field (FUSE field in the spec): + * + * @NVME_CMD_FUSE_FIRST: Fused Operation, first command + * @NVME_CMD_FUSE_SECOND: Fused Operation, second command + * + * Highest two bits in our flags field (PSDT field in the spec): + * + * @NVME_CMD_PSDT_SGL_METABUF: Use SGLS for this transfer, + * If used, MPTR contains addr of single physical buffer (byte aligned). + * @NVME_CMD_PSDT_SGL_METASEG: Use SGLS for this transfer, + * If used, MPTR contains an address of an SGL segment containing + * exactly 1 SGL descriptor (qword aligned). + */ +enum { + NVME_CMD_FUSE_FIRST = (1 << 0), + NVME_CMD_FUSE_SECOND = (1 << 1), + + NVME_CMD_SGL_METABUF = (1 << 6), + NVME_CMD_SGL_METASEG = (1 << 7), + NVME_CMD_SGL_ALL = NVME_CMD_SGL_METABUF | NVME_CMD_SGL_METASEG, +}; + struct nvme_common_command { __u8 opcode; __u8 flags; -- cgit v1.1 From 7a5abb4b48570c3552e33ff4c72ae1e8dac3ba15 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jun 2016 23:20:49 +0200 Subject: nvme: factor out a add nvme_is_write helper Centralize the check if a given NVMe command reads or writes data. Reviewed-by: Sagi Grimberg Reviewed-by: Jay Freyensee Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/nvme.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2b82f05..dc815cc 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -590,6 +590,11 @@ struct nvme_command { }; }; +static inline bool nvme_is_write(struct nvme_command *cmd) +{ + return cmd->common.opcode & 1; +} + enum { NVME_SC_SUCCESS = 0x0, NVME_SC_INVALID_OPCODE = 0x1, -- cgit v1.1 From a5ca66c419410b4a26ab47b120d5424bd1d33700 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 14 Jun 2016 00:26:14 +0200 Subject: drbd: Introduce new disk config option rs-discard-granularity As long as the value is 0 the feature is disabled. With setting it to a positive value, DRBD limits and aligns its resync requests to the rs-discard-granularity setting. If the sync source detects all zeros in such a block, the resync target discards the range on disk. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- include/linux/drbd_genl.h | 6 +++--- include/linux/drbd_limits.h | 6 ++++++ 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index 2d0e5ad..ab649d8 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h @@ -123,14 +123,14 @@ GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf, __u32_field_def(13, DRBD_GENLA_F_MANDATORY, c_fill_target, DRBD_C_FILL_TARGET_DEF) __u32_field_def(14, DRBD_GENLA_F_MANDATORY, c_max_rate, DRBD_C_MAX_RATE_DEF) __u32_field_def(15, DRBD_GENLA_F_MANDATORY, c_min_rate, DRBD_C_MIN_RATE_DEF) + __u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF) + __u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF) + __u32_field_def(25, 0 /* OPTIONAL */, rs_discard_granularity, DRBD_RS_DISCARD_GRANULARITY_DEF) __flg_field_def(16, DRBD_GENLA_F_MANDATORY, disk_barrier, DRBD_DISK_BARRIER_DEF) __flg_field_def(17, DRBD_GENLA_F_MANDATORY, disk_flushes, DRBD_DISK_FLUSHES_DEF) __flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF) __flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF) - __u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF) - __u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF) - /* 9: __u32_field_def(22, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) */ __flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF) ) diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 8ac8c5d..2e4aad8 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -230,4 +230,10 @@ #define DRBD_SOCKET_CHECK_TIMEO_MAX DRBD_PING_TIMEO_MAX #define DRBD_SOCKET_CHECK_TIMEO_DEF 0 #define DRBD_SOCKET_CHECK_TIMEO_SCALE '1' + +#define DRBD_RS_DISCARD_GRANULARITY_MIN 0 +#define DRBD_RS_DISCARD_GRANULARITY_MAX (1<<20) /* 1MiByte */ +#define DRBD_RS_DISCARD_GRANULARITY_DEF 0 /* disabled by default */ +#define DRBD_RS_DISCARD_GRANULARITY_SCALE '1' /* bytes */ + #endif -- cgit v1.1 From dd4f699da674010c58d7a2534215b4ca1ff13b13 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 14 Jun 2016 00:26:20 +0200 Subject: drbd: when receiving P_TRIM, zero-out partial unaligned chunks We can avoid spurious data divergence caused by partially-ignored discards on certain backends with discard_zeroes_data=0, if we translate partial unaligned discard requests into explicit zero-out. The relevant use case is LVM/DM thin. If on different nodes, DRBD is backed by devices with differing discard characteristics, discards may lead to data divergence (old data or garbage left over on one backend, zeroes due to unmapped areas on the other backend). Online verify would now potentially report tons of spurious differences. While probably harmless for most use cases (fstrim on a file system), DRBD cannot have that, it would violate our promise to upper layers that our data instances on the nodes are identical. To be correct and play safe (make sure data is identical on both copies), we would have to disable discard support, if our local backend (on a Primary) does not support "discard_zeroes_data=true". We'd also have to translate discards to explicit zero-out on the receiving (typically: Secondary) side, unless the receiving side supports "discard_zeroes_data=true". Which both would allocate those blocks, instead of unmapping them, in contrast with expectations. LVM/DM thin does set discard_zeroes_data=0, because it silently ignores discards to partial chunks. We can work around this by checking the alignment first. For unaligned (wrt. alignment and granularity) or too small discards, we zero-out the initial (and/or) trailing unaligned partial chunks, but discard all the aligned full chunks. At least for LVM/DM thin, the result is effectively "discard_zeroes_data=1". Arguably it should behave this way internally, by default, and we'll try to make that happen. But our workaround is still valid for already deployed setups, and for other devices that may behave this way. Setting discard-zeroes-if-aligned=yes will allow DRBD to use discards, and to announce discard_zeroes_data=true, even on backends that announce discard_zeroes_data=false. Setting discard-zeroes-if-aligned=no will cause DRBD to always fall-back to zero-out on the receiving side, and to not even announce discard capabilities on the Primary, if the respective backend announces discard_zeroes_data=false. We used to ignore the discard_zeroes_data setting completely. To not break established and expected behaviour, and suddenly cause fstrim on thin-provisioned LVs to run out-of-space, instead of freeing up space, the default value is "yes". Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- include/linux/drbd_genl.h | 1 + include/linux/drbd_limits.h | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index ab649d8..c934d3a 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h @@ -132,6 +132,7 @@ GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf, __flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF) __flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF) __flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF) + __flg_field_def(24, 0 /* OPTIONAL */, discard_zeroes_if_aligned, DRBD_DISCARD_ZEROES_IF_ALIGNED) ) GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts, diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 2e4aad8..a351c40 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -210,6 +210,12 @@ #define DRBD_MD_FLUSHES_DEF 1 #define DRBD_TCP_CORK_DEF 1 #define DRBD_AL_UPDATES_DEF 1 +/* We used to ignore the discard_zeroes_data setting. + * To not change established (and expected) behaviour, + * by default assume that, for discard_zeroes_data=0, + * we can make that an effective discard_zeroes_data=1, + * if we only explicitly zero-out unaligned partial chunks. */ +#define DRBD_DISCARD_ZEROES_IF_ALIGNED 1 #define DRBD_ALLOW_TWO_PRIMARIES_DEF 0 #define DRBD_ALWAYS_ASBP_DEF 0 -- cgit v1.1 From 505675f96cf0f169647a18c3dda1f373eca957b1 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 14 Jun 2016 00:26:23 +0200 Subject: drbd: allow larger max_discard_sectors Make sure we have at least 67 (> AL_UPDATES_PER_TRANSACTION) al-extents available, and allow up to half of that to be discarded in one bio. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- include/linux/drbd_limits.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index a351c40..ddac684 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -126,8 +126,7 @@ #define DRBD_RESYNC_RATE_DEF 250 #define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */ - /* less than 7 would hit performance unnecessarily. */ -#define DRBD_AL_EXTENTS_MIN 7 +#define DRBD_AL_EXTENTS_MIN 67 /* we use u16 as "slot number", (u16)~0 is "FREE". * If you use >= 292 kB on-disk ring buffer, * this is the maximum you can use: */ -- cgit v1.1 From 7e5fec31685a5c69b81e9005eaed44318880d881 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 14 Jun 2016 00:26:35 +0200 Subject: drbd: code cleanups without semantic changes This contains various cosmetic fixes ranging from simple typos to const-ifying, and using booleans properly. Original commit messages from Fabian's patch set: drbd: debugfs: constify drbd_version_fops drbd: use seq_put instead of seq_print where possible drbd: include linux/uaccess.h instead of asm/uaccess.h drbd: use const char * const for drbd strings drbd: kerneldoc warning fix in w_e_end_data_req() drbd: use unsigned for one bit fields drbd: use bool for peer is_ states drbd: fix typo drbd: use | for bitmask combination drbd: use true/false for bool drbd: fix drbd_bm_init() comments drbd: introduce peer state union drbd: fix maybe_pull_ahead() locking comments drbd: use bool for growing drbd: remove redundant declarations drbd: replace if/BUG by BUG_ON Signed-off-by: Fabian Frederick Signed-off-by: Roland Kammerer Signed-off-by: Jens Axboe --- include/linux/drbd.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/drbd.h b/include/linux/drbd.h index d6b3c99..2b26156 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -370,6 +370,14 @@ enum drbd_notification_type { NOTIFY_FLAGS = NOTIFY_CONTINUES, }; +enum drbd_peer_state { + P_INCONSISTENT = 3, + P_OUTDATED = 4, + P_DOWN = 5, + P_PRIMARY = 6, + P_FENCING = 7, +}; + #define UUID_JUST_CREATED ((__u64)4) enum write_ordering_e { -- cgit v1.1 From 1b57e66384e2d21150301e68078526fac5680a16 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 14 Jun 2016 00:26:39 +0200 Subject: drbd: correctly handle failed crypto_alloc_hash crypto_alloc_hash returns an ERR_PTR(), not NULL. Also reset peer_integrity_tfm to NULL, to not call crypto_free_hash() on an errno in the cleanup path. Reported-by: Insu Yun Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- include/linux/drbd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 2b26156..002611c 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -51,7 +51,7 @@ #endif extern const char *drbd_buildtag(void); -#define REL_VERSION "8.4.6" +#define REL_VERSION "8.4.7" #define API_VERSION 1 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 101 -- cgit v1.1 From e63a46bef01ff3064f44dba145833284fb6adeec Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 15 Jun 2016 18:17:27 -0700 Subject: block: introduce device_add_disk() In preparation for removing the ->driverfs_dev member of a gendisk, add an api that takes the parent device as a parameter to add_disk(). For now this maintains the status quo of WARN()ing on failure, but not return a error code. Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Signed-off-by: Dan Williams --- include/linux/genhd.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 359a8e4..df1dabb 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -414,7 +414,13 @@ static inline void free_part_info(struct hd_struct *part) extern void part_round_stats(int cpu, struct hd_struct *part); /* block/genhd.c */ -extern void add_disk(struct gendisk *disk); +extern void device_add_disk(struct device *parent, struct gendisk *disk); +static inline void add_disk(struct gendisk *disk) +{ + /* temporary while we convert callers to device_add_disk */ + device_add_disk(disk->driverfs_dev, disk); +} + extern void del_gendisk(struct gendisk *gp); extern struct gendisk *get_gendisk(dev_t dev, int *partno); extern struct block_device *bdget_disk(struct gendisk *disk, int partno); -- cgit v1.1 From 52c44d93c26f5a76068c0a8cc83bb8f56f38043d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 15 Jun 2016 19:43:07 -0700 Subject: block: remove ->driverfs_dev Now that all drivers that specify a ->driverfs_dev have been converted to device_add_disk(), the pointer can be removed from struct gendisk. Cc: Jens Axboe Cc: Ross Zwisler Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Dan Williams --- include/linux/genhd.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index df1dabb..1dbf52f 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -205,7 +205,6 @@ struct gendisk { void *private_data; int flags; - struct device *driverfs_dev; // FIXME: remove struct kobject *slave_dir; struct timer_rand_state *random; @@ -417,8 +416,7 @@ extern void part_round_stats(int cpu, struct hd_struct *part); extern void device_add_disk(struct device *parent, struct gendisk *disk); static inline void add_disk(struct gendisk *disk) { - /* temporary while we convert callers to device_add_disk */ - device_add_disk(disk->driverfs_dev, disk); + device_add_disk(NULL, disk); } extern void del_gendisk(struct gendisk *gp); -- cgit v1.1 From 1f5bd336b9150560458b03460cbcfcfbcf8995b1 Mon Sep 17 00:00:00 2001 From: Ming Lin Date: Mon, 13 Jun 2016 16:45:21 +0200 Subject: blk-mq: add blk_mq_alloc_request_hctx For some protocols like NVMe over Fabrics we need to be able to send initialization commands to a specific queue. Based on an earlier patch from Christoph Hellwig . Signed-off-by: Ming Lin [hch: disallow sleeping allocation, req_op fixes] Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2498fdf..cbfd8ca 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -196,6 +196,8 @@ enum { struct request *blk_mq_alloc_request(struct request_queue *q, int rw, unsigned int flags); +struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int op, + unsigned int flags, unsigned int hctx_idx); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags); -- cgit v1.1 From eb793e2c9286cca415423edff4942e4ba28e3cd4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jun 2016 16:45:25 +0200 Subject: nvme.h: add NVMe over Fabrics definitions The NVMe over Fabrics specification defines a protocol interface and related extensions to NVMe that enable operation over network protocols. The NVMe over Fabrics specification has an NVMe Transport binding for each NVMe Transport. This patch adds the fabrics related definitions: - fabric specific command set and error codes - transport addressing and binding definitions - fabrics sgl extensions - controller identification fabrics enhancements - discovery log page definition Signed-off-by: Armen Baloyan Signed-off-by: James Smart Signed-off-by: Jay Freyensee Signed-off-by: Ming Lin Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/nvme.h | 337 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 318 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index dc815cc..7525030 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -16,6 +16,78 @@ #define _LINUX_NVME_H #include +#include + +/* NQN names in commands fields specified one size */ +#define NVMF_NQN_FIELD_LEN 256 + +/* However the max length of a qualified name is another size */ +#define NVMF_NQN_SIZE 223 + +#define NVMF_TRSVCID_SIZE 32 +#define NVMF_TRADDR_SIZE 256 +#define NVMF_TSAS_SIZE 256 + +#define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery" + +#define NVME_RDMA_IP_PORT 4420 + +enum nvme_subsys_type { + NVME_NQN_DISC = 1, /* Discovery type target subsystem */ + NVME_NQN_NVME = 2, /* NVME type target subsystem */ +}; + +/* Address Family codes for Discovery Log Page entry ADRFAM field */ +enum { + NVMF_ADDR_FAMILY_PCI = 0, /* PCIe */ + NVMF_ADDR_FAMILY_IP4 = 1, /* IP4 */ + NVMF_ADDR_FAMILY_IP6 = 2, /* IP6 */ + NVMF_ADDR_FAMILY_IB = 3, /* InfiniBand */ + NVMF_ADDR_FAMILY_FC = 4, /* Fibre Channel */ +}; + +/* Transport Type codes for Discovery Log Page entry TRTYPE field */ +enum { + NVMF_TRTYPE_RDMA = 1, /* RDMA */ + NVMF_TRTYPE_FC = 2, /* Fibre Channel */ + NVMF_TRTYPE_LOOP = 254, /* Reserved for host usage */ + NVMF_TRTYPE_MAX, +}; + +/* Transport Requirements codes for Discovery Log Page entry TREQ field */ +enum { + NVMF_TREQ_NOT_SPECIFIED = 0, /* Not specified */ + NVMF_TREQ_REQUIRED = 1, /* Required */ + NVMF_TREQ_NOT_REQUIRED = 2, /* Not Required */ +}; + +/* RDMA QP Service Type codes for Discovery Log Page entry TSAS + * RDMA_QPTYPE field + */ +enum { + NVMF_RDMA_QPTYPE_CONNECTED = 0, /* Reliable Connected */ + NVMF_RDMA_QPTYPE_DATAGRAM = 1, /* Reliable Datagram */ +}; + +/* RDMA QP Service Type codes for Discovery Log Page entry TSAS + * RDMA_QPTYPE field + */ +enum { + NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 0, /* No Provider Specified */ + NVMF_RDMA_PRTYPE_IB = 1, /* InfiniBand */ + NVMF_RDMA_PRTYPE_ROCE = 2, /* InfiniBand RoCE */ + NVMF_RDMA_PRTYPE_ROCEV2 = 3, /* InfiniBand RoCEV2 */ + NVMF_RDMA_PRTYPE_IWARP = 4, /* IWARP */ +}; + +/* RDMA Connection Management Service Type codes for Discovery Log Page + * entry TSAS RDMA_CMS field + */ +enum { + NVMF_RDMA_CMS_RDMA_CM = 0, /* Sockets based enpoint addressing */ +}; + +#define NVMF_AQ_DEPTH 32 enum { NVME_REG_CAP = 0x0000, /* Controller Capabilities */ @@ -117,7 +189,8 @@ struct nvme_id_ctrl { __le32 rtd3r; __le32 rtd3e; __le32 oaes; - __u8 rsvd96[160]; + __le32 ctratt; + __u8 rsvd100[156]; __le16 oacs; __u8 acl; __u8 aerl; @@ -132,7 +205,7 @@ struct nvme_id_ctrl { __u8 rsvd270[242]; __u8 sqes; __u8 cqes; - __u8 rsvd514[2]; + __le16 maxcmd; __le32 nn; __le16 oncs; __le16 fuses; @@ -145,7 +218,15 @@ struct nvme_id_ctrl { __le16 acwu; __u8 rsvd534[2]; __le32 sgls; - __u8 rsvd540[1508]; + __u8 rsvd540[228]; + char subnqn[256]; + __u8 rsvd1024[768]; + __le32 ioccsz; + __le32 iorcsz; + __le16 icdoff; + __u8 ctrattr; + __u8 msdbd; + __u8 rsvd1804[244]; struct nvme_id_power_state psd[32]; __u8 vs[1024]; }; @@ -307,6 +388,61 @@ enum nvme_opcode { }; /* + * Descriptor subtype - lower 4 bits of nvme_(keyed_)sgl_desc identifier + * + * @NVME_SGL_FMT_ADDRESS: absolute address of the data block + * @NVME_SGL_FMT_OFFSET: relative offset of the in-capsule data block + * @NVME_SGL_FMT_INVALIDATE: RDMA transport specific remote invalidation + * request subtype + */ +enum { + NVME_SGL_FMT_ADDRESS = 0x00, + NVME_SGL_FMT_OFFSET = 0x01, + NVME_SGL_FMT_INVALIDATE = 0x0f, +}; + +/* + * Descriptor type - upper 4 bits of nvme_(keyed_)sgl_desc identifier + * + * For struct nvme_sgl_desc: + * @NVME_SGL_FMT_DATA_DESC: data block descriptor + * @NVME_SGL_FMT_SEG_DESC: sgl segment descriptor + * @NVME_SGL_FMT_LAST_SEG_DESC: last sgl segment descriptor + * + * For struct nvme_keyed_sgl_desc: + * @NVME_KEY_SGL_FMT_DATA_DESC: keyed data block descriptor + */ +enum { + NVME_SGL_FMT_DATA_DESC = 0x00, + NVME_SGL_FMT_SEG_DESC = 0x02, + NVME_SGL_FMT_LAST_SEG_DESC = 0x03, + NVME_KEY_SGL_FMT_DATA_DESC = 0x04, +}; + +struct nvme_sgl_desc { + __le64 addr; + __le32 length; + __u8 rsvd[3]; + __u8 type; +}; + +struct nvme_keyed_sgl_desc { + __le64 addr; + __u8 length[3]; + __u8 key[4]; + __u8 type; +}; + +union nvme_data_ptr { + struct { + __le64 prp1; + __le64 prp2; + }; + struct nvme_sgl_desc sgl; + struct nvme_keyed_sgl_desc ksgl; +}; + +/* * Lowest two bits of our flags field (FUSE field in the spec): * * @NVME_CMD_FUSE_FIRST: Fused Operation, first command @@ -336,8 +472,7 @@ struct nvme_common_command { __le32 nsid; __le32 cdw2[2]; __le64 metadata; - __le64 prp1; - __le64 prp2; + union nvme_data_ptr dptr; __le32 cdw10[6]; }; @@ -348,8 +483,7 @@ struct nvme_rw_command { __le32 nsid; __u64 rsvd2; __le64 metadata; - __le64 prp1; - __le64 prp2; + union nvme_data_ptr dptr; __le64 slba; __le16 length; __le16 control; @@ -389,8 +523,7 @@ struct nvme_dsm_cmd { __u16 command_id; __le32 nsid; __u64 rsvd2[2]; - __le64 prp1; - __le64 prp2; + union nvme_data_ptr dptr; __le32 nr; __le32 attributes; __u32 rsvd12[4]; @@ -454,6 +587,7 @@ enum { NVME_LOG_ERROR = 0x01, NVME_LOG_SMART = 0x02, NVME_LOG_FW_SLOT = 0x03, + NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, NVME_FWACT_REPL = (0 << 3), NVME_FWACT_REPL_ACTV = (1 << 3), @@ -466,8 +600,7 @@ struct nvme_identify { __u16 command_id; __le32 nsid; __u64 rsvd2[2]; - __le64 prp1; - __le64 prp2; + union nvme_data_ptr dptr; __le32 cns; __u32 rsvd11[5]; }; @@ -478,8 +611,7 @@ struct nvme_features { __u16 command_id; __le32 nsid; __u64 rsvd2[2]; - __le64 prp1; - __le64 prp2; + union nvme_data_ptr dptr; __le32 fid; __le32 dword11; __u32 rsvd12[4]; @@ -538,8 +670,7 @@ struct nvme_download_firmware { __u8 flags; __u16 command_id; __u32 rsvd1[5]; - __le64 prp1; - __le64 prp2; + union nvme_data_ptr dptr; __le32 numd; __le32 offset; __u32 rsvd12[4]; @@ -561,8 +692,7 @@ struct nvme_get_log_page_command { __u16 command_id; __le32 nsid; __u64 rsvd2[2]; - __le64 prp1; - __le64 prp2; + union nvme_data_ptr dptr; __u8 lid; __u8 rsvd10; __le16 numdl; @@ -573,6 +703,126 @@ struct nvme_get_log_page_command { __u32 rsvd14[2]; }; +/* + * Fabrics subcommands. + */ +enum nvmf_fabrics_opcode { + nvme_fabrics_command = 0x7f, +}; + +enum nvmf_capsule_command { + nvme_fabrics_type_property_set = 0x00, + nvme_fabrics_type_connect = 0x01, + nvme_fabrics_type_property_get = 0x04, +}; + +struct nvmf_common_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 ts[24]; +}; + +/* + * The legal cntlid range a NVMe Target will provide. + * Note that cntlid of value 0 is considered illegal in the fabrics world. + * Devices based on earlier specs did not have the subsystem concept; + * therefore, those devices had their cntlid value set to 0 as a result. + */ +#define NVME_CNTLID_MIN 1 +#define NVME_CNTLID_MAX 0xffef +#define NVME_CNTLID_DYNAMIC 0xffff + +#define MAX_DISC_LOGS 255 + +/* Discovery log page entry */ +struct nvmf_disc_rsp_page_entry { + __u8 trtype; + __u8 adrfam; + __u8 nqntype; + __u8 treq; + __le16 portid; + __le16 cntlid; + __le16 asqsz; + __u8 resv8[22]; + char trsvcid[NVMF_TRSVCID_SIZE]; + __u8 resv64[192]; + char subnqn[NVMF_NQN_FIELD_LEN]; + char traddr[NVMF_TRADDR_SIZE]; + union tsas { + char common[NVMF_TSAS_SIZE]; + struct rdma { + __u8 qptype; + __u8 prtype; + __u8 cms; + __u8 resv3[5]; + __u16 pkey; + __u8 resv10[246]; + } rdma; + } tsas; +}; + +/* Discovery log page header */ +struct nvmf_disc_rsp_page_hdr { + __le64 genctr; + __le64 numrec; + __le16 recfmt; + __u8 resv14[1006]; + struct nvmf_disc_rsp_page_entry entries[0]; +}; + +struct nvmf_connect_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[19]; + union nvme_data_ptr dptr; + __le16 recfmt; + __le16 qid; + __le16 sqsize; + __u8 cattr; + __u8 resv3; + __le32 kato; + __u8 resv4[12]; +}; + +struct nvmf_connect_data { + uuid_le hostid; + __le16 cntlid; + char resv4[238]; + char subsysnqn[NVMF_NQN_FIELD_LEN]; + char hostnqn[NVMF_NQN_FIELD_LEN]; + char resv5[256]; +}; + +struct nvmf_property_set_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 attrib; + __u8 resv3[3]; + __le32 offset; + __le64 value; + __u8 resv4[8]; +}; + +struct nvmf_property_get_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 attrib; + __u8 resv3[3]; + __le32 offset; + __u8 resv4[16]; +}; + struct nvme_command { union { struct nvme_common_command common; @@ -587,15 +837,29 @@ struct nvme_command { struct nvme_dsm_cmd dsm; struct nvme_abort_cmd abort; struct nvme_get_log_page_command get_log_page; + struct nvmf_common_command fabrics; + struct nvmf_connect_command connect; + struct nvmf_property_set_command prop_set; + struct nvmf_property_get_command prop_get; }; }; static inline bool nvme_is_write(struct nvme_command *cmd) { + /* + * What a mess... + * + * Why can't we simply have a Fabrics In and Fabrics out command? + */ + if (unlikely(cmd->common.opcode == nvme_fabrics_command)) + return cmd->fabrics.opcode & 1; return cmd->common.opcode & 1; } enum { + /* + * Generic Command Status: + */ NVME_SC_SUCCESS = 0x0, NVME_SC_INVALID_OPCODE = 0x1, NVME_SC_INVALID_FIELD = 0x2, @@ -614,10 +878,18 @@ enum { NVME_SC_SGL_INVALID_DATA = 0xf, NVME_SC_SGL_INVALID_METADATA = 0x10, NVME_SC_SGL_INVALID_TYPE = 0x11, + + NVME_SC_SGL_INVALID_OFFSET = 0x16, + NVME_SC_SGL_INVALID_SUBTYPE = 0x17, + NVME_SC_LBA_RANGE = 0x80, NVME_SC_CAP_EXCEEDED = 0x81, NVME_SC_NS_NOT_READY = 0x82, NVME_SC_RESERVATION_CONFLICT = 0x83, + + /* + * Command Specific Status: + */ NVME_SC_CQ_INVALID = 0x100, NVME_SC_QID_INVALID = 0x101, NVME_SC_QUEUE_SIZE = 0x102, @@ -635,9 +907,29 @@ enum { NVME_SC_FEATURE_NOT_CHANGEABLE = 0x10e, NVME_SC_FEATURE_NOT_PER_NS = 0x10f, NVME_SC_FW_NEEDS_RESET_SUBSYS = 0x110, + + /* + * I/O Command Set Specific - NVM commands: + */ NVME_SC_BAD_ATTRIBUTES = 0x180, NVME_SC_INVALID_PI = 0x181, NVME_SC_READ_ONLY = 0x182, + + /* + * I/O Command Set Specific - Fabrics commands: + */ + NVME_SC_CONNECT_FORMAT = 0x180, + NVME_SC_CONNECT_CTRL_BUSY = 0x181, + NVME_SC_CONNECT_INVALID_PARAM = 0x182, + NVME_SC_CONNECT_RESTART_DISC = 0x183, + NVME_SC_CONNECT_INVALID_HOST = 0x184, + + NVME_SC_DISCOVERY_RESTART = 0x190, + NVME_SC_AUTH_REQUIRED = 0x191, + + /* + * Media and Data Integrity Errors: + */ NVME_SC_WRITE_FAULT = 0x280, NVME_SC_READ_ERROR = 0x281, NVME_SC_GUARD_CHECK = 0x282, @@ -645,12 +937,19 @@ enum { NVME_SC_REFTAG_CHECK = 0x284, NVME_SC_COMPARE_FAILED = 0x285, NVME_SC_ACCESS_DENIED = 0x286, + NVME_SC_DNR = 0x4000, }; struct nvme_completion { - __le32 result; /* Used by admin commands to return data */ - __u32 rsvd; + /* + * Used by Admin and Fabrics commands to return data: + */ + union { + __le16 result16; + __le32 result; + __le64 result64; + }; __le16 sq_head; /* how much of this queue may be reclaimed */ __le16 sq_id; /* submission queue that generated this entry */ __u16 command_id; /* of the command which completed */ -- cgit v1.1 From 7b89eae29eec4dd9259acd82ed57bec8d9e430ca Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 13 Jun 2016 16:45:27 +0200 Subject: nvme.h: Add keep-alive opcode and identify controller attribute KAS: keep-alive support and granularity of kato in units of 100 ms nvme_admin_keep_alive opcode: 0x18 Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/nvme.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 7525030..d8b37ba 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -202,7 +202,9 @@ struct nvme_id_ctrl { __u8 apsta; __le16 wctemp; __le16 cctemp; - __u8 rsvd270[242]; + __u8 rsvd270[50]; + __le16 kas; + __u8 rsvd322[190]; __u8 sqes; __u8 cqes; __le16 maxcmd; @@ -556,6 +558,7 @@ enum nvme_admin_opcode { nvme_admin_async_event = 0x0c, nvme_admin_activate_fw = 0x10, nvme_admin_download_fw = 0x11, + nvme_admin_keep_alive = 0x18, nvme_admin_format_nvm = 0x80, nvme_admin_security_send = 0x81, nvme_admin_security_recv = 0x82, @@ -580,6 +583,7 @@ enum { NVME_FEAT_WRITE_ATOMIC = 0x0a, NVME_FEAT_ASYNC_EVENT = 0x0b, NVME_FEAT_AUTO_PST = 0x0c, + NVME_FEAT_KATO = 0x0f, NVME_FEAT_SW_PROGRESS = 0x80, NVME_FEAT_HOST_ID = 0x81, NVME_FEAT_RESV_MASK = 0x82, -- cgit v1.1 From 529435e8188674fa1071235f50e6fbbbe020c93f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Thu, 7 Jul 2016 09:54:08 +0200 Subject: lightnvm: add media manager mark_blk helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose media manager mark_blk() to targets, as done for the rest of the media manager callback functions. Signed-off-by: Javier González Updated description Signed-off-by: Matias Bjørling Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index ef2c7d2..9c56148 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -532,6 +532,8 @@ extern int nvm_register(struct request_queue *, char *, struct nvm_dev_ops *); extern void nvm_unregister(char *); +void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type); + extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *); extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *); extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); -- cgit v1.1 From 5389a1dfb39786df08d4f6a482bd2734b1b50e33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Thu, 7 Jul 2016 09:54:09 +0200 Subject: lightnvm: initialize ppa_addr in dev_to_generic_addr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ->reserved bit is not initialized when allocated on stack. This may lead targets to misinterpret the PPA as cached. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 9c56148..cee4c8d 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -385,6 +385,7 @@ static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev, { struct ppa_addr l; + l.ppa = 0; /* * (r.ppa << X offset) & X len bitmask. X eq. blk, pg, etc. */ -- cgit v1.1 From 077d2389994197277f4f7662b13d11b2f67646a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Thu, 7 Jul 2016 09:54:14 +0200 Subject: lightnvm: remove open/close statistics for gennvm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The responsibility of the media manager is not to keep track of open/closed blocks. This is better maintained within a target, that already manages this information on writes. Remove the statistics and merge the states NVM_BLK_ST_OPEN and NVM_BLK_ST_CLOSED. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index cee4c8d..8b51d57 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -269,24 +269,15 @@ struct nvm_lun { int lun_id; int chnl_id; - /* It is up to the target to mark blocks as closed. If the target does - * not do it, all blocks are marked as open, and nr_open_blocks - * represents the number of blocks in use - */ - unsigned int nr_open_blocks; /* Number of used, writable blocks */ - unsigned int nr_closed_blocks; /* Number of used, read-only blocks */ - unsigned int nr_free_blocks; /* Number of unused blocks */ - unsigned int nr_bad_blocks; /* Number of bad blocks */ - spinlock_t lock; + unsigned int nr_free_blocks; /* Number of unused blocks */ struct nvm_block *blocks; }; enum { NVM_BLK_ST_FREE = 0x1, /* Free block */ - NVM_BLK_ST_OPEN = 0x2, /* Open block - read-write */ - NVM_BLK_ST_CLOSED = 0x4, /* Closed block - read-only */ + NVM_BLK_ST_TGT = 0x2, /* Block in use by target */ NVM_BLK_ST_BAD = 0x8, /* Bad block */ }; -- cgit v1.1 From b76eb20bb0c6f4f9c2e1ad493f73c52817b8aaff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Thu, 7 Jul 2016 09:54:16 +0200 Subject: lightnvm: move target mgmt into media mgr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To enable persistent block management to easily control creation and removal of targets, we move target management into the media manager. The LightNVM core continues to maintain which target types are registered, while the media manager now keeps track of its initialized targets. Two new callbacks for the media manager are introduced. create_tgt and remove_tgt. Note that remove_tgt returns 0 on successfully removing a target, and returns 1 if the target was not found. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 8b51d57..d619f6d 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -1,7 +1,9 @@ #ifndef NVM_H #define NVM_H +#include #include +#include enum { NVM_IO_OK = 0, @@ -447,6 +449,8 @@ struct nvm_tgt_type { struct list_head list; }; +extern struct nvm_tgt_type *nvm_find_target_type(const char *, int); + extern int nvm_register_tgt_type(struct nvm_tgt_type *); extern void nvm_unregister_tgt_type(struct nvm_tgt_type *); @@ -455,6 +459,9 @@ extern void nvm_dev_dma_free(struct nvm_dev *, void *, dma_addr_t); typedef int (nvmm_register_fn)(struct nvm_dev *); typedef void (nvmm_unregister_fn)(struct nvm_dev *); + +typedef int (nvmm_create_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_create *); +typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *); typedef struct nvm_block *(nvmm_get_blk_fn)(struct nvm_dev *, struct nvm_lun *, unsigned long); typedef void (nvmm_put_blk_fn)(struct nvm_dev *, struct nvm_block *); @@ -480,6 +487,9 @@ struct nvmm_type { nvmm_register_fn *register_mgr; nvmm_unregister_fn *unregister_mgr; + nvmm_create_tgt_fn *create_tgt; + nvmm_remove_tgt_fn *remove_tgt; + /* Block administration callbacks */ nvmm_get_blk_fn *get_blk_unlocked; nvmm_put_blk_fn *put_blk_unlocked; -- cgit v1.1 From 41285fad511a2c3746bee7ccb3b2f21b70305c14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Thu, 7 Jul 2016 09:54:19 +0200 Subject: lightnvm: remove _unlocked variant of [get/put]_blk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The [get/put]_blk API enables targets to get ownership of blocks at runtime. This information is currently not recorded on disk, and the information is therefore lost on power failure. To restore the metadata, the [get/put]_blk must persist its metadata. In that case, we need to control the outer lock, so that we can disable them while updating the on-disk metadata. Fortunately, the _unlocked versions can be removed, which allows us to move the lock into the [get/put]_blk functions. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index d619f6d..e9836cf 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -491,8 +491,6 @@ struct nvmm_type { nvmm_remove_tgt_fn *remove_tgt; /* Block administration callbacks */ - nvmm_get_blk_fn *get_blk_unlocked; - nvmm_put_blk_fn *put_blk_unlocked; nvmm_get_blk_fn *get_blk; nvmm_put_blk_fn *put_blk; nvmm_open_blk_fn *open_blk; @@ -522,10 +520,6 @@ struct nvmm_type { extern int nvm_register_mgr(struct nvmm_type *); extern void nvm_unregister_mgr(struct nvmm_type *); -extern struct nvm_block *nvm_get_blk_unlocked(struct nvm_dev *, - struct nvm_lun *, unsigned long); -extern void nvm_put_blk_unlocked(struct nvm_dev *, struct nvm_block *); - extern struct nvm_block *nvm_get_blk(struct nvm_dev *, struct nvm_lun *, unsigned long); extern void nvm_put_blk(struct nvm_dev *, struct nvm_block *); -- cgit v1.1 From 8680f165eaa56cc9812d7d9b4903f449df0f6a45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Thu, 7 Jul 2016 09:54:22 +0200 Subject: lightnvm: make ppa_list const in nvm_set_rqd_list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The passed by reference ppa list in nvm_set_rqd_list() is updated when multiple planes are available. In that case, each PPA plane is incremented when the device side PPA list is created. This prevents the caller to rely on the PPA list to be unmodified after a call. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index e9836cf..ba78b83 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -534,7 +534,7 @@ extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *); extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *); extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, - struct ppa_addr *, int, int); + const struct ppa_addr *, int, int); extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int); extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *); -- cgit v1.1 From 486cf9899e311838b6ab95d19ff87c4da44d6508 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 6 Jul 2016 21:55:48 +0900 Subject: blk-mq: Introduce blk_mq_reinit_tagset The new nvme-rdma driver will need to reinitialize all the tags as part of the error recovery procedure (realloc the tag memory region). Add a helper in blk-mq for it that can iterate over all requests in a tagset to make this easier. Signed-off-by: Sagi Grimberg Tested-by: Ming Lin Reviewed-by: Stephen Bates Signed-off-by: Christoph Hellwig Reviewed-by: Steve Wise Tested-by: Steve Wise Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index cbfd8ca..e43bbff 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -96,6 +96,7 @@ typedef int (init_request_fn)(void *, struct request *, unsigned int, unsigned int, unsigned int); typedef void (exit_request_fn)(void *, struct request *, unsigned int, unsigned int); +typedef int (reinit_request_fn)(void *, struct request *); typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, bool); @@ -145,6 +146,7 @@ struct blk_mq_ops { */ init_request_fn *init_request; exit_request_fn *exit_request; + reinit_request_fn *reinit_request; }; enum { @@ -245,6 +247,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); void blk_mq_freeze_queue_start(struct request_queue *q); +int blk_mq_reinit_tagset(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); -- cgit v1.1 From 931a6de4d734de6142142c82555e3f017adcb9f0 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 6 Jul 2016 21:55:50 +0900 Subject: nvme-rdma.h: Add includes for nvme rdma_cm negotiation NVMe over Fabrics RDMA transport defines a connection establishment protocol over the RDMA connection manager. This header will be used by both the host and target drivers to negotiate the connection establishment parameters. Signed-off-by: Jay Freyensee Signed-off-by: Ming Lin Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Reviewed-by: Steve Wise Tested-by: Steve Wise Signed-off-by: Jens Axboe --- include/linux/nvme-rdma.h | 71 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 include/linux/nvme-rdma.h (limited to 'include') diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h new file mode 100644 index 0000000..bf240a3 --- /dev/null +++ b/include/linux/nvme-rdma.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _LINUX_NVME_RDMA_H +#define _LINUX_NVME_RDMA_H + +enum nvme_rdma_cm_fmt { + NVME_RDMA_CM_FMT_1_0 = 0x0, +}; + +enum nvme_rdma_cm_status { + NVME_RDMA_CM_INVALID_LEN = 0x01, + NVME_RDMA_CM_INVALID_RECFMT = 0x02, + NVME_RDMA_CM_INVALID_QID = 0x03, + NVME_RDMA_CM_INVALID_HSQSIZE = 0x04, + NVME_RDMA_CM_INVALID_HRQSIZE = 0x05, + NVME_RDMA_CM_NO_RSC = 0x06, + NVME_RDMA_CM_INVALID_IRD = 0x07, + NVME_RDMA_CM_INVALID_ORD = 0x08, +}; + +/** + * struct nvme_rdma_cm_req - rdma connect request + * + * @recfmt: format of the RDMA Private Data + * @qid: queue Identifier for the Admin or I/O Queue + * @hrqsize: host receive queue size to be created + * @hsqsize: host send queue size to be created + */ +struct nvme_rdma_cm_req { + __le16 recfmt; + __le16 qid; + __le16 hrqsize; + __le16 hsqsize; + u8 rsvd[24]; +}; + +/** + * struct nvme_rdma_cm_rep - rdma connect reply + * + * @recfmt: format of the RDMA Private Data + * @crqsize: controller receive queue size + */ +struct nvme_rdma_cm_rep { + __le16 recfmt; + __le16 crqsize; + u8 rsvd[28]; +}; + +/** + * struct nvme_rdma_cm_rej - rdma connect reject + * + * @recfmt: format of the RDMA Private Data + * @fsts: error status for the associated connect request + */ +struct nvme_rdma_cm_rej { + __le16 recfmt; + __le16 sts; +}; + +#endif /* _LINUX_NVME_RDMA_H */ -- cgit v1.1 From e950fdf71c9b4a6b63b58fed78956a96cc907402 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Jul 2016 11:23:33 +0200 Subject: block: introduce BLKDEV_DISCARD_ZERO to fix zeroout Currently blkdev_issue_zeroout cascades down from discards (if the driver guarantees that discards zero data), to WRITE SAME and then to a loop writing zeroes. Unfortunately we ignore run-time EOPNOTSUPP errors in the block layer blkdev_issue_discard helper to work around DM volumes that may have mixed discard support underneath. This patch intoroduces a new BLKDEV_DISCARD_ZERO flag to blkdev_issue_discard that indicates we are called for zeroing operation. This allows both to ignore the EOPNOTSUPP hack and actually consolidating the discard_zeroes_data check into the function. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 53fee61..156455c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1137,7 +1137,9 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, return bqt->tag_index[tag]; } -#define BLKDEV_DISCARD_SECURE 0x01 /* secure discard */ + +#define BLKDEV_DISCARD_SECURE (1 << 0) /* issue a secure erase */ +#define BLKDEV_DISCARD_ZERO (1 << 1) /* must reliably zero data */ extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, -- cgit v1.1 From 70246286e94c335b5bea0cbc68a17a96dd620281 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Jul 2016 11:28:41 +0200 Subject: block: get rid of bio_rw and READA These two are confusing leftover of the old world order, combining values of the REQ_OP_ and REQ_ namespaces. For callers that don't special case we mostly just replace bi_rw with bio_data_dir or op_is_write, except for the few cases where a switch over the REQ_OP_ values makes more sense. Any check for READA is replaced with an explicit check for REQ_RAHEAD. Also remove the READA alias for REQ_RAHEAD. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Mike Christie Signed-off-by: Jens Axboe --- include/linux/fs.h | 18 +----------------- include/trace/events/f2fs.h | 4 ++-- 2 files changed, 3 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1830245..dc48866 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -178,9 +178,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, * READ_SYNC A synchronous read. Device is not plugged, caller can * immediately wait on this read without caring about * unplugging. - * READA Used for read-ahead operations. Lower priority, and the - * block layer could (in theory) choose to ignore this - * request if it runs into resource problems. * WRITE A normal async write. Device will be plugged. * WRITE_SYNC Synchronous write. Identical to WRITE, but passes down * the hint that someone will be waiting on this IO @@ -195,11 +192,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, * */ #define RW_MASK REQ_OP_WRITE -#define RWA_MASK REQ_RAHEAD #define READ REQ_OP_READ -#define WRITE RW_MASK -#define READA RWA_MASK +#define WRITE REQ_OP_WRITE #define READ_SYNC REQ_SYNC #define WRITE_SYNC (REQ_SYNC | REQ_NOIDLE) @@ -2471,17 +2466,6 @@ static inline bool op_is_write(unsigned int op) } /* - * return READ, READA, or WRITE - */ -static inline int bio_rw(struct bio *bio) -{ - if (op_is_write(bio_op(bio))) - return WRITE; - - return bio->bi_rw & RWA_MASK; -} - -/* * return data direction, READ or WRITE */ static inline int bio_data_dir(struct bio *bio) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 878963a..ff95fd0 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -55,7 +55,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) -#define F2FS_BIO_FLAG_MASK(t) (t & (READA | WRITE_FLUSH_FUA)) +#define F2FS_BIO_FLAG_MASK(t) (t & (REQ_RAHEAD | WRITE_FLUSH_FUA)) #define F2FS_BIO_EXTRA_MASK(t) (t & (REQ_META | REQ_PRIO)) #define show_bio_type(op, op_flags) show_bio_op(op), \ @@ -68,7 +68,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD); #define show_bio_op_flags(flags) \ __print_symbolic(F2FS_BIO_FLAG_MASK(flags), \ - { READA, "READAHEAD" }, \ + { REQ_RAHEAD, "READAHEAD" }, \ { READ_SYNC, "READ_SYNC" }, \ { WRITE_SYNC, "WRITE_SYNC" }, \ { WRITE_FLUSH, "WRITE_FLUSH" }, \ -- cgit v1.1 From ed996a52c868b62c4e5bf529cb4ccb44bcfa2f8e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Jul 2016 11:28:42 +0200 Subject: block: simplify and cleanup bvec pool handling Instead of a flag and an index just make sure an index of 0 means no need to free the bvec array. Also move the constants related to the bvec pools together and use a consistent naming scheme for them. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Mike Christie Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 -- include/linux/blk_types.h | 22 ++++++++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 0bbb2e3..141cfa9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -715,8 +715,6 @@ static inline void bio_inc_remaining(struct bio *bio) * and the bvec_slabs[]. */ #define BIO_POOL_SIZE 2 -#define BIOVEC_NR_POOLS 6 -#define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1) struct bio_set { struct kmem_cache *bio_slab; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index efba1f2..b76463e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -134,19 +134,25 @@ struct bio { /* * Flags starting here get preserved by bio_reset() - this includes - * BIO_POOL_IDX() + * BVEC_POOL_IDX() */ #define BIO_RESET_BITS 13 -#define BIO_OWNS_VEC 13 /* bio_free() should free bvec */ /* - * top 4 bits of bio flags indicate the pool this bio came from + * We support 6 different bvec pools, the last one is magic in that it + * is backed by a mempool. */ -#define BIO_POOL_BITS (4) -#define BIO_POOL_NONE ((1UL << BIO_POOL_BITS) - 1) -#define BIO_POOL_OFFSET (32 - BIO_POOL_BITS) -#define BIO_POOL_MASK (1UL << BIO_POOL_OFFSET) -#define BIO_POOL_IDX(bio) ((bio)->bi_flags >> BIO_POOL_OFFSET) +#define BVEC_POOL_NR 6 +#define BVEC_POOL_MAX (BVEC_POOL_NR - 1) + +/* + * Top 4 bits of bio flags indicate the pool the bvecs came from. We add + * 1 to the actual index so that 0 indicates that there are no bvecs to be + * freed. + */ +#define BVEC_POOL_BITS (4) +#define BVEC_POOL_OFFSET (32 - BVEC_POOL_BITS) +#define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET) #endif /* CONFIG_BLOCK */ -- cgit v1.1 From c0acf12a50c2b6cbaf72bdfe2b1b514c369a83c5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Jul 2016 11:28:43 +0200 Subject: block: shrink bio size again The recent ops split grew the bio by adding the new ioprio field. Shrink it again by using a 16-bit field for the bi_flags value and filling the holes near the beginning of the structure. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Mike Christie Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index b76463e..c3525c5 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -46,11 +46,11 @@ struct bvec_iter { struct bio { struct bio *bi_next; /* request queue link */ struct block_device *bi_bdev; - unsigned int bi_flags; /* status, command, etc */ int bi_error; unsigned int bi_rw; /* bottom bits req flags, * top bits REQ_OP */ + unsigned short bi_flags; /* status, command, etc */ unsigned short bi_ioprio; struct bvec_iter bi_iter; @@ -136,7 +136,7 @@ struct bio { * Flags starting here get preserved by bio_reset() - this includes * BVEC_POOL_IDX() */ -#define BIO_RESET_BITS 13 +#define BIO_RESET_BITS 10 /* * We support 6 different bvec pools, the last one is magic in that it @@ -151,7 +151,7 @@ struct bio { * freed. */ #define BVEC_POOL_BITS (4) -#define BVEC_POOL_OFFSET (32 - BVEC_POOL_BITS) +#define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS) #define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET) #endif /* CONFIG_BLOCK */ -- cgit v1.1 From 98d61d5b1a65a9df7cb3d9605f5d37d3dbbb4b5e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Jul 2016 11:31:51 +0200 Subject: block: simplify and export blk_rq_append_bio The target SCSI passthrough backend is much better served with the low-level blk_rq_append_bio construct then the helpers built on top of it, so export it. Also use the opportunity to remove the pointless request_queue argument and make the code flow a little more readable. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 156455c..706c8bf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -802,6 +802,7 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, extern void blk_rq_unprep_clone(struct request *rq); extern int blk_insert_cloned_request(struct request_queue *q, struct request *rq); +extern int blk_rq_append_bio(struct request *rq, struct bio *bio); extern void blk_delay_queue(struct request_queue *, unsigned long); extern void blk_queue_split(struct request_queue *, struct bio **, struct bio_set *); -- cgit v1.1 From 4613c5f1df92f3cb5a8f89c7dfefc37402c16bd8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Jul 2016 11:31:53 +0200 Subject: scsi/osd: open code blk_make_request I wish the OSD code could simply use blk_rq_map_* helpers like everyone else, but the complex nature of deciding if we have DATA IN and/or DATA OUT buffers might make this impossible (at least for a mere human like me). But using blk_rq_append_bio at least allows sharing the setup code between request with or without dat a buffers, and given that this is the last user of blk_make_request it allows getting rid of that somewhat awkward interface. Signed-off-by: Christoph Hellwig Acked-by: Boaz Harrosh Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 706c8bf..0cdea75 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -788,8 +788,6 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request(struct request_queue *, int, gfp_t); -extern struct request *blk_make_request(struct request_queue *, struct bio *, - gfp_t); extern void blk_rq_set_block_pc(struct request *); extern void blk_requeue_request(struct request_queue *, struct request *); extern void blk_add_request_payload(struct request *rq, struct page *page, -- cgit v1.1