diff options
47 files changed, 1504 insertions, 460 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index e23b07d..1db3152 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7680,9 +7680,7 @@ M: Ross Zwisler <ross.zwisler@linux.intel.com> L: linux-nvdimm@lists.01.org Q: https://patchwork.kernel.org/project/linux-nvdimm/list/ S: Supported -F: drivers/nvdimm/pmem.c -F: include/linux/pmem.h -F: arch/*/include/asm/pmem.h +F: drivers/nvdimm/pmem* LIGHTNVM PLATFORM SUPPORT M: Matias Bjorling <mb@lightnvm.io> diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index a7fe5fe..2799706 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -45,6 +45,7 @@ #include <linux/of_device.h> #include <linux/of_platform.h> #include <linux/pfn_t.h> +#include <linux/uio.h> #include <asm/page.h> #include <asm/prom.h> @@ -163,8 +164,15 @@ axon_ram_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pa return __axon_ram_direct_access(bank, pgoff, nr_pages, kaddr, pfn); } +static size_t axon_ram_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + return copy_from_iter(addr, bytes, i); +} + static const struct dax_operations axon_ram_dax_ops = { .direct_access = axon_ram_dax_direct_access, + .copy_from_iter = axon_ram_copy_from_iter, }; /** diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fe53a3a..7d7e0e8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -54,6 +54,7 @@ config X86 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN select ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h deleted file mode 100644 index 0ff8fe7..0000000 --- a/arch/x86/include/asm/pmem.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright(c) 2015 Intel Corporation. All rights reserved. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#ifndef __ASM_X86_PMEM_H__ -#define __ASM_X86_PMEM_H__ - -#include <linux/uaccess.h> -#include <asm/cacheflush.h> -#include <asm/cpufeature.h> -#include <asm/special_insns.h> - -#ifdef CONFIG_ARCH_HAS_PMEM_API -/** - * arch_memcpy_to_pmem - copy data to persistent memory - * @dst: destination buffer for the copy - * @src: source buffer for the copy - * @n: length of the copy in bytes - * - * Copy data to persistent memory media via non-temporal stores so that - * a subsequent pmem driver flush operation will drain posted write queues. - */ -static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n) -{ - int rem; - - /* - * We are copying between two kernel buffers, if - * __copy_from_user_inatomic_nocache() returns an error (page - * fault) we would have already reported a general protection fault - * before the WARN+BUG. - */ - rem = __copy_from_user_inatomic_nocache(dst, (void __user *) src, n); - if (WARN(rem, "%s: fault copying %p <- %p unwritten: %d\n", - __func__, dst, src, rem)) - BUG(); -} - -/** - * arch_wb_cache_pmem - write back a cache range with CLWB - * @vaddr: virtual start address - * @size: number of bytes to write back - * - * Write back a cache range using the CLWB (cache line write back) - * instruction. Note that @size is internally rounded up to be cache - * line size aligned. 
- */ -static inline void arch_wb_cache_pmem(void *addr, size_t size) -{ - u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; - unsigned long clflush_mask = x86_clflush_size - 1; - void *vend = addr + size; - void *p; - - for (p = (void *)((unsigned long)addr & ~clflush_mask); - p < vend; p += x86_clflush_size) - clwb(p); -} - -/** - * arch_copy_from_iter_pmem - copy data from an iterator to PMEM - * @addr: PMEM destination address - * @bytes: number of bytes to copy - * @i: iterator with source data - * - * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'. - */ -static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes, - struct iov_iter *i) -{ - size_t len; - - /* TODO: skip the write-back by always using non-temporal stores */ - len = copy_from_iter_nocache(addr, bytes, i); - - /* - * In the iovec case on x86_64 copy_from_iter_nocache() uses - * non-temporal stores for the bulk of the transfer, but we need - * to manually flush if the transfer is unaligned. A cached - * memory copy is used when destination or size is not naturally - * aligned. That is: - * - Require 8-byte alignment when size is 8 bytes or larger. - * - Require 4-byte alignment when size is 4 bytes. - * - * In the non-iovec case the entire destination needs to be - * flushed. 
- */ - if (iter_is_iovec(i)) { - unsigned long flushed, dest = (unsigned long) addr; - - if (bytes < 8) { - if (!IS_ALIGNED(dest, 4) || (bytes != 4)) - arch_wb_cache_pmem(addr, bytes); - } else { - if (!IS_ALIGNED(dest, 8)) { - dest = ALIGN(dest, boot_cpu_data.x86_clflush_size); - arch_wb_cache_pmem(addr, 1); - } - - flushed = dest - (unsigned long) addr; - if (bytes > flushed && !IS_ALIGNED(bytes - flushed, 8)) - arch_wb_cache_pmem(addr + bytes - 1, 1); - } - } else - arch_wb_cache_pmem(addr, bytes); - - return len; -} - -/** - * arch_clear_pmem - zero a PMEM memory range - * @addr: virtual start address - * @size: number of bytes to zero - * - * Write zeros into the memory range starting at 'addr' for 'size' bytes. - */ -static inline void arch_clear_pmem(void *addr, size_t size) -{ - memset(addr, 0, size); - arch_wb_cache_pmem(addr, size); -} - -static inline void arch_invalidate_pmem(void *addr, size_t size) -{ - clflush_cache_range(addr, size); -} -#endif /* CONFIG_ARCH_HAS_PMEM_API */ -#endif /* __ASM_X86_PMEM_H__ */ diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 733bae0..1f22bc2 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -109,6 +109,11 @@ memcpy_mcsafe(void *dst, const void *src, size_t cnt) return 0; } +#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE +#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1 +void memcpy_flushcache(void *dst, const void *src, size_t cnt); +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_STRING_64_H */ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index c5504b9..b16f6a1 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -171,6 +171,10 @@ unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigne extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest); +extern long __copy_user_flushcache(void *dst, const void __user *src, 
unsigned size); +extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset, + size_t len); + static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size) @@ -179,6 +183,13 @@ __copy_from_user_inatomic_nocache(void *dst, const void __user *src, return __copy_user_nocache(dst, src, size, 0); } +static inline int +__copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) +{ + kasan_check_write(dst, size); + return __copy_user_flushcache(dst, src, size); +} + unsigned long copy_user_handle_tail(char *to, char *from, unsigned len); diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 3b7c40a..75d3776 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -7,6 +7,7 @@ */ #include <linux/export.h> #include <linux/uaccess.h> +#include <linux/highmem.h> /* * Zero Userspace @@ -73,3 +74,136 @@ copy_user_handle_tail(char *to, char *from, unsigned len) clac(); return len; } + +#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE +/** + * clean_cache_range - write back a cache range with CLWB + * @vaddr: virtual start address + * @size: number of bytes to write back + * + * Write back a cache range using the CLWB (cache line write back) + * instruction. Note that @size is internally rounded up to be cache + * line size aligned. 
+ */
+static void clean_cache_range(void *addr, size_t size)
+{
+	u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
+	unsigned long clflush_mask = x86_clflush_size - 1;
+	void *vend = addr + size;
+	void *p;
+
+	/* start at the cache line containing @addr, step one line at a time */
+	for (p = (void *)((unsigned long)addr & ~clflush_mask);
+	     p < vend; p += x86_clflush_size)
+		clwb(p);
+}
+
+/* exported wrapper: write back [addr, addr + size) via clean_cache_range() */
+void arch_wb_cache_pmem(void *addr, size_t size)
+{
+	clean_cache_range(addr, size);
+}
+EXPORT_SYMBOL_GPL(arch_wb_cache_pmem);
+
+/*
+ * Copy @size bytes from @src to @dst with __copy_user_nocache(), then
+ * write back the cache lines that copy may have dirtied with regular
+ * (cached) stores. Returns the __copy_user_nocache() result unchanged.
+ */
+long __copy_user_flushcache(void *dst, const void __user *src, unsigned size)
+{
+	unsigned long flushed, dest = (unsigned long) dst;
+	long rc = __copy_user_nocache(dst, src, size, 0);
+
+	/*
+	 * __copy_user_nocache() uses non-temporal stores for the bulk
+	 * of the transfer, but we need to manually flush if the
+	 * transfer is unaligned. A cached memory copy is used when
+	 * destination or size is not naturally aligned. That is:
+	 *  - Require 8-byte alignment when size is 8 bytes or larger.
+	 *  - Require 4-byte alignment when size is 4 bytes.
+	 */
+	if (size < 8) {
+		/*
+		 * Flush the whole range, not just its first byte: an
+		 * unaligned copy shorter than 8 bytes can still straddle
+		 * a cache line boundary, and flushing a single byte would
+		 * leave the second line dirty.
+		 */
+		if (!IS_ALIGNED(dest, 4) || size != 4)
+			clean_cache_range(dst, size);
+	} else {
+		/* unaligned head: lies within a single 8-byte slot */
+		if (!IS_ALIGNED(dest, 8)) {
+			dest = ALIGN(dest, boot_cpu_data.x86_clflush_size);
+			clean_cache_range(dst, 1);
+		}
+
+		/* unaligned tail: flush the line holding the last byte */
+		flushed = dest - (unsigned long) dst;
+		if (size > flushed && !IS_ALIGNED(size - flushed, 8))
+			clean_cache_range(dst + size - 1, 1);
+	}
+
+	return rc;
+}
+
+/*
+ * Kernel-to-kernel copy of @size bytes from @_src to @_dst. The bulk is
+ * moved with movnti stores in 32-, 8- and 4-byte steps; the unaligned
+ * head and any remaining tail bytes use memcpy() followed by
+ * clean_cache_range().
+ */
+void memcpy_flushcache(void *_dst, const void *_src, size_t size)
+{
+	unsigned long dest = (unsigned long) _dst;
+	unsigned long source = (unsigned long) _src;
+
+	/* cache copy and flush to align dest */
+	if (!IS_ALIGNED(dest, 8)) {
+		/* size_t, not unsigned: do not truncate @size to 32 bits */
+		size_t len = min_t(size_t, size, ALIGN(dest, 8) - dest);
+
+		memcpy((void *) dest, (void *) source, len);
+		clean_cache_range((void *) dest, len);
+		dest += len;
+		source += len;
+		size -= len;
+		if (!size)
+			return;
+	}
+
+	/* 4x8 movnti loop */
+	while (size >= 32) {
+		asm("movq    (%0), %%r8\n"
+		    "movq   8(%0), %%r9\n"
+		    "movq  16(%0), %%r10\n"
+		    "movq  24(%0), %%r11\n"
+		    "movnti  %%r8,   (%1)\n"
+		    "movnti  %%r9,  8(%1)\n"
+		    "movnti %%r10, 16(%1)\n"
+		    "movnti %%r11, 24(%1)\n"
+		    :: "r" (source), "r" (dest)
+		    : "memory", "r8", "r9", "r10", "r11");
+		dest += 32;
+		source += 32;
+		size -= 32;
+	}
+
+	/* 1x8 movnti loop */
+	while (size >= 8) {
+		asm("movq    (%0), %%r8\n"
+		    "movnti  %%r8,   (%1)\n"
+		    :: "r" (source), "r" (dest)
+		    : "memory", "r8");
+		dest += 8;
+		source += 8;
+		size -= 8;
+	}
+
+	/* 1x4 movnti loop */
+	while (size >= 4) {
+		asm("movl    (%0), %%r8d\n"
+		    "movnti  %%r8d,   (%1)\n"
+		    :: "r" (source), "r" (dest)
+		    : "memory", "r8");
+		dest += 4;
+		source += 4;
+		size -= 4;
+	}
+
+	/* cache copy for remaining bytes */
+	if (size) {
+		memcpy((void *) dest, (void *) source, size);
+		clean_cache_range((void *) dest, size);
+	}
+}
+EXPORT_SYMBOL_GPL(memcpy_flushcache);
+
+/*
+ * memcpy_flushcache() @len bytes into @to from @offset within @page,
+ * mapping the page with kmap_atomic() for the duration of the copy.
+ */
+void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
+		size_t len)
+{
+	char *from = kmap_atomic(page);
+
+	memcpy_flushcache(to, from + offset, len);
+	kunmap_atomic(from);
+}
+#endif diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index c8520b2..757b0bc 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -150,6 +150,12 @@ void clflush_cache_range(void *vaddr, unsigned int size) } EXPORT_SYMBOL_GPL(clflush_cache_range); +void arch_invalidate_pmem(void *addr, size_t size) +{ + clflush_cache_range(addr, size); +} +EXPORT_SYMBOL_GPL(arch_invalidate_pmem); + static void __cpa_flush_all(void *arg) { unsigned long cache = (unsigned long)arg; diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 097eff0..b75b734 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -20,7 +20,6 @@ #include <linux/list.h> #include <linux/acpi.h> #include <linux/sort.h> -#include <linux/pmem.h> #include <linux/io.h> #include <linux/nd.h> #include <asm/cacheflush.h> @@ -253,6 +252,8 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, cmd_name = nvdimm_bus_cmd_name(cmd); cmd_mask = nd_desc->cmd_mask; dsm_mask = cmd_mask; + if (cmd == ND_CMD_CALL) + dsm_mask = nd_desc->bus_dsm_mask; desc = nd_cmd_bus_desc(cmd); guid = to_nfit_uuid(NFIT_DEV_BUS); handle = adev->handle; @@ -927,6 +928,17 @@ static int nfit_mem_init(struct acpi_nfit_desc *acpi_desc) return 0; } +static ssize_t bus_dsm_mask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); + + return sprintf(buf, "%#lx\n", nd_desc->bus_dsm_mask); +} +static struct device_attribute dev_attr_bus_dsm_mask = + __ATTR(dsm_mask, 0444, bus_dsm_mask_show, NULL); + static ssize_t revision_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1031,7 +1043,7 @@ static ssize_t scrub_store(struct device *dev, if (nd_desc) { struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); - rc = acpi_nfit_ars_rescan(acpi_desc); + rc = acpi_nfit_ars_rescan(acpi_desc, 0); } 
device_unlock(dev); if (rc) @@ -1063,10 +1075,11 @@ static struct attribute *acpi_nfit_attributes[] = { &dev_attr_revision.attr, &dev_attr_scrub.attr, &dev_attr_hw_error_scrub.attr, + &dev_attr_bus_dsm_mask.attr, NULL, }; -static struct attribute_group acpi_nfit_attribute_group = { +static const struct attribute_group acpi_nfit_attribute_group = { .name = "nfit", .attrs = acpi_nfit_attributes, .is_visible = nfit_visible, @@ -1346,7 +1359,7 @@ static umode_t acpi_nfit_dimm_attr_visible(struct kobject *kobj, return a->mode; } -static struct attribute_group acpi_nfit_dimm_attribute_group = { +static const struct attribute_group acpi_nfit_dimm_attribute_group = { .name = "nfit", .attrs = acpi_nfit_dimm_attributes, .is_visible = acpi_nfit_dimm_attr_visible, @@ -1608,11 +1621,23 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) acpi_desc); } +/* + * These constants are private because there are no kernel consumers of + * these commands. + */ +enum nfit_aux_cmds { + NFIT_CMD_TRANSLATE_SPA = 5, + NFIT_CMD_ARS_INJECT_SET = 7, + NFIT_CMD_ARS_INJECT_CLEAR = 8, + NFIT_CMD_ARS_INJECT_GET = 9, +}; + static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) { struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; const guid_t *guid = to_nfit_uuid(NFIT_DEV_BUS); struct acpi_device *adev; + unsigned long dsm_mask; int i; nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en; @@ -1623,6 +1648,20 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) for (i = ND_CMD_ARS_CAP; i <= ND_CMD_CLEAR_ERROR; i++) if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i)) set_bit(i, &nd_desc->cmd_mask); + set_bit(ND_CMD_CALL, &nd_desc->cmd_mask); + + dsm_mask = + (1 << ND_CMD_ARS_CAP) | + (1 << ND_CMD_ARS_START) | + (1 << ND_CMD_ARS_STATUS) | + (1 << ND_CMD_CLEAR_ERROR) | + (1 << NFIT_CMD_TRANSLATE_SPA) | + (1 << NFIT_CMD_ARS_INJECT_SET) | + (1 << NFIT_CMD_ARS_INJECT_CLEAR) | + (1 << NFIT_CMD_ARS_INJECT_GET); + for_each_set_bit(i, &dsm_mask, 
BITS_PER_LONG) + if (acpi_check_dsm(adev->handle, guid, 1, 1ULL << i)) + set_bit(i, &nd_desc->bus_dsm_mask); } static ssize_t range_index_show(struct device *dev, @@ -1640,7 +1679,7 @@ static struct attribute *acpi_nfit_region_attributes[] = { NULL, }; -static struct attribute_group acpi_nfit_region_attribute_group = { +static const struct attribute_group acpi_nfit_region_attribute_group = { .name = "nfit", .attrs = acpi_nfit_region_attributes, }; @@ -1663,12 +1702,29 @@ struct nfit_set_info { } mapping[0]; }; +struct nfit_set_info2 { + struct nfit_set_info_map2 { + u64 region_offset; + u32 serial_number; + u16 vendor_id; + u16 manufacturing_date; + u8 manufacturing_location; + u8 reserved[31]; + } mapping[0]; +}; + static size_t sizeof_nfit_set_info(int num_mappings) { return sizeof(struct nfit_set_info) + num_mappings * sizeof(struct nfit_set_info_map); } +static size_t sizeof_nfit_set_info2(int num_mappings) +{ + return sizeof(struct nfit_set_info2) + + num_mappings * sizeof(struct nfit_set_info_map2); +} + static int cmp_map_compat(const void *m0, const void *m1) { const struct nfit_set_info_map *map0 = m0; @@ -1690,6 +1746,18 @@ static int cmp_map(const void *m0, const void *m1) return 0; } +static int cmp_map2(const void *m0, const void *m1) +{ + const struct nfit_set_info_map2 *map0 = m0; + const struct nfit_set_info_map2 *map1 = m1; + + if (map0->region_offset < map1->region_offset) + return -1; + else if (map0->region_offset > map1->region_offset) + return 1; + return 0; +} + /* Retrieve the nth entry referencing this spa */ static struct acpi_nfit_memory_map *memdev_from_spa( struct acpi_nfit_desc *acpi_desc, u16 range_index, int n) @@ -1707,27 +1775,31 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc, struct nd_region_desc *ndr_desc, struct acpi_nfit_system_address *spa) { - int i, spa_type = nfit_spa_type(spa); struct device *dev = acpi_desc->dev; struct nd_interleave_set *nd_set; u16 nr = ndr_desc->num_mappings; + struct 
nfit_set_info2 *info2; struct nfit_set_info *info; - - if (spa_type == NFIT_SPA_PM || spa_type == NFIT_SPA_VOLATILE) - /* pass */; - else - return 0; + int i; nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL); if (!nd_set) return -ENOMEM; + ndr_desc->nd_set = nd_set; + guid_copy(&nd_set->type_guid, (guid_t *) spa->range_guid); info = devm_kzalloc(dev, sizeof_nfit_set_info(nr), GFP_KERNEL); if (!info) return -ENOMEM; + + info2 = devm_kzalloc(dev, sizeof_nfit_set_info2(nr), GFP_KERNEL); + if (!info2) + return -ENOMEM; + for (i = 0; i < nr; i++) { struct nd_mapping_desc *mapping = &ndr_desc->mapping[i]; struct nfit_set_info_map *map = &info->mapping[i]; + struct nfit_set_info_map2 *map2 = &info2->mapping[i]; struct nvdimm *nvdimm = mapping->nvdimm; struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); struct acpi_nfit_memory_map *memdev = memdev_from_spa(acpi_desc, @@ -1740,19 +1812,32 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc, map->region_offset = memdev->region_offset; map->serial_number = nfit_mem->dcr->serial_number; + + map2->region_offset = memdev->region_offset; + map2->serial_number = nfit_mem->dcr->serial_number; + map2->vendor_id = nfit_mem->dcr->vendor_id; + map2->manufacturing_date = nfit_mem->dcr->manufacturing_date; + map2->manufacturing_location = nfit_mem->dcr->manufacturing_location; } + /* v1.1 namespaces */ sort(&info->mapping[0], nr, sizeof(struct nfit_set_info_map), cmp_map, NULL); - nd_set->cookie = nd_fletcher64(info, sizeof_nfit_set_info(nr), 0); + nd_set->cookie1 = nd_fletcher64(info, sizeof_nfit_set_info(nr), 0); + + /* v1.2 namespaces */ + sort(&info2->mapping[0], nr, sizeof(struct nfit_set_info_map2), + cmp_map2, NULL); + nd_set->cookie2 = nd_fletcher64(info2, sizeof_nfit_set_info2(nr), 0); - /* support namespaces created with the wrong sort order */ + /* support v1.1 namespaces created with the wrong sort order */ sort(&info->mapping[0], nr, sizeof(struct nfit_set_info_map), cmp_map_compat, 
NULL); nd_set->altcookie = nd_fletcher64(info, sizeof_nfit_set_info(nr), 0); ndr_desc->nd_set = nd_set; devm_kfree(dev, info); + devm_kfree(dev, info2); return 0; } @@ -1842,8 +1927,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk, } if (rw) - memcpy_to_pmem(mmio->addr.aperture + offset, - iobuf + copied, c); + memcpy_flushcache(mmio->addr.aperture + offset, iobuf + copied, c); else { if (nfit_blk->dimm_flags & NFIT_BLK_READ_FLUSH) mmio_flush_range((void __force *) @@ -1957,7 +2041,7 @@ static int acpi_nfit_blk_region_enable(struct nvdimm_bus *nvdimm_bus, nfit_blk->bdw_offset = nfit_mem->bdw->offset; mmio = &nfit_blk->mmio[BDW]; mmio->addr.base = devm_nvdimm_memremap(dev, nfit_mem->spa_bdw->address, - nfit_mem->spa_bdw->length, ARCH_MEMREMAP_PMEM); + nfit_mem->spa_bdw->length, nd_blk_memremap_flags(ndbr)); if (!mmio->addr.base) { dev_dbg(dev, "%s: %s failed to map bdw\n", __func__, nvdimm_name(nvdimm)); @@ -2051,6 +2135,7 @@ static int ars_start(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_spa memset(&ars_start, 0, sizeof(ars_start)); ars_start.address = spa->address; ars_start.length = spa->length; + ars_start.flags = acpi_desc->ars_start_flags; if (nfit_spa_type(spa) == NFIT_SPA_PM) ars_start.type = ND_ARS_PERSISTENT; else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) @@ -2077,6 +2162,7 @@ static int ars_continue(struct acpi_nfit_desc *acpi_desc) ars_start.address = ars_status->restart_address; ars_start.length = ars_status->restart_length; ars_start.type = ars_status->type; + ars_start.flags = acpi_desc->ars_start_flags; rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, &ars_start, sizeof(ars_start), &cmd_rc); if (rc < 0) @@ -2179,7 +2265,7 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, struct acpi_nfit_system_address *spa = nfit_spa->spa; struct nd_blk_region_desc *ndbr_desc; struct nfit_mem *nfit_mem; - int blk_valid = 0; + int blk_valid = 0, rc; if (!nvdimm) { dev_err(acpi_desc->dev, "spa%d dimm: %#x not 
found\n", @@ -2211,6 +2297,9 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, ndbr_desc = to_blk_region_desc(ndr_desc); ndbr_desc->enable = acpi_nfit_blk_region_enable; ndbr_desc->do_io = acpi_desc->blk_do_io; + rc = acpi_nfit_init_interleave_set(acpi_desc, ndr_desc, spa); + if (rc) + return rc; nfit_spa->nd_region = nvdimm_blk_region_create(acpi_desc->nvdimm_bus, ndr_desc); if (!nfit_spa->nd_region) @@ -2229,6 +2318,13 @@ static bool nfit_spa_is_virtual(struct acpi_nfit_system_address *spa) nfit_spa_type(spa) == NFIT_SPA_PCD); } +static bool nfit_spa_is_volatile(struct acpi_nfit_system_address *spa) +{ + return (nfit_spa_type(spa) == NFIT_SPA_VDISK || + nfit_spa_type(spa) == NFIT_SPA_VCD || + nfit_spa_type(spa) == NFIT_SPA_VOLATILE); +} + static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_spa) { @@ -2303,7 +2399,7 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, ndr_desc); if (!nfit_spa->nd_region) rc = -ENOMEM; - } else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) { + } else if (nfit_spa_is_volatile(spa)) { nfit_spa->nd_region = nvdimm_volatile_region_create(nvdimm_bus, ndr_desc); if (!nfit_spa->nd_region) @@ -2595,6 +2691,7 @@ static void acpi_nfit_scrub(struct work_struct *work) list_for_each_entry(nfit_spa, &acpi_desc->spas, list) acpi_nfit_async_scrub(acpi_desc, nfit_spa); acpi_desc->scrub_count++; + acpi_desc->ars_start_flags = 0; if (acpi_desc->scrub_count_state) sysfs_notify_dirent(acpi_desc->scrub_count_state); mutex_unlock(&acpi_desc->init_mutex); @@ -2613,6 +2710,7 @@ static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc) return rc; } + acpi_desc->ars_start_flags = 0; if (!acpi_desc->cancel) queue_work(nfit_wq, &acpi_desc->work); return 0; @@ -2817,7 +2915,7 @@ static int acpi_nfit_clear_to_send(struct nvdimm_bus_descriptor *nd_desc, return 0; } -int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc) +int acpi_nfit_ars_rescan(struct 
acpi_nfit_desc *acpi_desc, u8 flags) { struct device *dev = acpi_desc->dev; struct nfit_spa *nfit_spa; @@ -2839,6 +2937,7 @@ int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc) nfit_spa->ars_required = 1; } + acpi_desc->ars_start_flags = flags; queue_work(nfit_wq, &acpi_desc->work); dev_dbg(dev, "%s: ars_scan triggered\n", __func__); mutex_unlock(&acpi_desc->init_mutex); @@ -2967,7 +3066,7 @@ static int acpi_nfit_remove(struct acpi_device *adev) return 0; } -void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event) +static void acpi_nfit_update_notify(struct device *dev, acpi_handle handle) { struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(dev); struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; @@ -2975,11 +3074,6 @@ void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event) acpi_status status; int ret; - dev_dbg(dev, "%s: event: %d\n", __func__, event); - - if (event != NFIT_NOTIFY_UPDATE) - return; - if (!dev->driver) { /* dev->driver may be null if we're being removed */ dev_dbg(dev, "%s: no driver found for dev\n", __func__); @@ -3016,6 +3110,29 @@ void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event) dev_err(dev, "Invalid _FIT\n"); kfree(buf.pointer); } + +static void acpi_nfit_uc_error_notify(struct device *dev, acpi_handle handle) +{ + struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(dev); + u8 flags = (acpi_desc->scrub_mode == HW_ERROR_SCRUB_ON) ? 
+ 0 : ND_ARS_RETURN_PREV_DATA; + + acpi_nfit_ars_rescan(acpi_desc, flags); +} + +void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event) +{ + dev_dbg(dev, "%s: event: 0x%x\n", __func__, event); + + switch (event) { + case NFIT_NOTIFY_UPDATE: + return acpi_nfit_update_notify(dev, handle); + case NFIT_NOTIFY_UC_MEMORY_ERROR: + return acpi_nfit_uc_error_notify(dev, handle); + default: + return; + } +} EXPORT_SYMBOL_GPL(__acpi_nfit_notify); static void acpi_nfit_notify(struct acpi_device *adev, u32 event) diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c index fd86bec..feeb95d 100644 --- a/drivers/acpi/nfit/mce.c +++ b/drivers/acpi/nfit/mce.c @@ -79,7 +79,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val, * already in progress, just let that be the last * authoritative one */ - acpi_nfit_ars_rescan(acpi_desc); + acpi_nfit_ars_rescan(acpi_desc, 0); } break; } diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h index 29bdd95..54292db 100644 --- a/drivers/acpi/nfit/nfit.h +++ b/drivers/acpi/nfit/nfit.h @@ -79,6 +79,7 @@ enum { enum nfit_root_notifiers { NFIT_NOTIFY_UPDATE = 0x80, + NFIT_NOTIFY_UC_MEMORY_ERROR = 0x81, }; enum nfit_dimm_notifiers { @@ -154,6 +155,7 @@ struct acpi_nfit_desc { struct list_head idts; struct nvdimm_bus *nvdimm_bus; struct device *dev; + u8 ars_start_flags; struct nd_cmd_ars_status *ars_status; size_t ars_status_size; struct work_struct work; @@ -206,7 +208,7 @@ struct nfit_blk { extern struct list_head acpi_descs; extern struct mutex acpi_desc_lock; -int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc); +int acpi_nfit_ars_rescan(struct acpi_nfit_desc *acpi_desc, u8 flags); #ifdef CONFIG_X86_MCE void nfit_mce_register(void); diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 6112e99..17723fd5 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -22,6 +22,7 @@ #ifdef CONFIG_BLK_DEV_RAM_DAX #include <linux/pfn_t.h> #include <linux/dax.h> +#include 
<linux/uio.h> #endif #include <linux/uaccess.h> @@ -354,8 +355,15 @@ static long brd_dax_direct_access(struct dax_device *dax_dev, return __brd_direct_access(brd, pgoff, nr_pages, kaddr, pfn); } +static size_t brd_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + return copy_from_iter(addr, bytes, i); +} + static const struct dax_operations brd_dax_ops = { .direct_access = brd_dax_direct_access, + .copy_from_iter = brd_dax_copy_from_iter, }; #endif diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 922d082..ce9e563 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -18,6 +18,7 @@ #include <linux/cdev.h> #include <linux/hash.h> #include <linux/slab.h> +#include <linux/uio.h> #include <linux/dax.h> #include <linux/fs.h> @@ -115,13 +116,20 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize) EXPORT_SYMBOL_GPL(__bdev_dax_supported); #endif +enum dax_device_flags { + /* !alive + rcu grace period == no new operations / mappings */ + DAXDEV_ALIVE, + /* gate whether dax_flush() calls the low level flush routine */ + DAXDEV_WRITE_CACHE, +}; + /** * struct dax_device - anchor object for dax services * @inode: core vfs * @cdev: optional character interface for "device dax" * @host: optional name for lookups where the device path is not available * @private: dax driver private data - * @alive: !alive + rcu grace period == no new operations / mappings + * @flags: state and boolean properties */ struct dax_device { struct hlist_node list; @@ -129,10 +137,75 @@ struct dax_device { struct cdev cdev; const char *host; void *private; - bool alive; + unsigned long flags; const struct dax_operations *ops; }; +static ssize_t write_cache_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); + ssize_t rc; + + WARN_ON_ONCE(!dax_dev); + if (!dax_dev) + return -ENXIO; + + rc = sprintf(buf, "%d\n", 
!!test_bit(DAXDEV_WRITE_CACHE, + &dax_dev->flags)); + put_dax(dax_dev); + return rc; +} + +static ssize_t write_cache_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + bool write_cache; + int rc = strtobool(buf, &write_cache); + struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); + + WARN_ON_ONCE(!dax_dev); + if (!dax_dev) + return -ENXIO; + + if (rc) + len = rc; + else if (write_cache) + set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); + else + clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); + + put_dax(dax_dev); + return len; +} +static DEVICE_ATTR_RW(write_cache); + +static umode_t dax_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); + + WARN_ON_ONCE(!dax_dev); + if (!dax_dev) + return 0; + + if (a == &dev_attr_write_cache.attr && !dax_dev->ops->flush) + return 0; + return a->mode; +} + +static struct attribute *dax_attributes[] = { + &dev_attr_write_cache.attr, + NULL, +}; + +struct attribute_group dax_attribute_group = { + .name = "dax", + .attrs = dax_attributes, + .is_visible = dax_visible, +}; +EXPORT_SYMBOL_GPL(dax_attribute_group); + /** * dax_direct_access() - translate a device pgoff to an absolute pfn * @dax_dev: a dax_device instance representing the logical memory range @@ -172,10 +245,43 @@ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, } EXPORT_SYMBOL_GPL(dax_direct_access); +size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, + size_t bytes, struct iov_iter *i) +{ + if (!dax_alive(dax_dev)) + return 0; + + return dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i); +} +EXPORT_SYMBOL_GPL(dax_copy_from_iter); + +void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, + size_t size) +{ + if (!dax_alive(dax_dev)) + return; + + if (!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags)) + 
return; + + if (dax_dev->ops->flush) + dax_dev->ops->flush(dax_dev, pgoff, addr, size); +} +EXPORT_SYMBOL_GPL(dax_flush); + +void dax_write_cache(struct dax_device *dax_dev, bool wc) +{ + if (wc) + set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); + else + clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); +} +EXPORT_SYMBOL_GPL(dax_write_cache); + bool dax_alive(struct dax_device *dax_dev) { lockdep_assert_held(&dax_srcu); - return dax_dev->alive; + return test_bit(DAXDEV_ALIVE, &dax_dev->flags); } EXPORT_SYMBOL_GPL(dax_alive); @@ -195,7 +301,7 @@ void kill_dax(struct dax_device *dax_dev) if (!dax_dev) return; - dax_dev->alive = false; + clear_bit(DAXDEV_ALIVE, &dax_dev->flags); synchronize_srcu(&dax_srcu); @@ -239,7 +345,7 @@ static void dax_destroy_inode(struct inode *inode) { struct dax_device *dax_dev = to_dax_dev(inode); - WARN_ONCE(dax_dev->alive, + WARN_ONCE(test_bit(DAXDEV_ALIVE, &dax_dev->flags), "kill_dax() must be called before final iput()\n"); call_rcu(&inode->i_rcu, dax_i_callback); } @@ -291,7 +397,7 @@ static struct dax_device *dax_dev_get(dev_t devt) dax_dev = to_dax_dev(inode); if (inode->i_state & I_NEW) { - dax_dev->alive = true; + set_bit(DAXDEV_ALIVE, &dax_dev->flags); inode->i_cdev = &dax_dev->cdev; inode->i_mode = S_IFCHR; inode->i_flags = S_DAX; diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index c03c203..41971a0 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -170,6 +170,34 @@ static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); } +static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + struct linear_c *lc = ti->private; + struct block_device *bdev = lc->dev->bdev; + struct dax_device *dax_dev = lc->dev->dax_dev; + sector_t dev_sector, sector = pgoff * PAGE_SECTORS; + + dev_sector = linear_map_sector(ti, sector); + if (bdev_dax_pgoff(bdev, dev_sector, 
ALIGN(bytes, PAGE_SIZE), &pgoff)) + return 0; + return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); +} + +static void linear_dax_flush(struct dm_target *ti, pgoff_t pgoff, void *addr, + size_t size) +{ + struct linear_c *lc = ti->private; + struct block_device *bdev = lc->dev->bdev; + struct dax_device *dax_dev = lc->dev->dax_dev; + sector_t dev_sector, sector = pgoff * PAGE_SECTORS; + + dev_sector = linear_map_sector(ti, sector); + if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(size, PAGE_SIZE), &pgoff)) + return; + dax_flush(dax_dev, pgoff, addr, size); +} + static struct target_type linear_target = { .name = "linear", .version = {1, 4, 0}, @@ -183,6 +211,8 @@ static struct target_type linear_target = { .prepare_ioctl = linear_prepare_ioctl, .iterate_devices = linear_iterate_devices, .direct_access = linear_dax_direct_access, + .dax_copy_from_iter = linear_dax_copy_from_iter, + .dax_flush = linear_dax_flush, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 11621a0..a037553 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -332,6 +332,44 @@ static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); } +static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + sector_t dev_sector, sector = pgoff * PAGE_SECTORS; + struct stripe_c *sc = ti->private; + struct dax_device *dax_dev; + struct block_device *bdev; + uint32_t stripe; + + stripe_map_sector(sc, sector, &stripe, &dev_sector); + dev_sector += sc->stripe[stripe].physical_start; + dax_dev = sc->stripe[stripe].dev->dax_dev; + bdev = sc->stripe[stripe].dev->bdev; + + if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff)) + return 0; + return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); +} + +static void stripe_dax_flush(struct dm_target *ti, pgoff_t pgoff, void *addr, + 
size_t size) +{ + sector_t dev_sector, sector = pgoff * PAGE_SECTORS; + struct stripe_c *sc = ti->private; + struct dax_device *dax_dev; + struct block_device *bdev; + uint32_t stripe; + + stripe_map_sector(sc, sector, &stripe, &dev_sector); + dev_sector += sc->stripe[stripe].physical_start; + dax_dev = sc->stripe[stripe].dev->dax_dev; + bdev = sc->stripe[stripe].dev->bdev; + + if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(size, PAGE_SIZE), &pgoff)) + return; + dax_flush(dax_dev, pgoff, addr, size); +} + /* * Stripe status: * @@ -452,6 +490,8 @@ static struct target_type stripe_target = { .iterate_devices = stripe_iterate_devices, .io_hints = stripe_io_hints, .direct_access = stripe_dax_direct_access, + .dax_copy_from_iter = stripe_dax_copy_from_iter, + .dax_flush = stripe_dax_flush, }; int __init dm_stripe_init(void) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c2afe7a..10cabe9 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -19,6 +19,7 @@ #include <linux/dax.h> #include <linux/slab.h> #include <linux/idr.h> +#include <linux/uio.h> #include <linux/hdreg.h> #include <linux/delay.h> #include <linux/wait.h> @@ -972,6 +973,48 @@ static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, return ret; } +static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + struct mapped_device *md = dax_get_private(dax_dev); + sector_t sector = pgoff * PAGE_SECTORS; + struct dm_target *ti; + long ret = 0; + int srcu_idx; + + ti = dm_dax_get_live_target(md, sector, &srcu_idx); + + if (!ti) + goto out; + if (!ti->type->dax_copy_from_iter) { + ret = copy_from_iter(addr, bytes, i); + goto out; + } + ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i); + out: + dm_put_live_table(md, srcu_idx); + + return ret; +} + +static void dm_dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, + size_t size) +{ + struct mapped_device *md = dax_get_private(dax_dev); + sector_t 
sector = pgoff * PAGE_SECTORS; + struct dm_target *ti; + int srcu_idx; + + ti = dm_dax_get_live_target(md, sector, &srcu_idx); + + if (!ti) + goto out; + if (ti->type->dax_flush) + ti->type->dax_flush(ti, pgoff, addr, size); + out: + dm_put_live_table(md, srcu_idx); +} + /* * A target may call dm_accept_partial_bio only from the map routine. It is * allowed for all bio types except REQ_PREFLUSH. @@ -2958,6 +3001,8 @@ static const struct block_device_operations dm_blk_dops = { static const struct dax_operations dm_dax_ops = { .direct_access = dm_dax_direct_access, + .copy_from_iter = dm_dax_copy_from_iter, + .flush = dm_dax_flush, }; /* diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index b6ba061..64216de 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -37,8 +37,8 @@ static int arena_read_bytes(struct arena_info *arena, resource_size_t offset, struct nd_btt *nd_btt = arena->nd_btt; struct nd_namespace_common *ndns = nd_btt->ndns; - /* arena offsets are 4K from the base of the device */ - offset += SZ_4K; + /* arena offsets may be shifted from the base of the device */ + offset += arena->nd_btt->initial_offset; return nvdimm_read_bytes(ndns, offset, buf, n, flags); } @@ -48,8 +48,8 @@ static int arena_write_bytes(struct arena_info *arena, resource_size_t offset, struct nd_btt *nd_btt = arena->nd_btt; struct nd_namespace_common *ndns = nd_btt->ndns; - /* arena offsets are 4K from the base of the device */ - offset += SZ_4K; + /* arena offsets may be shifted from the base of the device */ + offset += arena->nd_btt->initial_offset; return nvdimm_write_bytes(ndns, offset, buf, n, flags); } @@ -323,7 +323,7 @@ static int btt_log_read(struct arena_info *arena, u32 lane, old_ent = btt_log_get_old(log); if (old_ent < 0 || old_ent > 1) { - dev_info(to_dev(arena), + dev_err(to_dev(arena), "log corruption (%d): lane %d seq [%d, %d]\n", old_ent, lane, log[0].seq, log[1].seq); /* TODO set error state? 
*/ @@ -576,8 +576,8 @@ static struct arena_info *alloc_arena(struct btt *btt, size_t size, arena->internal_lbasize = roundup(arena->external_lbasize, INT_LBASIZE_ALIGNMENT); arena->nfree = BTT_DEFAULT_NFREE; - arena->version_major = 1; - arena->version_minor = 1; + arena->version_major = btt->nd_btt->version_major; + arena->version_minor = btt->nd_btt->version_minor; if (available % BTT_PG_SIZE) available -= (available % BTT_PG_SIZE); @@ -684,7 +684,7 @@ static int discover_arenas(struct btt *btt) dev_info(to_dev(arena), "No existing arenas\n"); goto out; } else { - dev_info(to_dev(arena), + dev_err(to_dev(arena), "Found corrupted metadata!\n"); ret = -ENODEV; goto out; @@ -1227,7 +1227,7 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset, op_is_write(bio_op(bio)), iter.bi_sector); if (err) { - dev_info(&btt->nd_btt->dev, + dev_err(&btt->nd_btt->dev, "io error in %s sector %lld, len %d,\n", (op_is_write(bio_op(bio))) ? 
"WRITE" : "READ", @@ -1248,10 +1248,13 @@ static int btt_rw_page(struct block_device *bdev, sector_t sector, struct page *page, bool is_write) { struct btt *btt = bdev->bd_disk->private_data; + int rc; - btt_do_bvec(btt, NULL, page, PAGE_SIZE, 0, is_write, sector); - page_endio(page, is_write, 0); - return 0; + rc = btt_do_bvec(btt, NULL, page, PAGE_SIZE, 0, is_write, sector); + if (rc == 0) + page_endio(page, is_write, 0); + + return rc; } @@ -1369,7 +1372,7 @@ static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize, } if (btt->init_state != INIT_READY && nd_region->ro) { - dev_info(dev, "%s is read-only, unable to init btt metadata\n", + dev_warn(dev, "%s is read-only, unable to init btt metadata\n", dev_name(&nd_region->dev)); return NULL; } else if (btt->init_state != INIT_READY) { @@ -1424,6 +1427,7 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns) { struct nd_btt *nd_btt = to_nd_btt(ndns->claim); struct nd_region *nd_region; + struct btt_sb *btt_sb; struct btt *btt; size_t rawsize; @@ -1432,10 +1436,21 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns) return -ENODEV; } - rawsize = nvdimm_namespace_capacity(ndns) - SZ_4K; + btt_sb = devm_kzalloc(&nd_btt->dev, sizeof(*btt_sb), GFP_KERNEL); + + /* + * If this returns < 0, that is ok as it just means there wasn't + * an existing BTT, and we're creating a new one. 
We still need to + * call this as we need the version dependent fields in nd_btt to be + * set correctly based on the holder class + */ + nd_btt_version(nd_btt, ndns, btt_sb); + + rawsize = nvdimm_namespace_capacity(ndns) - nd_btt->initial_offset; if (rawsize < ARENA_MIN_SIZE) { dev_dbg(&nd_btt->dev, "%s must be at least %ld bytes\n", - dev_name(&ndns->dev), ARENA_MIN_SIZE + SZ_4K); + dev_name(&ndns->dev), + ARENA_MIN_SIZE + nd_btt->initial_offset); return -ENXIO; } nd_region = to_nd_region(nd_btt->dev.parent); diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h index b2f8651..888e862 100644 --- a/drivers/nvdimm/btt.h +++ b/drivers/nvdimm/btt.h @@ -184,5 +184,7 @@ struct btt { }; bool nd_btt_arena_is_valid(struct nd_btt *nd_btt, struct btt_sb *super); +int nd_btt_version(struct nd_btt *nd_btt, struct nd_namespace_common *ndns, + struct btt_sb *btt_sb); #endif diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c index 4c989bb..3e359d2 100644 --- a/drivers/nvdimm/btt_devs.c +++ b/drivers/nvdimm/btt_devs.c @@ -260,20 +260,55 @@ bool nd_btt_arena_is_valid(struct nd_btt *nd_btt, struct btt_sb *super) } EXPORT_SYMBOL(nd_btt_arena_is_valid); +int nd_btt_version(struct nd_btt *nd_btt, struct nd_namespace_common *ndns, + struct btt_sb *btt_sb) +{ + if (ndns->claim_class == NVDIMM_CCLASS_BTT2) { + /* Probe/setup for BTT v2.0 */ + nd_btt->initial_offset = 0; + nd_btt->version_major = 2; + nd_btt->version_minor = 0; + if (nvdimm_read_bytes(ndns, 0, btt_sb, sizeof(*btt_sb), 0)) + return -ENXIO; + if (!nd_btt_arena_is_valid(nd_btt, btt_sb)) + return -ENODEV; + if ((le16_to_cpu(btt_sb->version_major) != 2) || + (le16_to_cpu(btt_sb->version_minor) != 0)) + return -ENODEV; + } else { + /* + * Probe/setup for BTT v1.1 (NVDIMM_CCLASS_NONE or + * NVDIMM_CCLASS_BTT) + */ + nd_btt->initial_offset = SZ_4K; + nd_btt->version_major = 1; + nd_btt->version_minor = 1; + if (nvdimm_read_bytes(ndns, SZ_4K, btt_sb, sizeof(*btt_sb), 0)) + return -ENXIO; + if 
(!nd_btt_arena_is_valid(nd_btt, btt_sb)) + return -ENODEV; + if ((le16_to_cpu(btt_sb->version_major) != 1) || + (le16_to_cpu(btt_sb->version_minor) != 1)) + return -ENODEV; + } + return 0; +} +EXPORT_SYMBOL(nd_btt_version); + static int __nd_btt_probe(struct nd_btt *nd_btt, struct nd_namespace_common *ndns, struct btt_sb *btt_sb) { + int rc; + if (!btt_sb || !ndns || !nd_btt) return -ENODEV; - if (nvdimm_read_bytes(ndns, SZ_4K, btt_sb, sizeof(*btt_sb), 0)) - return -ENXIO; - if (nvdimm_namespace_capacity(ndns) < SZ_16M) return -ENXIO; - if (!nd_btt_arena_is_valid(nd_btt, btt_sb)) - return -ENODEV; + rc = nd_btt_version(nd_btt, ndns, btt_sb); + if (rc < 0) + return rc; nd_btt->lbasize = le32_to_cpu(btt_sb->external_lbasize); nd_btt->uuid = kmemdup(btt_sb->uuid, 16, GFP_KERNEL); @@ -295,6 +330,15 @@ int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns) if (ndns->force_raw) return -ENODEV; + switch (ndns->claim_class) { + case NVDIMM_CCLASS_NONE: + case NVDIMM_CCLASS_BTT: + case NVDIMM_CCLASS_BTT2: + break; + default: + return -ENODEV; + } + nvdimm_bus_lock(&ndns->dev); btt_dev = __nd_btt_create(nd_region, 0, NULL, ndns); nvdimm_bus_unlock(&ndns->dev); diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index e9361bf..937fafa 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -38,13 +38,13 @@ static int to_nd_device_type(struct device *dev) { if (is_nvdimm(dev)) return ND_DEVICE_DIMM; - else if (is_nd_pmem(dev)) + else if (is_memory(dev)) return ND_DEVICE_REGION_PMEM; else if (is_nd_blk(dev)) return ND_DEVICE_REGION_BLK; else if (is_nd_dax(dev)) return ND_DEVICE_DAX_PMEM; - else if (is_nd_pmem(dev->parent) || is_nd_blk(dev->parent)) + else if (is_nd_region(dev->parent)) return nd_region_to_nstype(to_nd_region(dev->parent)); return 0; @@ -56,7 +56,7 @@ static int nvdimm_bus_uevent(struct device *dev, struct kobj_uevent_env *env) * Ensure that region devices always have their numa node set as * early as possible. 
*/ - if (is_nd_pmem(dev) || is_nd_blk(dev)) + if (is_nd_region(dev)) set_dev_node(dev, to_nd_region(dev)->numa_node); return add_uevent_var(env, "MODALIAS=" ND_DEVICE_MODALIAS_FMT, to_nd_device_type(dev)); @@ -65,7 +65,7 @@ static int nvdimm_bus_uevent(struct device *dev, struct kobj_uevent_env *env) static struct module *to_bus_provider(struct device *dev) { /* pin bus providers while regions are enabled */ - if (is_nd_pmem(dev) || is_nd_blk(dev)) { + if (is_nd_region(dev)) { struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); return nvdimm_bus->nd_desc->module; @@ -198,6 +198,9 @@ static int nvdimm_clear_badblocks_region(struct device *dev, void *data) sector = (ctx->phys - nd_region->ndr_start) / 512; badblocks_clear(&nd_region->bb, sector, ctx->cleared / 512); + if (nd_region->bb_state) + sysfs_notify_dirent(nd_region->bb_state); + return 0; } @@ -907,6 +910,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, static char in_env[ND_CMD_MAX_ENVELOPE]; const struct nd_cmd_desc *desc = NULL; unsigned int cmd = _IOC_NR(ioctl_cmd); + unsigned int func = cmd; void __user *p = (void __user *) arg; struct device *dev = &nvdimm_bus->dev; struct nd_cmd_pkg pkg; @@ -972,6 +976,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, } if (cmd == ND_CMD_CALL) { + func = pkg.nd_command; dev_dbg(dev, "%s:%s, idx: %llu, in: %zu, out: %zu, len %zu\n", __func__, dimm_name, pkg.nd_command, in_len, out_len, buf_len); @@ -1020,7 +1025,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, } nvdimm_bus_lock(&nvdimm_bus->dev); - rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, cmd, buf); + rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, func, buf); if (rc) goto out_unlock; diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c index 7ceb5fa..4777046 100644 --- a/drivers/nvdimm/claim.c +++ b/drivers/nvdimm/claim.c @@ -12,8 +12,8 @@ */ #include <linux/device.h> #include <linux/sizes.h> -#include 
<linux/pmem.h> #include "nd-core.h" +#include "pmem.h" #include "pfn.h" #include "btt.h" #include "nd.h" @@ -184,6 +184,35 @@ ssize_t nd_namespace_store(struct device *dev, } ndns = to_ndns(found); + + switch (ndns->claim_class) { + case NVDIMM_CCLASS_NONE: + break; + case NVDIMM_CCLASS_BTT: + case NVDIMM_CCLASS_BTT2: + if (!is_nd_btt(dev)) { + len = -EBUSY; + goto out_attach; + } + break; + case NVDIMM_CCLASS_PFN: + if (!is_nd_pfn(dev)) { + len = -EBUSY; + goto out_attach; + } + break; + case NVDIMM_CCLASS_DAX: + if (!is_nd_dax(dev)) { + len = -EBUSY; + goto out_attach; + } + break; + default: + len = -EBUSY; + goto out_attach; + break; + } + if (__nvdimm_namespace_capacity(ndns) < SZ_16M) { dev_dbg(dev, "%s too small to host\n", name); len = -ENXIO; @@ -260,8 +289,7 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns, * work around this collision. */ if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512) - && !(flags & NVDIMM_IO_ATOMIC) - && !ndns->claim) { + && !(flags & NVDIMM_IO_ATOMIC)) { long cleared; cleared = nvdimm_clear_poison(&ndns->dev, @@ -272,12 +300,12 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns, cleared /= 512; badblocks_clear(&nsio->bb, sector, cleared); } - invalidate_pmem(nsio->addr + offset, size); + arch_invalidate_pmem(nsio->addr + offset, size); } else rc = -EIO; } - memcpy_to_pmem(nsio->addr + offset, buf, size); + memcpy_flushcache(nsio->addr + offset, buf, size); nvdimm_flush(to_nd_region(ndns->dev.parent)); return rc; diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 2dee908..7cd99b1 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -504,7 +504,7 @@ void nvdimm_badblocks_populate(struct nd_region *nd_region, struct nvdimm_bus *nvdimm_bus; struct list_head *poison_list; - if (!is_nd_pmem(&nd_region->dev)) { + if (!is_memory(&nd_region->dev)) { dev_WARN_ONCE(&nd_region->dev, 1, "%s only valid for pmem regions\n", __func__); return; @@ -699,6 +699,9 @@ static __init int 
libnvdimm_init(void) rc = nd_region_init(); if (rc) goto err_region; + + nd_label_init(); + return 0; err_region: nvdimm_exit(); diff --git a/drivers/nvdimm/dax_devs.c b/drivers/nvdimm/dax_devs.c index c1b6556..1bf2bd3 100644 --- a/drivers/nvdimm/dax_devs.c +++ b/drivers/nvdimm/dax_devs.c @@ -89,7 +89,7 @@ struct device *nd_dax_create(struct nd_region *nd_region) struct device *dev = NULL; struct nd_dax *nd_dax; - if (!is_nd_pmem(&nd_region->dev)) + if (!is_memory(&nd_region->dev)) return NULL; nd_dax = nd_dax_alloc(nd_region); @@ -111,6 +111,14 @@ int nd_dax_probe(struct device *dev, struct nd_namespace_common *ndns) if (ndns->force_raw) return -ENODEV; + switch (ndns->claim_class) { + case NVDIMM_CCLASS_NONE: + case NVDIMM_CCLASS_DAX: + break; + default: + return -ENODEV; + } + nvdimm_bus_lock(&ndns->dev); nd_dax = nd_dax_alloc(nd_region); nd_pfn = &nd_dax->nd_pfn; diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index 9852a33..f0d1b7e 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -20,6 +20,7 @@ #include <linux/mm.h> #include "nd-core.h" #include "label.h" +#include "pmem.h" #include "nd.h" static DEFINE_IDA(dimm_ida); @@ -235,6 +236,13 @@ struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr) } EXPORT_SYMBOL_GPL(nd_blk_region_to_dimm); +unsigned long nd_blk_memremap_flags(struct nd_blk_region *ndbr) +{ + /* pmem mapping properties are private to libnvdimm */ + return ARCH_MEMREMAP_PMEM; +} +EXPORT_SYMBOL_GPL(nd_blk_memremap_flags); + struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping) { struct nvdimm *nvdimm = nd_mapping->nvdimm; @@ -411,7 +419,7 @@ int alias_dpa_busy(struct device *dev, void *data) struct resource *res; int i; - if (!is_nd_pmem(dev)) + if (!is_memory(dev)) return 0; nd_region = to_nd_region(dev); diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c index dd61534..87796f8 100644 --- a/drivers/nvdimm/label.c +++ b/drivers/nvdimm/label.c @@ -12,6 +12,7 @@ */ #include 
<linux/device.h> #include <linux/ndctl.h> +#include <linux/uuid.h> #include <linux/slab.h> #include <linux/io.h> #include <linux/nd.h> @@ -19,6 +20,11 @@ #include "label.h" #include "nd.h" +static guid_t nvdimm_btt_guid; +static guid_t nvdimm_btt2_guid; +static guid_t nvdimm_pfn_guid; +static guid_t nvdimm_dax_guid; + static u32 best_seq(u32 a, u32 b) { a &= NSINDEX_SEQ_MASK; @@ -34,6 +40,11 @@ static u32 best_seq(u32 a, u32 b) return a; } +unsigned sizeof_namespace_label(struct nvdimm_drvdata *ndd) +{ + return ndd->nslabel_size; +} + size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd) { u32 index_span; @@ -49,7 +60,7 @@ size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd) * starts to waste space at larger config_sizes, but it's * unlikely we'll ever see anything but 128K. */ - index_span = ndd->nsarea.config_size / 129; + index_span = ndd->nsarea.config_size / (sizeof_namespace_label(ndd) + 1); index_span /= NSINDEX_ALIGN * 2; ndd->nsindex_size = index_span * NSINDEX_ALIGN; @@ -58,10 +69,10 @@ size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd) int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd) { - return ndd->nsarea.config_size / 129; + return ndd->nsarea.config_size / (sizeof_namespace_label(ndd) + 1); } -int nd_label_validate(struct nvdimm_drvdata *ndd) +static int __nd_label_validate(struct nvdimm_drvdata *ndd) { /* * On media label format consists of two index blocks followed @@ -104,6 +115,7 @@ int nd_label_validate(struct nvdimm_drvdata *ndd) u32 nslot; u8 sig[NSINDEX_SIG_LEN]; u64 sum_save, sum, size; + unsigned int version, labelsize; memcpy(sig, nsindex[i]->sig, NSINDEX_SIG_LEN); if (memcmp(sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN) != 0) { @@ -111,6 +123,21 @@ int nd_label_validate(struct nvdimm_drvdata *ndd) __func__, i); continue; } + + /* label sizes larger than 128 arrived with v1.2 */ + version = __le16_to_cpu(nsindex[i]->major) * 100 + + __le16_to_cpu(nsindex[i]->minor); + if (version >= 102) + labelsize = 1 << (7 + 
nsindex[i]->labelsize); + else + labelsize = 128; + + if (labelsize != sizeof_namespace_label(ndd)) { + dev_dbg(dev, "%s: nsindex%d labelsize %d invalid\n", + __func__, i, nsindex[i]->labelsize); + continue; + } + sum_save = __le64_to_cpu(nsindex[i]->checksum); nsindex[i]->checksum = __cpu_to_le64(0); sum = nd_fletcher64(nsindex[i], sizeof_namespace_index(ndd), 1); @@ -153,7 +180,7 @@ int nd_label_validate(struct nvdimm_drvdata *ndd) } nslot = __le32_to_cpu(nsindex[i]->nslot); - if (nslot * sizeof(struct nd_namespace_label) + if (nslot * sizeof_namespace_label(ndd) + 2 * sizeof_namespace_index(ndd) > ndd->nsarea.config_size) { dev_dbg(dev, "%s: nsindex%d nslot: %u invalid, config_size: %#x\n", @@ -189,6 +216,29 @@ int nd_label_validate(struct nvdimm_drvdata *ndd) return -1; } +int nd_label_validate(struct nvdimm_drvdata *ndd) +{ + /* + * In order to probe for and validate namespace index blocks we + * need to know the size of the labels, and we can't trust the + * size of the labels until we validate the index blocks. + * Resolve this dependency loop by probing for known label + * sizes, but default to v1.2 256-byte namespace labels if + * discovery fails. 
+ */ + int label_size[] = { 128, 256 }; + int i, rc; + + for (i = 0; i < ARRAY_SIZE(label_size); i++) { + ndd->nslabel_size = label_size[i]; + rc = __nd_label_validate(ndd); + if (rc >= 0) + return rc; + } + + return -1; +} + void nd_label_copy(struct nvdimm_drvdata *ndd, struct nd_namespace_index *dst, struct nd_namespace_index *src) { @@ -210,7 +260,22 @@ static struct nd_namespace_label *nd_label_base(struct nvdimm_drvdata *ndd) static int to_slot(struct nvdimm_drvdata *ndd, struct nd_namespace_label *nd_label) { - return nd_label - nd_label_base(ndd); + unsigned long label, base; + + label = (unsigned long) nd_label; + base = (unsigned long) nd_label_base(ndd); + + return (label - base) / sizeof_namespace_label(ndd); +} + +static struct nd_namespace_label *to_label(struct nvdimm_drvdata *ndd, int slot) +{ + unsigned long label, base; + + base = (unsigned long) nd_label_base(ndd); + label = base + sizeof_namespace_label(ndd) * slot; + + return (struct nd_namespace_label *) label; } #define for_each_clear_bit_le(bit, addr, size) \ @@ -268,7 +333,8 @@ static bool preamble_next(struct nvdimm_drvdata *ndd, free, nslot); } -static bool slot_valid(struct nd_namespace_label *nd_label, u32 slot) +static bool slot_valid(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label, u32 slot) { /* check that we are written where we expect to be written */ if (slot != __le32_to_cpu(nd_label->slot)) @@ -279,6 +345,21 @@ static bool slot_valid(struct nd_namespace_label *nd_label, u32 slot) | __le64_to_cpu(nd_label->rawsize)) % SZ_4K) return false; + /* check checksum */ + if (namespace_label_has(ndd, checksum)) { + u64 sum, sum_save; + + sum_save = __le64_to_cpu(nd_label->checksum); + nd_label->checksum = __cpu_to_le64(0); + sum = nd_fletcher64(nd_label, sizeof_namespace_label(ndd), 1); + nd_label->checksum = __cpu_to_le64(sum_save); + if (sum != sum_save) { + dev_dbg(ndd->dev, "%s fail checksum. 
slot: %d expect: %#llx\n", + __func__, slot, sum); + return false; + } + } + return true; } @@ -299,9 +380,9 @@ int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd) struct resource *res; u32 flags; - nd_label = nd_label_base(ndd) + slot; + nd_label = to_label(ndd, slot); - if (!slot_valid(nd_label, slot)) + if (!slot_valid(ndd, nd_label, slot)) continue; memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN); @@ -331,9 +412,9 @@ int nd_label_active_count(struct nvdimm_drvdata *ndd) for_each_clear_bit_le(slot, free, nslot) { struct nd_namespace_label *nd_label; - nd_label = nd_label_base(ndd) + slot; + nd_label = to_label(ndd, slot); - if (!slot_valid(nd_label, slot)) { + if (!slot_valid(ndd, nd_label, slot)) { u32 label_slot = __le32_to_cpu(nd_label->slot); u64 size = __le64_to_cpu(nd_label->rawsize); u64 dpa = __le64_to_cpu(nd_label->dpa); @@ -360,12 +441,12 @@ struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n) for_each_clear_bit_le(slot, free, nslot) { struct nd_namespace_label *nd_label; - nd_label = nd_label_base(ndd) + slot; - if (!slot_valid(nd_label, slot)) + nd_label = to_label(ndd, slot); + if (!slot_valid(ndd, nd_label, slot)) continue; if (n-- == 0) - return nd_label_base(ndd) + slot; + return to_label(ndd, slot); } return NULL; @@ -437,7 +518,8 @@ static int nd_label_write_index(struct nvdimm_drvdata *ndd, int index, u32 seq, nslot = __le32_to_cpu(nsindex->nslot); memcpy(nsindex->sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN); - nsindex->flags = __cpu_to_le32(0); + memset(&nsindex->flags, 0, 3); + nsindex->labelsize = sizeof_namespace_label(ndd) >> 8; nsindex->seq = __cpu_to_le32(seq); offset = (unsigned long) nsindex - (unsigned long) to_namespace_index(ndd, 0); @@ -452,7 +534,10 @@ static int nd_label_write_index(struct nvdimm_drvdata *ndd, int index, u32 seq, nsindex->labeloff = __cpu_to_le64(offset); nsindex->nslot = __cpu_to_le32(nslot); nsindex->major = __cpu_to_le16(1); - nsindex->minor = __cpu_to_le16(1); + if 
(sizeof_namespace_label(ndd) < 256) + nsindex->minor = __cpu_to_le16(1); + else + nsindex->minor = __cpu_to_le16(2); nsindex->checksum = __cpu_to_le64(0); if (flags & ND_NSINDEX_INIT) { unsigned long *free = (unsigned long *) nsindex->free; @@ -490,11 +575,49 @@ static unsigned long nd_label_offset(struct nvdimm_drvdata *ndd, - (unsigned long) to_namespace_index(ndd, 0); } +enum nvdimm_claim_class to_nvdimm_cclass(guid_t *guid) +{ + if (guid_equal(guid, &nvdimm_btt_guid)) + return NVDIMM_CCLASS_BTT; + else if (guid_equal(guid, &nvdimm_btt2_guid)) + return NVDIMM_CCLASS_BTT2; + else if (guid_equal(guid, &nvdimm_pfn_guid)) + return NVDIMM_CCLASS_PFN; + else if (guid_equal(guid, &nvdimm_dax_guid)) + return NVDIMM_CCLASS_DAX; + else if (guid_equal(guid, &guid_null)) + return NVDIMM_CCLASS_NONE; + + return NVDIMM_CCLASS_UNKNOWN; +} + +static const guid_t *to_abstraction_guid(enum nvdimm_claim_class claim_class, + guid_t *target) +{ + if (claim_class == NVDIMM_CCLASS_BTT) + return &nvdimm_btt_guid; + else if (claim_class == NVDIMM_CCLASS_BTT2) + return &nvdimm_btt2_guid; + else if (claim_class == NVDIMM_CCLASS_PFN) + return &nvdimm_pfn_guid; + else if (claim_class == NVDIMM_CCLASS_DAX) + return &nvdimm_dax_guid; + else if (claim_class == NVDIMM_CCLASS_UNKNOWN) { + /* + * If we're modifying a namespace for which we don't + * know the claim_class, don't touch the existing guid. 
+ */ + return target; + } else + return &guid_null; +} + static int __pmem_label_update(struct nd_region *nd_region, struct nd_mapping *nd_mapping, struct nd_namespace_pmem *nspm, int pos) { - u64 cookie = nd_region_interleave_set_cookie(nd_region); + struct nd_namespace_common *ndns = &nspm->nsio.common; + struct nd_interleave_set *nd_set = nd_region->nd_set; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); struct nd_label_ent *label_ent, *victim = NULL; struct nd_namespace_label *nd_label; @@ -504,11 +627,13 @@ static int __pmem_label_update(struct nd_region *nd_region, unsigned long *free; u32 nslot, slot; size_t offset; + u64 cookie; int rc; if (!preamble_next(ndd, &nsindex, &free, &nslot)) return -ENXIO; + cookie = nd_region_interleave_set_cookie(nd_region, nsindex); nd_label_gen_id(&label_id, nspm->uuid, 0); for_each_dpa_resource(ndd, res) if (strcmp(res->name, label_id.id) == 0) @@ -525,8 +650,8 @@ static int __pmem_label_update(struct nd_region *nd_region, return -ENXIO; dev_dbg(ndd->dev, "%s: allocated: %d\n", __func__, slot); - nd_label = nd_label_base(ndd) + slot; - memset(nd_label, 0, sizeof(struct nd_namespace_label)); + nd_label = to_label(ndd, slot); + memset(nd_label, 0, sizeof_namespace_label(ndd)); memcpy(nd_label->uuid, nspm->uuid, NSLABEL_UUID_LEN); if (nspm->alt_name) memcpy(nd_label->name, nspm->alt_name, NSLABEL_NAME_LEN); @@ -535,14 +660,28 @@ static int __pmem_label_update(struct nd_region *nd_region, nd_label->position = __cpu_to_le16(pos); nd_label->isetcookie = __cpu_to_le64(cookie); nd_label->rawsize = __cpu_to_le64(resource_size(res)); + nd_label->lbasize = __cpu_to_le64(nspm->lbasize); nd_label->dpa = __cpu_to_le64(res->start); nd_label->slot = __cpu_to_le32(slot); + if (namespace_label_has(ndd, type_guid)) + guid_copy(&nd_label->type_guid, &nd_set->type_guid); + if (namespace_label_has(ndd, abstraction_guid)) + guid_copy(&nd_label->abstraction_guid, + to_abstraction_guid(ndns->claim_class, + &nd_label->abstraction_guid)); + if 
(namespace_label_has(ndd, checksum)) { + u64 sum; + + nd_label->checksum = __cpu_to_le64(0); + sum = nd_fletcher64(nd_label, sizeof_namespace_label(ndd), 1); + nd_label->checksum = __cpu_to_le64(sum); + } nd_dbg_dpa(nd_region, ndd, res, "%s\n", __func__); /* update label */ offset = nd_label_offset(ndd, nd_label); rc = nvdimm_set_config_data(ndd, offset, nd_label, - sizeof(struct nd_namespace_label)); + sizeof_namespace_label(ndd)); if (rc < 0) return rc; @@ -624,6 +763,8 @@ static int __blk_label_update(struct nd_region *nd_region, int num_labels) { int i, alloc, victims, nfree, old_num_resources, nlabel, rc = -ENXIO; + struct nd_interleave_set *nd_set = nd_region->nd_set; + struct nd_namespace_common *ndns = &nsblk->common; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); struct nd_namespace_label *nd_label; struct nd_label_ent *label_ent, *e; @@ -632,6 +773,7 @@ static int __blk_label_update(struct nd_region *nd_region, struct resource *res, **old_res_list; struct nd_label_id label_id; u8 uuid[NSLABEL_UUID_LEN]; + int min_dpa_idx = 0; LIST_HEAD(list); u32 nslot, slot; @@ -668,7 +810,7 @@ static int __blk_label_update(struct nd_region *nd_region, /* mark unused labels for garbage collection */ for_each_clear_bit_le(slot, free, nslot) { - nd_label = nd_label_base(ndd) + slot; + nd_label = to_label(ndd, slot); memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN); if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0) continue; @@ -703,6 +845,18 @@ static int __blk_label_update(struct nd_region *nd_region, } } + /* + * Find the resource associated with the first label in the set + * per the v1.2 namespace specification. 
+ */ + for (i = 0; i < nsblk->num_resources; i++) { + struct resource *min = nsblk->res[min_dpa_idx]; + + res = nsblk->res[i]; + if (res->start < min->start) + min_dpa_idx = i; + } + for (i = 0; i < nsblk->num_resources; i++) { size_t offset; @@ -714,25 +868,58 @@ static int __blk_label_update(struct nd_region *nd_region, goto abort; dev_dbg(ndd->dev, "%s: allocated: %d\n", __func__, slot); - nd_label = nd_label_base(ndd) + slot; - memset(nd_label, 0, sizeof(struct nd_namespace_label)); + nd_label = to_label(ndd, slot); + memset(nd_label, 0, sizeof_namespace_label(ndd)); memcpy(nd_label->uuid, nsblk->uuid, NSLABEL_UUID_LEN); if (nsblk->alt_name) memcpy(nd_label->name, nsblk->alt_name, NSLABEL_NAME_LEN); nd_label->flags = __cpu_to_le32(NSLABEL_FLAG_LOCAL); - nd_label->nlabel = __cpu_to_le16(0); /* N/A */ - nd_label->position = __cpu_to_le16(0); /* N/A */ - nd_label->isetcookie = __cpu_to_le64(0); /* N/A */ + + /* + * Use the presence of the type_guid as a flag to + * determine isetcookie usage and nlabel + position + * policy for blk-aperture namespaces. 
+ */ + if (namespace_label_has(ndd, type_guid)) { + if (i == min_dpa_idx) { + nd_label->nlabel = __cpu_to_le16(nsblk->num_resources); + nd_label->position = __cpu_to_le16(0); + } else { + nd_label->nlabel = __cpu_to_le16(0xffff); + nd_label->position = __cpu_to_le16(0xffff); + } + nd_label->isetcookie = __cpu_to_le64(nd_set->cookie2); + } else { + nd_label->nlabel = __cpu_to_le16(0); /* N/A */ + nd_label->position = __cpu_to_le16(0); /* N/A */ + nd_label->isetcookie = __cpu_to_le64(0); /* N/A */ + } + nd_label->dpa = __cpu_to_le64(res->start); nd_label->rawsize = __cpu_to_le64(resource_size(res)); nd_label->lbasize = __cpu_to_le64(nsblk->lbasize); nd_label->slot = __cpu_to_le32(slot); + if (namespace_label_has(ndd, type_guid)) + guid_copy(&nd_label->type_guid, &nd_set->type_guid); + if (namespace_label_has(ndd, abstraction_guid)) + guid_copy(&nd_label->abstraction_guid, + to_abstraction_guid(ndns->claim_class, + &nd_label->abstraction_guid)); + + if (namespace_label_has(ndd, checksum)) { + u64 sum; + + nd_label->checksum = __cpu_to_le64(0); + sum = nd_fletcher64(nd_label, + sizeof_namespace_label(ndd), 1); + nd_label->checksum = __cpu_to_le64(sum); + } /* update label */ offset = nd_label_offset(ndd, nd_label); rc = nvdimm_set_config_data(ndd, offset, nd_label, - sizeof(struct nd_namespace_label)); + sizeof_namespace_label(ndd)); if (rc < 0) goto abort; } @@ -790,7 +977,7 @@ static int __blk_label_update(struct nd_region *nd_region, goto out; } for_each_clear_bit_le(slot, free, nslot) { - nd_label = nd_label_base(ndd) + slot; + nd_label = to_label(ndd, slot); memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN); if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0) continue; @@ -973,3 +1160,13 @@ int nd_blk_namespace_label_update(struct nd_region *nd_region, return __blk_label_update(nd_region, nd_mapping, nsblk, count); } + +int __init nd_label_init(void) +{ + WARN_ON(guid_parse(NVDIMM_BTT_GUID, &nvdimm_btt_guid)); + WARN_ON(guid_parse(NVDIMM_BTT2_GUID, 
&nvdimm_btt2_guid)); + WARN_ON(guid_parse(NVDIMM_PFN_GUID, &nvdimm_pfn_guid)); + WARN_ON(guid_parse(NVDIMM_DAX_GUID, &nvdimm_dax_guid)); + + return 0; +} diff --git a/drivers/nvdimm/label.h b/drivers/nvdimm/label.h index a59ef6e..1ebf4d3 100644 --- a/drivers/nvdimm/label.h +++ b/drivers/nvdimm/label.h @@ -15,6 +15,7 @@ #include <linux/ndctl.h> #include <linux/sizes.h> +#include <linux/uuid.h> #include <linux/io.h> enum { @@ -60,7 +61,8 @@ static const char NSINDEX_SIGNATURE[] = "NAMESPACE_INDEX\0"; */ struct nd_namespace_index { u8 sig[NSINDEX_SIG_LEN]; - __le32 flags; + u8 flags[3]; + u8 labelsize; __le32 seq; __le64 myoff; __le64 mysize; @@ -98,9 +100,23 @@ struct nd_namespace_label { __le64 dpa; __le64 rawsize; __le32 slot; - __le32 unused; + /* + * Accessing fields past this point should be gated by a + * namespace_label_has() check. + */ + u8 align; + u8 reserved[3]; + guid_t type_guid; + guid_t abstraction_guid; + u8 reserved2[88]; + __le64 checksum; }; +#define NVDIMM_BTT_GUID "8aed63a2-29a2-4c66-8b12-f05d15d3922a" +#define NVDIMM_BTT2_GUID "18633bfc-1735-4217-8ac9-17239282d3f8" +#define NVDIMM_PFN_GUID "266400ba-fb9f-4677-bcb0-968f11d0d225" +#define NVDIMM_DAX_GUID "97a86d9c-3cdd-4eda-986f-5068b4f80088" + /** * struct nd_label_id - identifier string for dpa allocation * @id: "{blk|pmem}-<namespace uuid>" @@ -131,6 +147,7 @@ struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n); u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd); bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot); u32 nd_label_nfree(struct nvdimm_drvdata *ndd); +enum nvdimm_claim_class to_nvdimm_cclass(guid_t *guid); struct nd_region; struct nd_namespace_pmem; struct nd_namespace_blk; diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 2f9dfbd..5f1c675 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -14,10 +14,10 @@ #include <linux/device.h> #include <linux/sort.h> #include 
<linux/slab.h> -#include <linux/pmem.h> #include <linux/list.h> #include <linux/nd.h> #include "nd-core.h" +#include "pmem.h" #include "nd.h" static void namespace_io_release(struct device *dev) @@ -112,7 +112,7 @@ static int is_uuid_busy(struct device *dev, void *data) static int is_namespace_uuid_busy(struct device *dev, void *data) { - if (is_nd_pmem(dev) || is_nd_blk(dev)) + if (is_nd_region(dev)) return device_for_each_child(dev, data, is_uuid_busy); return 0; } @@ -155,14 +155,33 @@ bool pmem_should_map_pages(struct device *dev) IORES_DESC_NONE) == REGION_MIXED) return false; -#ifdef ARCH_MEMREMAP_PMEM return ARCH_MEMREMAP_PMEM == MEMREMAP_WB; -#else - return false; -#endif } EXPORT_SYMBOL(pmem_should_map_pages); +unsigned int pmem_sector_size(struct nd_namespace_common *ndns) +{ + if (is_namespace_pmem(&ndns->dev)) { + struct nd_namespace_pmem *nspm; + + nspm = to_nd_namespace_pmem(&ndns->dev); + if (nspm->lbasize == 0 || nspm->lbasize == 512) + /* default */; + else if (nspm->lbasize == 4096) + return 4096; + else + dev_WARN(&ndns->dev, "unsupported sector size: %ld\n", + nspm->lbasize); + } + + /* + * There is no namespace label (is_namespace_io()), or the label + * indicates the default sector size. 
+ */ + return 512; +} +EXPORT_SYMBOL(pmem_sector_size); + const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, char *name) { @@ -787,7 +806,7 @@ static int __reserve_free_pmem(struct device *dev, void *data) struct nd_label_id label_id; int i; - if (!is_nd_pmem(dev)) + if (!is_memory(dev)) return 0; nd_region = to_nd_region(dev); @@ -1283,28 +1302,49 @@ static ssize_t resource_show(struct device *dev, } static DEVICE_ATTR_RO(resource); -static const unsigned long ns_lbasize_supported[] = { 512, 520, 528, +static const unsigned long blk_lbasize_supported[] = { 512, 520, 528, 4096, 4104, 4160, 4224, 0 }; +static const unsigned long pmem_lbasize_supported[] = { 512, 4096, 0 }; + static ssize_t sector_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); - if (!is_namespace_blk(dev)) - return -ENXIO; + return nd_sector_size_show(nsblk->lbasize, + blk_lbasize_supported, buf); + } + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); - return nd_sector_size_show(nsblk->lbasize, ns_lbasize_supported, buf); + return nd_sector_size_show(nspm->lbasize, + pmem_lbasize_supported, buf); + } + return -ENXIO; } static ssize_t sector_size_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { - struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); struct nd_region *nd_region = to_nd_region(dev->parent); + const unsigned long *supported; + unsigned long *lbasize; ssize_t rc = 0; - if (!is_namespace_blk(dev)) + if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + lbasize = &nsblk->lbasize; + supported = blk_lbasize_supported; + } else if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + lbasize = &nspm->lbasize; + supported = 
pmem_lbasize_supported; + } else return -ENXIO; device_lock(dev); @@ -1312,8 +1352,7 @@ static ssize_t sector_size_store(struct device *dev, if (to_ndns(dev)->claim) rc = -EBUSY; if (rc >= 0) - rc = nd_sector_size_store(dev, buf, &nsblk->lbasize, - ns_lbasize_supported); + rc = nd_sector_size_store(dev, buf, lbasize, supported); if (rc >= 0) rc = nd_namespace_label_update(nd_region, dev); dev_dbg(dev, "%s: result: %zd %s: %s%s", __func__, @@ -1368,6 +1407,58 @@ static ssize_t dpa_extents_show(struct device *dev, } static DEVICE_ATTR_RO(dpa_extents); +static int btt_claim_class(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + int i, loop_bitmask = 0; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_namespace_index *nsindex; + + nsindex = to_namespace_index(ndd, ndd->ns_current); + if (nsindex == NULL) + loop_bitmask |= 1; + else { + /* check whether existing labels are v1.1 or v1.2 */ + if (__le16_to_cpu(nsindex->major) == 1 + && __le16_to_cpu(nsindex->minor) == 1) + loop_bitmask |= 2; + else + loop_bitmask |= 4; + } + } + /* + * If nsindex is null loop_bitmask's bit 0 will be set, and if an index + * block is found, a v1.1 label for any mapping will set bit 1, and a + * v1.2 label will set bit 2. + * + * At the end of the loop, at most one of the three bits must be set. + * If multiple bits were set, it means the different mappings disagree + * about their labels, and this must be cleaned up first. 
+ * + * If all the label index blocks are found to agree, nsindex of NULL + * implies labels haven't been initialized yet, and when they will, + * they will be of the 1.2 format, so we can assume BTT2.0 + * + * If 1.1 labels are found, we enforce BTT1.1, and if 1.2 labels are + * found, we enforce BTT2.0 + * + * If the loop was never entered, default to BTT1.1 (legacy namespaces) + */ + switch (loop_bitmask) { + case 0: + case 2: + return NVDIMM_CCLASS_BTT; + case 1: + case 4: + return NVDIMM_CCLASS_BTT2; + default: + return -ENXIO; + } +} + static ssize_t holder_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1382,6 +1473,74 @@ static ssize_t holder_show(struct device *dev, } static DEVICE_ATTR_RO(holder); +static ssize_t __holder_class_store(struct device *dev, const char *buf) +{ + struct nd_namespace_common *ndns = to_ndns(dev); + + if (dev->driver || ndns->claim) + return -EBUSY; + + if (strcmp(buf, "btt") == 0 || strcmp(buf, "btt\n") == 0) + ndns->claim_class = btt_claim_class(dev); + else if (strcmp(buf, "pfn") == 0 || strcmp(buf, "pfn\n") == 0) + ndns->claim_class = NVDIMM_CCLASS_PFN; + else if (strcmp(buf, "dax") == 0 || strcmp(buf, "dax\n") == 0) + ndns->claim_class = NVDIMM_CCLASS_DAX; + else if (strcmp(buf, "") == 0 || strcmp(buf, "\n") == 0) + ndns->claim_class = NVDIMM_CCLASS_NONE; + else + return -EINVAL; + + /* btt_claim_class() could've returned an error */ + if (ndns->claim_class < 0) + return ndns->claim_class; + + return 0; +} + +static ssize_t holder_class_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + rc = __holder_class_store(dev, buf); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); + dev_dbg(dev, "%s: %s(%zd)\n", __func__, rc < 0 ? 
"fail " : "", rc); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc < 0 ? rc : len; +} + +static ssize_t holder_class_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_namespace_common *ndns = to_ndns(dev); + ssize_t rc; + + device_lock(dev); + if (ndns->claim_class == NVDIMM_CCLASS_NONE) + rc = sprintf(buf, "\n"); + else if ((ndns->claim_class == NVDIMM_CCLASS_BTT) || + (ndns->claim_class == NVDIMM_CCLASS_BTT2)) + rc = sprintf(buf, "btt\n"); + else if (ndns->claim_class == NVDIMM_CCLASS_PFN) + rc = sprintf(buf, "pfn\n"); + else if (ndns->claim_class == NVDIMM_CCLASS_DAX) + rc = sprintf(buf, "dax\n"); + else + rc = sprintf(buf, "<unknown>\n"); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RW(holder_class); + static ssize_t mode_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1440,6 +1599,7 @@ static struct attribute *nd_namespace_attributes[] = { &dev_attr_force_raw.attr, &dev_attr_sector_size.attr, &dev_attr_dpa_extents.attr, + &dev_attr_holder_class.attr, NULL, }; @@ -1458,14 +1618,12 @@ static umode_t namespace_visible(struct kobject *kobj, if (a == &dev_attr_size.attr) return 0644; - if (is_namespace_pmem(dev) && a == &dev_attr_sector_size.attr) - return 0; - return a->mode; } if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr || a == &dev_attr_holder.attr + || a == &dev_attr_holder_class.attr || a == &dev_attr_force_raw.attr || a == &dev_attr_mode.attr) return a->mode; @@ -1599,6 +1757,8 @@ static bool has_uuid_at_pos(struct nd_region *nd_region, u8 *uuid, for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nd_interleave_set *nd_set = nd_region->nd_set; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); struct nd_label_ent *label_ent; bool found_uuid = false; @@ -1619,8 +1779,17 @@ static bool has_uuid_at_pos(struct nd_region *nd_region, u8 *uuid, if (memcmp(nd_label->uuid, uuid, NSLABEL_UUID_LEN) != 0) 
continue; + if (namespace_label_has(ndd, type_guid) + && !guid_equal(&nd_set->type_guid, + &nd_label->type_guid)) { + dev_dbg(ndd->dev, "expect type_guid %pUb got %pUb\n", + nd_set->type_guid.b, + nd_label->type_guid.b); + continue; + } + if (found_uuid) { - dev_dbg(to_ndd(nd_mapping)->dev, + dev_dbg(ndd->dev, "%s duplicate entry for uuid\n", __func__); return false; @@ -1698,10 +1867,11 @@ static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id) * @nd_label: target pmem namespace label to evaluate */ struct device *create_namespace_pmem(struct nd_region *nd_region, + struct nd_namespace_index *nsindex, struct nd_namespace_label *nd_label) { + u64 cookie = nd_region_interleave_set_cookie(nd_region, nsindex); u64 altcookie = nd_region_interleave_set_altcookie(nd_region); - u64 cookie = nd_region_interleave_set_cookie(nd_region); struct nd_label_ent *label_ent; struct nd_namespace_pmem *nspm; struct nd_mapping *nd_mapping; @@ -1775,6 +1945,7 @@ struct device *create_namespace_pmem(struct nd_region *nd_region, /* Calculate total size and populate namespace properties from label0 */ for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_namespace_label *label0; + struct nvdimm_drvdata *ndd; nd_mapping = &nd_region->mapping[i]; label_ent = list_first_entry_or_null(&nd_mapping->labels, @@ -1794,6 +1965,12 @@ struct device *create_namespace_pmem(struct nd_region *nd_region, NSLABEL_NAME_LEN, GFP_KERNEL); nspm->uuid = kmemdup((void __force *) label0->uuid, NSLABEL_UUID_LEN, GFP_KERNEL); + nspm->lbasize = __le64_to_cpu(label0->lbasize); + ndd = to_ndd(nd_mapping); + if (namespace_label_has(ndd, abstraction_guid)) + nspm->nsio.common.claim_class + = to_nvdimm_cclass(&label0->abstraction_guid); + } if (!nspm->alt_name || !nspm->uuid) { @@ -1876,7 +2053,7 @@ static struct device *nd_namespace_pmem_create(struct nd_region *nd_region) struct resource *res; struct device *dev; - if (!is_nd_pmem(&nd_region->dev)) + if (!is_memory(&nd_region->dev)) return NULL; nspm = 
kzalloc(sizeof(*nspm), GFP_KERNEL); @@ -2005,12 +2182,29 @@ struct device *create_namespace_blk(struct nd_region *nd_region, { struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nd_interleave_set *nd_set = nd_region->nd_set; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); struct nd_namespace_blk *nsblk; char name[NSLABEL_NAME_LEN]; struct device *dev = NULL; struct resource *res; + if (namespace_label_has(ndd, type_guid)) { + if (!guid_equal(&nd_set->type_guid, &nd_label->type_guid)) { + dev_dbg(ndd->dev, "expect type_guid %pUb got %pUb\n", + nd_set->type_guid.b, + nd_label->type_guid.b); + return ERR_PTR(-EAGAIN); + } + + if (nd_label->isetcookie != __cpu_to_le64(nd_set->cookie2)) { + dev_dbg(ndd->dev, "expect cookie %#llx got %#llx\n", + nd_set->cookie2, + __le64_to_cpu(nd_label->isetcookie)); + return ERR_PTR(-EAGAIN); + } + } + nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); if (!nsblk) return ERR_PTR(-ENOMEM); @@ -2021,6 +2215,9 @@ struct device *create_namespace_blk(struct nd_region *nd_region, nsblk->lbasize = __le64_to_cpu(nd_label->lbasize); nsblk->uuid = kmemdup(nd_label->uuid, NSLABEL_UUID_LEN, GFP_KERNEL); + if (namespace_label_has(ndd, abstraction_guid)) + nsblk->common.claim_class + = to_nvdimm_cclass(&nd_label->abstraction_guid); if (!nsblk->uuid) goto blk_err; memcpy(name, nd_label->name, NSLABEL_NAME_LEN); @@ -2102,27 +2299,30 @@ static struct device **scan_labels(struct nd_region *nd_region) kfree(devs); devs = __devs; - if (is_nd_blk(&nd_region->dev)) { + if (is_nd_blk(&nd_region->dev)) dev = create_namespace_blk(nd_region, nd_label, count); - if (IS_ERR(dev)) + else { + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_namespace_index *nsindex; + + nsindex = to_namespace_index(ndd, ndd->ns_current); + dev = create_namespace_pmem(nd_region, nsindex, nd_label); + } + + if (IS_ERR(dev)) { + switch (PTR_ERR(dev)) { + case -EAGAIN: + /* skip invalid labels */ + continue; + case -ENODEV: + /* fallthrough to seed creation */ + 
break; + default: goto err; + } + } else devs[count++] = dev; - } else { - dev = create_namespace_pmem(nd_region, nd_label); - if (IS_ERR(dev)) { - switch (PTR_ERR(dev)) { - case -EAGAIN: - /* skip invalid labels */ - continue; - case -ENODEV: - /* fallthrough to seed creation */ - break; - default: - goto err; - } - } else - devs[count++] = dev; - } + } dev_dbg(&nd_region->dev, "%s: discovered %d %s namespace%s\n", @@ -2156,7 +2356,7 @@ static struct device **scan_labels(struct nd_region *nd_region) } dev->parent = &nd_region->dev; devs[count++] = dev; - } else if (is_nd_pmem(&nd_region->dev)) { + } else if (is_memory(&nd_region->dev)) { /* clean unselected labels */ for (i = 0; i < nd_region->ndr_mappings; i++) { struct list_head *l, *e; diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 4c4bd20..86bc19a 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -64,7 +64,16 @@ struct blk_alloc_info { bool is_nvdimm(struct device *dev); bool is_nd_pmem(struct device *dev); +bool is_nd_volatile(struct device *dev); bool is_nd_blk(struct device *dev); +static inline bool is_nd_region(struct device *dev) +{ + return is_nd_pmem(dev) || is_nd_blk(dev) || is_nd_volatile(dev); +} +static inline bool is_memory(struct device *dev) +{ + return is_nd_pmem(dev) || is_nd_volatile(dev); +} struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev); int __init nvdimm_bus_init(void); void nvdimm_bus_exit(void); diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 03852d7..e1b5715 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -42,7 +42,7 @@ struct nd_poison { struct nvdimm_drvdata { struct device *dev; - int nsindex_size; + int nsindex_size, nslabel_size; struct nd_cmd_get_config_size nsarea; void *data; int ns_current, ns_next; @@ -96,6 +96,12 @@ static inline struct nd_namespace_index *to_next_namespace_index( return to_namespace_index(ndd, ndd->ns_next); } +unsigned sizeof_namespace_label(struct nvdimm_drvdata *ndd); + 
+#define namespace_label_has(ndd, field) \ + (offsetof(struct nd_namespace_label, field) \ + < sizeof_namespace_label(ndd)) + #define nd_dbg_dpa(r, d, res, fmt, arg...) \ dev_dbg((r) ? &(r)->dev : (d)->dev, "%s: %.13s: %#llx @ %#llx " fmt, \ (r) ? dev_name((d)->dev) : "", res ? res->name : "null", \ @@ -155,6 +161,7 @@ struct nd_region { u64 ndr_start; int id, num_lanes, ro, numa_node; void *provider_data; + struct kernfs_node *bb_state; struct badblocks bb; struct nd_interleave_set *nd_set; struct nd_percpu_lane __percpu *lane; @@ -188,6 +195,9 @@ struct nd_btt { u64 size; u8 *uuid; int id; + int initial_offset; + u16 version_major; + u16 version_minor; }; enum nd_pfn_mode { @@ -229,6 +239,7 @@ ssize_t nd_sector_size_store(struct device *dev, const char *buf, unsigned long *current_lbasize, const unsigned long *supported); int __init nvdimm_init(void); int __init nd_region_init(void); +int __init nd_label_init(void); void nvdimm_exit(void); void nd_region_exit(void); struct nvdimm; @@ -330,7 +341,8 @@ static inline struct device *nd_dax_create(struct nd_region *nd_region) struct nd_region *to_nd_region(struct device *dev); int nd_region_to_nstype(struct nd_region *nd_region); int nd_region_register_namespaces(struct nd_region *nd_region, int *err); -u64 nd_region_interleave_set_cookie(struct nd_region *nd_region); +u64 nd_region_interleave_set_cookie(struct nd_region *nd_region, + struct nd_namespace_index *nsindex); u64 nd_region_interleave_set_altcookie(struct nd_region *nd_region); void nvdimm_bus_lock(struct device *dev); void nvdimm_bus_unlock(struct device *dev); @@ -349,6 +361,7 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns); int nvdimm_namespace_detach_btt(struct nd_btt *nd_btt); const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, char *name); +unsigned int pmem_sector_size(struct nd_namespace_common *ndns); void nvdimm_badblocks_populate(struct nd_region *nd_region, struct badblocks *bb, const struct resource 
*res); #if IS_ENABLED(CONFIG_ND_CLAIM) diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index a6c4036..5fcb6f5 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -331,7 +331,7 @@ struct device *nd_pfn_create(struct nd_region *nd_region) struct nd_pfn *nd_pfn; struct device *dev; - if (!is_nd_pmem(&nd_region->dev)) + if (!is_memory(&nd_region->dev)) return NULL; nd_pfn = nd_pfn_alloc(nd_region); @@ -354,7 +354,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) if (!pfn_sb || !ndns) return -ENODEV; - if (!is_nd_pmem(nd_pfn->dev.parent)) + if (!is_memory(nd_pfn->dev.parent)) return -ENODEV; if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb), 0)) @@ -471,6 +471,14 @@ int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns) if (ndns->force_raw) return -ENODEV; + switch (ndns->claim_class) { + case NVDIMM_CCLASS_NONE: + case NVDIMM_CCLASS_PFN: + break; + default: + return -ENODEV; + } + nvdimm_bus_lock(&ndns->dev); nd_pfn = nd_pfn_alloc(nd_region); pfn_dev = nd_pfn_devinit(nd_pfn, ndns); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 6b577af..f7099ada 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -28,7 +28,7 @@ #include <linux/blk-mq.h> #include <linux/pfn_t.h> #include <linux/slab.h> -#include <linux/pmem.h> +#include <linux/uio.h> #include <linux/dax.h> #include <linux/nd.h> #include "pmem.h" @@ -68,9 +68,11 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem, (unsigned long long) sector, cleared, cleared > 1 ? 
"s" : ""); badblocks_clear(&pmem->bb, sector, cleared); + if (pmem->bb_state) + sysfs_notify_dirent(pmem->bb_state); } - invalidate_pmem(pmem->virt_addr + offset, len); + arch_invalidate_pmem(pmem->virt_addr + offset, len); return rc; } @@ -80,7 +82,7 @@ static void write_pmem(void *pmem_addr, struct page *page, { void *mem = kmap_atomic(page); - memcpy_to_pmem(pmem_addr, mem + off, len); + memcpy_flushcache(pmem_addr, mem + off, len); kunmap_atomic(mem); } @@ -235,8 +237,27 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev, return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn); } +static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + return copy_from_iter_flushcache(addr, bytes, i); +} + +static void pmem_dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t size) +{ + arch_wb_cache_pmem(addr, size); +} + static const struct dax_operations pmem_dax_ops = { .direct_access = pmem_dax_direct_access, + .copy_from_iter = pmem_copy_from_iter, + .flush = pmem_dax_flush, +}; + +static const struct attribute_group *pmem_attribute_groups[] = { + &dax_attribute_group, + NULL, }; static void pmem_release_queue(void *q) @@ -265,14 +286,15 @@ static int pmem_attach_disk(struct device *dev, struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); struct nd_region *nd_region = to_nd_region(dev->parent); struct vmem_altmap __altmap, *altmap = NULL; + int nid = dev_to_node(dev), fua, wbc; struct resource *res = &nsio->res; struct nd_pfn *nd_pfn = NULL; struct dax_device *dax_dev; - int nid = dev_to_node(dev); struct nd_pfn_sb *pfn_sb; struct pmem_device *pmem; struct resource pfn_res; struct request_queue *q; + struct device *gendev; struct gendisk *disk; void *addr; @@ -294,8 +316,12 @@ static int pmem_attach_disk(struct device *dev, dev_set_drvdata(dev, pmem); pmem->phys_addr = res->start; pmem->size = resource_size(res); - if (nvdimm_has_flush(nd_region) 
< 0) + fua = nvdimm_has_flush(nd_region); + if (!IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) || fua < 0) { dev_warn(dev, "unable to guarantee persistence of writes\n"); + fua = 0; + } + wbc = nvdimm_has_cache(nd_region); if (!devm_request_mem_region(dev, res->start, resource_size(res), dev_name(&ndns->dev))) { @@ -339,9 +365,10 @@ static int pmem_attach_disk(struct device *dev, return PTR_ERR(addr); pmem->virt_addr = addr; - blk_queue_write_cache(q, true, true); + blk_queue_write_cache(q, wbc, fua); blk_queue_make_request(q, pmem_make_request); blk_queue_physical_block_size(q, PAGE_SIZE); + blk_queue_logical_block_size(q, pmem_sector_size(ndns)); blk_queue_max_hw_sectors(q, UINT_MAX); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); queue_flag_set_unlocked(QUEUE_FLAG_DAX, q); @@ -368,14 +395,23 @@ static int pmem_attach_disk(struct device *dev, put_disk(disk); return -ENOMEM; } + dax_write_cache(dax_dev, wbc); pmem->dax_dev = dax_dev; + gendev = disk_to_dev(disk); + gendev->groups = pmem_attribute_groups; + device_add_disk(dev, disk); if (devm_add_action_or_reset(dev, pmem_release_disk, pmem)) return -ENOMEM; revalidate_disk(disk); + pmem->bb_state = sysfs_get_dirent(disk_to_dev(disk)->kobj.sd, + "badblocks"); + if (!pmem->bb_state) + dev_warn(dev, "'badblocks' notification disabled\n"); + return 0; } @@ -407,8 +443,18 @@ static int nd_pmem_probe(struct device *dev) static int nd_pmem_remove(struct device *dev) { + struct pmem_device *pmem = dev_get_drvdata(dev); + if (is_nd_btt(dev)) nvdimm_namespace_detach_btt(to_nd_btt(dev)); + else { + /* + * Note, this assumes device_lock() context to not race + * nd_pmem_notify() + */ + sysfs_put(pmem->bb_state); + pmem->bb_state = NULL; + } nvdimm_flush(to_nd_region(dev->parent)); return 0; @@ -427,6 +473,7 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event) struct nd_namespace_io *nsio; struct resource res; struct badblocks *bb; + struct kernfs_node *bb_state; if (event != NVDIMM_REVALIDATE_POISON) 
return; @@ -438,11 +485,13 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event) nd_region = to_nd_region(ndns->dev.parent); nsio = to_nd_namespace_io(&ndns->dev); bb = &nsio->bb; + bb_state = NULL; } else { struct pmem_device *pmem = dev_get_drvdata(dev); nd_region = to_region(pmem); bb = &pmem->bb; + bb_state = pmem->bb_state; if (is_nd_pfn(dev)) { struct nd_pfn *nd_pfn = to_nd_pfn(dev); @@ -462,6 +511,8 @@ static void nd_pmem_notify(struct device *dev, enum nvdimm_event event) res.start = nsio->res.start + offset; res.end = nsio->res.end - end_trunc; nvdimm_badblocks_populate(nd_region, bb, &res); + if (bb_state) + sysfs_notify_dirent(bb_state); } MODULE_ALIAS("pmem"); diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h index 7f4dbd7..5434321 100644 --- a/drivers/nvdimm/pmem.h +++ b/drivers/nvdimm/pmem.h @@ -5,6 +5,20 @@ #include <linux/pfn_t.h> #include <linux/fs.h> +#ifdef CONFIG_ARCH_HAS_PMEM_API +#define ARCH_MEMREMAP_PMEM MEMREMAP_WB +void arch_wb_cache_pmem(void *addr, size_t size); +void arch_invalidate_pmem(void *addr, size_t size); +#else +#define ARCH_MEMREMAP_PMEM MEMREMAP_WT +static inline void arch_wb_cache_pmem(void *addr, size_t size) +{ +} +static inline void arch_invalidate_pmem(void *addr, size_t size) +{ +} +#endif + /* this definition is in it's own header for tools/testing/nvdimm to consume */ struct pmem_device { /* One contiguous memory region per device */ @@ -17,6 +31,7 @@ struct pmem_device { size_t size; /* trim size when namespace capacity has been section aligned */ u32 pfn_pad; + struct kernfs_node *bb_state; struct badblocks bb; struct dax_device *dax_dev; struct gendisk *disk; diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index 869a886..034f0a0 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -58,10 +58,14 @@ static int nd_region_probe(struct device *dev) if (devm_init_badblocks(dev, &nd_region->bb)) return -ENODEV; + nd_region->bb_state = 
sysfs_get_dirent(nd_region->dev.kobj.sd, + "badblocks"); + if (!nd_region->bb_state) + dev_warn(&nd_region->dev, + "'badblocks' notification disabled\n"); ndr_res.start = nd_region->ndr_start; ndr_res.end = nd_region->ndr_start + nd_region->ndr_size - 1; - nvdimm_badblocks_populate(nd_region, - &nd_region->bb, &ndr_res); + nvdimm_badblocks_populate(nd_region, &nd_region->bb, &ndr_res); } nd_region->btt_seed = nd_btt_create(nd_region); @@ -105,6 +109,13 @@ static int nd_region_remove(struct device *dev) dev_set_drvdata(dev, NULL); nvdimm_bus_unlock(dev); + /* + * Note, this assumes device_lock() context to not race + * nd_region_notify() + */ + sysfs_put(nd_region->bb_state); + nd_region->bb_state = NULL; + return 0; } @@ -126,6 +137,8 @@ static void nd_region_notify(struct device *dev, enum nvdimm_event event) nd_region->ndr_size - 1; nvdimm_badblocks_populate(nd_region, &nd_region->bb, &res); + if (nd_region->bb_state) + sysfs_notify_dirent(nd_region->bb_state); } } device_for_each_child(dev, &event, child_notify); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index b550edf..5954cfb 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -15,7 +15,6 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/hash.h> -#include <linux/pmem.h> #include <linux/sort.h> #include <linux/io.h> #include <linux/nd.h> @@ -169,6 +168,11 @@ bool is_nd_blk(struct device *dev) return dev ? dev->type == &nd_blk_device_type : false; } +bool is_nd_volatile(struct device *dev) +{ + return dev ? 
dev->type == &nd_volatile_device_type : false; +} + struct nd_region *to_nd_region(struct device *dev) { struct nd_region *nd_region = container_of(dev, struct nd_region, dev); @@ -215,7 +219,7 @@ EXPORT_SYMBOL_GPL(nd_blk_region_set_provider_data); */ int nd_region_to_nstype(struct nd_region *nd_region) { - if (is_nd_pmem(&nd_region->dev)) { + if (is_memory(&nd_region->dev)) { u16 i, alias; for (i = 0, alias = 0; i < nd_region->ndr_mappings; i++) { @@ -243,7 +247,7 @@ static ssize_t size_show(struct device *dev, struct nd_region *nd_region = to_nd_region(dev); unsigned long long size = 0; - if (is_nd_pmem(dev)) { + if (is_memory(dev)) { size = nd_region->ndr_size; } else if (nd_region->ndr_mappings == 1) { struct nd_mapping *nd_mapping = &nd_region->mapping[0]; @@ -307,13 +311,41 @@ static ssize_t set_cookie_show(struct device *dev, { struct nd_region *nd_region = to_nd_region(dev); struct nd_interleave_set *nd_set = nd_region->nd_set; + ssize_t rc = 0; - if (is_nd_pmem(dev) && nd_set) + if (is_memory(dev) && nd_set) /* pass, should be precluded by region_visible */; else return -ENXIO; - return sprintf(buf, "%#llx\n", nd_set->cookie); + /* + * The cookie to show depends on which specification of the + * labels we are using. If there are not labels then default to + * the v1.1 namespace label cookie definition. To read all this + * data we need to wait for probing to settle. 
+ */ + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + if (nd_region->ndr_mappings) { + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + + if (ndd) { + struct nd_namespace_index *nsindex; + + nsindex = to_namespace_index(ndd, ndd->ns_current); + rc = sprintf(buf, "%#llx\n", + nd_region_interleave_set_cookie(nd_region, + nsindex)); + } + } + nvdimm_bus_unlock(dev); + device_unlock(dev); + + if (rc) + return rc; + return sprintf(buf, "%#llx\n", nd_set->cookie1); } static DEVICE_ATTR_RO(set_cookie); @@ -335,7 +367,7 @@ resource_size_t nd_region_available_dpa(struct nd_region *nd_region) if (!ndd) return 0; - if (is_nd_pmem(&nd_region->dev)) { + if (is_memory(&nd_region->dev)) { available += nd_pmem_available_dpa(nd_region, nd_mapping, &overlap); if (overlap > blk_max_overlap) { @@ -521,10 +553,10 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n) struct nd_interleave_set *nd_set = nd_region->nd_set; int type = nd_region_to_nstype(nd_region); - if (!is_nd_pmem(dev) && a == &dev_attr_pfn_seed.attr) + if (!is_memory(dev) && a == &dev_attr_pfn_seed.attr) return 0; - if (!is_nd_pmem(dev) && a == &dev_attr_dax_seed.attr) + if (!is_memory(dev) && a == &dev_attr_dax_seed.attr) return 0; if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr) @@ -552,7 +584,7 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n) || type == ND_DEVICE_NAMESPACE_BLK) && a == &dev_attr_available_size.attr) return a->mode; - else if (is_nd_pmem(dev) && nd_set) + else if (is_memory(dev) && nd_set) return a->mode; return 0; @@ -564,13 +596,18 @@ struct attribute_group nd_region_attribute_group = { }; EXPORT_SYMBOL_GPL(nd_region_attribute_group); -u64 nd_region_interleave_set_cookie(struct nd_region *nd_region) +u64 nd_region_interleave_set_cookie(struct nd_region *nd_region, + struct nd_namespace_index *nsindex) { struct nd_interleave_set *nd_set = 
nd_region->nd_set; - if (nd_set) - return nd_set->cookie; - return 0; + if (!nd_set) + return 0; + + if (nsindex && __le16_to_cpu(nsindex->major) == 1 + && __le16_to_cpu(nsindex->minor) == 1) + return nd_set->cookie1; + return nd_set->cookie2; } u64 nd_region_interleave_set_altcookie(struct nd_region *nd_region) @@ -604,7 +641,7 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, { struct nd_region *nd_region; - if (!probe && (is_nd_pmem(dev) || is_nd_blk(dev))) { + if (!probe && is_nd_region(dev)) { int i; nd_region = to_nd_region(dev); @@ -622,12 +659,8 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, if (ndd) atomic_dec(&nvdimm->busy); } - - if (is_nd_pmem(dev)) - return; } - if (dev->parent && (is_nd_blk(dev->parent) || is_nd_pmem(dev->parent)) - && probe) { + if (dev->parent && is_nd_region(dev->parent) && probe) { nd_region = to_nd_region(dev->parent); nvdimm_bus_lock(dev); if (nd_region->ns_seed == dev) @@ -800,7 +833,7 @@ int nd_blk_region_init(struct nd_region *nd_region) return 0; if (nd_region->ndr_mappings < 1) { - dev_err(dev, "invalid BLK region\n"); + dev_dbg(dev, "invalid BLK region\n"); return -ENXIO; } @@ -1015,8 +1048,8 @@ void nvdimm_flush(struct nd_region *nd_region) * The first wmb() is needed to 'sfence' all previous writes * such that they are architecturally visible for the platform * buffer flush. Note that we've already arranged for pmem - * writes to avoid the cache via arch_memcpy_to_pmem(). The - * final wmb() ensures ordering for the NVDIMM flush write. + * writes to avoid the cache via memcpy_flushcache(). The final + * wmb() ensures ordering for the NVDIMM flush write. 
*/ wmb(); for (i = 0; i < nd_region->ndr_mappings; i++) @@ -1038,8 +1071,9 @@ int nvdimm_has_flush(struct nd_region *nd_region) { int i; - /* no nvdimm == flushing capability unknown */ - if (nd_region->ndr_mappings == 0) + /* no nvdimm or pmem api == flushing capability unknown */ + if (nd_region->ndr_mappings == 0 + || !IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API)) return -ENXIO; for (i = 0; i < nd_region->ndr_mappings; i++) { @@ -1059,6 +1093,12 @@ int nvdimm_has_flush(struct nd_region *nd_region) } EXPORT_SYMBOL_GPL(nvdimm_has_flush); +int nvdimm_has_cache(struct nd_region *nd_region) +{ + return is_nd_pmem(&nd_region->dev); +} +EXPORT_SYMBOL_GPL(nvdimm_has_cache); + void __exit nd_region_devs_exit(void) { ida_destroy(&region_ida); diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 06eb1de..68bae4f 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -18,6 +18,7 @@ #include <linux/interrupt.h> #include <linux/platform_device.h> #include <linux/pfn_t.h> +#include <linux/uio.h> #include <linux/dax.h> #include <asm/extmem.h> #include <asm/io.h> @@ -43,8 +44,15 @@ static const struct block_device_operations dcssblk_devops = { .release = dcssblk_release, }; +static size_t dcssblk_dax_copy_from_iter(struct dax_device *dax_dev, + pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) +{ + return copy_from_iter(addr, bytes, i); +} + static const struct dax_operations dcssblk_dax_ops = { .direct_access = dcssblk_dax_direct_access, + .copy_from_iter = dcssblk_dax_copy_from_iter, }; struct dcssblk_dev_info { @@ -25,7 +25,6 @@ #include <linux/mm.h> #include <linux/mutex.h> #include <linux/pagevec.h> -#include <linux/pmem.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/uio.h> @@ -784,7 +783,7 @@ static int dax_writeback_one(struct block_device *bdev, } dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn)); - wb_cache_pmem(kaddr, size); + dax_flush(dax_dev, pgoff, kaddr, size); /* * After we
have flushed the cache, we can clear the dirty tag. There * cannot be new dirty data in the pfn after the flush has completed as @@ -976,7 +975,8 @@ int __dax_zero_page_range(struct block_device *bdev, dax_read_unlock(id); return rc; } - clear_pmem(kaddr + offset, size); + memset(kaddr + offset, 0, size); + dax_flush(dax_dev, pgoff, kaddr + offset, size); dax_read_unlock(id); } return 0; @@ -1055,7 +1055,8 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, map_len = end - pos; if (iov_iter_rw(iter) == WRITE) - map_len = copy_from_iter_pmem(kaddr, map_len, iter); + map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr, + map_len, iter); else map_len = copy_to_iter(kaddr, map_len, iter); if (map_len <= 0) { diff --git a/include/linux/dax.h b/include/linux/dax.h index 5ec1f6c..8f39db7 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -16,8 +16,15 @@ struct dax_operations { */ long (*direct_access)(struct dax_device *, pgoff_t, long, void **, pfn_t *); + /* copy_from_iter: required operation for fs-dax direct-i/o */ + size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t, + struct iov_iter *); + /* flush: optional driver-specific cache management after writes */ + void (*flush)(struct dax_device *, pgoff_t, void *, size_t); }; +extern struct attribute_group dax_attribute_group; + #if IS_ENABLED(CONFIG_DAX) struct dax_device *dax_get_by_host(const char *host); void put_dax(struct dax_device *dax_dev); @@ -75,6 +82,11 @@ void kill_dax(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn); +size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, + size_t bytes, struct iov_iter *i); +void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, + size_t size); +void dax_write_cache(struct dax_device *dax_dev, bool wc); /* * We use lowest available bit in 
exceptional entry for locking, one bit for diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 0c1b50ad..1473455 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -132,6 +132,10 @@ typedef int (*dm_busy_fn) (struct dm_target *ti); */ typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn); +typedef size_t (*dm_dax_copy_from_iter_fn)(struct dm_target *ti, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i); +typedef void (*dm_dax_flush_fn)(struct dm_target *ti, pgoff_t pgoff, void *addr, + size_t size); #define PAGE_SECTORS (PAGE_SIZE / 512) void dm_error(const char *message); @@ -181,6 +185,8 @@ struct target_type { dm_iterate_devices_fn iterate_devices; dm_io_hints_fn io_hints; dm_dax_direct_access_fn direct_access; + dm_dax_copy_from_iter_fn dax_copy_from_iter; + dm_dax_flush_fn dax_flush; /* For internal device-mapper use. */ struct list_head list; diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 6c80701..f3d3e6a 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -17,6 +17,7 @@ #include <linux/kernel.h> #include <linux/sizes.h> #include <linux/types.h> +#include <linux/uuid.h> enum { /* when a dimm supports both PMEM and BLK access a label is required */ @@ -54,6 +55,7 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm_bus_descriptor { const struct attribute_group **attr_groups; + unsigned long bus_dsm_mask; unsigned long cmd_mask; struct module *module; char *provider_name; @@ -71,9 +73,14 @@ struct nd_cmd_desc { }; struct nd_interleave_set { - u64 cookie; + /* v1.1 definition of the interleave-set-cookie algorithm */ + u64 cookie1; + /* v1.2 definition of the interleave-set-cookie algorithm */ + u64 cookie2; /* compatibility with initial buggy Linux implementation */ u64 altcookie; + + guid_t type_guid; }; struct nd_mapping_desc { @@ -159,9 +166,11 @@ 
void *nd_region_provider_data(struct nd_region *nd_region); void *nd_blk_region_provider_data(struct nd_blk_region *ndbr); void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data); struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr); +unsigned long nd_blk_memremap_flags(struct nd_blk_region *ndbr); unsigned int nd_region_acquire_lane(struct nd_region *nd_region); void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane); u64 nd_fletcher64(void *addr, size_t len, bool le); void nvdimm_flush(struct nd_region *nd_region); int nvdimm_has_flush(struct nd_region *nd_region); +int nvdimm_has_cache(struct nd_region *nd_region); #endif /* __LIBNVDIMM_H__ */ diff --git a/include/linux/nd.h b/include/linux/nd.h index 194b8e0..5dc6b69 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -21,6 +21,15 @@ enum nvdimm_event { NVDIMM_REVALIDATE_POISON, }; +enum nvdimm_claim_class { + NVDIMM_CCLASS_NONE, + NVDIMM_CCLASS_BTT, + NVDIMM_CCLASS_BTT2, + NVDIMM_CCLASS_PFN, + NVDIMM_CCLASS_DAX, + NVDIMM_CCLASS_UNKNOWN, +}; + struct nd_device_driver { struct device_driver drv; unsigned long type; @@ -41,12 +50,14 @@ static inline struct nd_device_driver *to_nd_device_driver( * @force_raw: ignore other personalities for the namespace (e.g. 
btt) * @dev: device model node * @claim: when set a another personality has taken ownership of the namespace + * @claim_class: restrict claim type to a given class * @rw_bytes: access the raw namespace capacity with byte-aligned transfers */ struct nd_namespace_common { int force_raw; struct device dev; struct device *claim; + enum nvdimm_claim_class claim_class; int (*rw_bytes)(struct nd_namespace_common *, resource_size_t offset, void *buf, size_t size, int rw, unsigned long flags); }; @@ -75,12 +86,14 @@ struct nd_namespace_io { /** * struct nd_namespace_pmem - namespace device for dimm-backed interleaved memory * @nsio: device and system physical address range to drive + * @lbasize: logical sector size for the namespace in block-device-mode * @alt_name: namespace name supplied in the dimm label * @uuid: namespace name supplied in the dimm label * @id: ida allocated id */ struct nd_namespace_pmem { struct nd_namespace_io nsio; + unsigned long lbasize; char *alt_name; u8 *uuid; int id; diff --git a/include/linux/pmem.h b/include/linux/pmem.h deleted file mode 100644 index 71ecf3d..0000000 --- a/include/linux/pmem.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright(c) 2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- */ -#ifndef __PMEM_H__ -#define __PMEM_H__ - -#include <linux/io.h> -#include <linux/uio.h> - -#ifdef CONFIG_ARCH_HAS_PMEM_API -#define ARCH_MEMREMAP_PMEM MEMREMAP_WB -#include <asm/pmem.h> -#else -#define ARCH_MEMREMAP_PMEM MEMREMAP_WT -/* - * These are simply here to enable compilation, all call sites gate - * calling these symbols with arch_has_pmem_api() and redirect to the - * implementation in asm/pmem.h. - */ -static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n) -{ - BUG(); -} - -static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes, - struct iov_iter *i) -{ - BUG(); - return 0; -} - -static inline void arch_clear_pmem(void *addr, size_t size) -{ - BUG(); -} - -static inline void arch_wb_cache_pmem(void *addr, size_t size) -{ - BUG(); -} - -static inline void arch_invalidate_pmem(void *addr, size_t size) -{ - BUG(); -} -#endif - -static inline bool arch_has_pmem_api(void) -{ - return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API); -} - -/** - * memcpy_to_pmem - copy data to persistent memory - * @dst: destination buffer for the copy - * @src: source buffer for the copy - * @n: length of the copy in bytes - * - * Perform a memory copy that results in the destination of the copy - * being effectively evicted from, or never written to, the processor - * cache hierarchy after the copy completes. After memcpy_to_pmem() - * data may still reside in cpu or platform buffers, so this operation - * must be followed by a blkdev_issue_flush() on the pmem block device. - */ -static inline void memcpy_to_pmem(void *dst, const void *src, size_t n) -{ - if (arch_has_pmem_api()) - arch_memcpy_to_pmem(dst, src, n); - else - memcpy(dst, src, n); -} - -/** - * copy_from_iter_pmem - copy data from an iterator to PMEM - * @addr: PMEM destination address - * @bytes: number of bytes to copy - * @i: iterator with source data - * - * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'. 
- * See blkdev_issue_flush() note for memcpy_to_pmem(). - */ -static inline size_t copy_from_iter_pmem(void *addr, size_t bytes, - struct iov_iter *i) -{ - if (arch_has_pmem_api()) - return arch_copy_from_iter_pmem(addr, bytes, i); - return copy_from_iter_nocache(addr, bytes, i); -} - -/** - * clear_pmem - zero a PMEM memory range - * @addr: virtual start address - * @size: number of bytes to zero - * - * Write zeros into the memory range starting at 'addr' for 'size' bytes. - * See blkdev_issue_flush() note for memcpy_to_pmem(). - */ -static inline void clear_pmem(void *addr, size_t size) -{ - if (arch_has_pmem_api()) - arch_clear_pmem(addr, size); - else - memset(addr, 0, size); -} - -/** - * invalidate_pmem - flush a pmem range from the cache hierarchy - * @addr: virtual start address - * @size: bytes to invalidate (internally aligned to cache line size) - * - * For platforms that support clearing poison this flushes any poisoned - * ranges out of the cache - */ -static inline void invalidate_pmem(void *addr, size_t size) -{ - if (arch_has_pmem_api()) - arch_invalidate_pmem(addr, size); -} - -/** - * wb_cache_pmem - write back processor cache for PMEM memory range - * @addr: virtual start address - * @size: number of bytes to write back - * - * Write back the processor cache range starting at 'addr' for 'size' bytes. - * See blkdev_issue_flush() note for memcpy_to_pmem(). 
- */ -static inline void wb_cache_pmem(void *addr, size_t size) -{ - if (arch_has_pmem_api()) - arch_wb_cache_pmem(addr, size); -} -#endif /* __PMEM_H__ */ diff --git a/include/linux/string.h b/include/linux/string.h index 537918f..7439d83 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -122,6 +122,12 @@ static inline __must_check int memcpy_mcsafe(void *dst, const void *src, return 0; } #endif +#ifndef __HAVE_ARCH_MEMCPY_FLUSHCACHE +static inline void memcpy_flushcache(void *dst, const void *src, size_t cnt) +{ + memcpy(dst, src, cnt); +} +#endif void *memchr_inv(const void *s, int c, size_t n); char *strreplace(char *s, char old, char new); diff --git a/include/linux/uio.h b/include/linux/uio.h index f2d36a3..55cd54a 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -95,6 +95,21 @@ size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i); size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i); bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i); size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i); +#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE +/* + * Note, users like pmem that depend on the stricter semantics of + * copy_from_iter_flushcache() than copy_from_iter_nocache() must check for + * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the + * destination is flushed from the cache on return. 
+ */ +size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i); +#else +static inline size_t copy_from_iter_flushcache(void *addr, size_t bytes, + struct iov_iter *i) +{ + return copy_from_iter_nocache(addr, bytes, i); +} +#endif bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i); size_t iov_iter_zero(size_t bytes, struct iov_iter *); unsigned long iov_iter_alignment(const struct iov_iter *i); diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index 7ad3863..6d3c542 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -105,7 +105,8 @@ struct nd_cmd_ars_cap { __u32 status; __u32 max_ars_out; __u32 clear_err_unit; - __u32 reserved; + __u16 flags; + __u16 reserved; } __packed; struct nd_cmd_ars_start { @@ -144,6 +145,43 @@ struct nd_cmd_clear_error { __u64 cleared; } __packed; +struct nd_cmd_trans_spa { + __u64 spa; + __u32 status; + __u8 flags; + __u8 _reserved[3]; + __u64 trans_length; + __u32 num_nvdimms; + struct nd_nvdimm_device { + __u32 nfit_device_handle; + __u32 _reserved; + __u64 dpa; + } __packed devices[0]; + +} __packed; + +struct nd_cmd_ars_err_inj { + __u64 err_inj_spa_range_base; + __u64 err_inj_spa_range_length; + __u8 err_inj_options; + __u32 status; +} __packed; + +struct nd_cmd_ars_err_inj_clr { + __u64 err_inj_clr_spa_range_base; + __u64 err_inj_clr_spa_range_length; + __u32 status; +} __packed; + +struct nd_cmd_ars_err_inj_stat { + __u32 status; + __u32 inj_err_rec_count; + struct nd_error_stat_query_record { + __u64 err_inj_stat_spa_range_base; + __u64 err_inj_stat_spa_range_length; + } __packed record[0]; +} __packed; + enum { ND_CMD_IMPLEMENTED = 0, @@ -169,6 +207,7 @@ enum { enum { ND_ARS_VOLATILE = 1, ND_ARS_PERSISTENT = 2, + ND_ARS_RETURN_PREV_DATA = 1 << 1, ND_CONFIG_LOCKED = 1, }; @@ -179,6 +218,7 @@ static inline const char *nvdimm_bus_cmd_name(unsigned cmd) [ND_CMD_ARS_START] = "ars_start", [ND_CMD_ARS_STATUS] = "ars_status", [ND_CMD_CLEAR_ERROR] 
= "clear_error", + [ND_CMD_CALL] = "cmd_call", }; if (cmd < ARRAY_SIZE(names) && names[cmd]) diff --git a/lib/Kconfig b/lib/Kconfig index d2fd262..6762529 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -556,6 +556,9 @@ config ARCH_HAS_SG_CHAIN config ARCH_HAS_PMEM_API bool +config ARCH_HAS_UACCESS_FLUSHCACHE + bool + config ARCH_HAS_MMIO_FLUSH bool diff --git a/lib/iov_iter.c b/lib/iov_iter.c index f835964..c9a6906 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -615,6 +615,28 @@ size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) } EXPORT_SYMBOL(copy_from_iter_nocache); +#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE +size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) +{ + char *to = addr; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return 0; + } + iterate_and_advance(i, bytes, v, + __copy_from_user_flushcache((to += v.iov_len) - v.iov_len, + v.iov_base, v.iov_len), + memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), + memcpy_flushcache((to += v.iov_len) - v.iov_len, v.iov_base, + v.iov_len) + ) + + return bytes; +} +EXPORT_SYMBOL_GPL(copy_from_iter_flushcache); +#endif + bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index 28859da..4c2fa98 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -1943,7 +1943,7 @@ static __init int nfit_test_init(void) nfit_test->setup = nfit_test0_setup; break; case 1: - nfit_test->num_pm = 1; + nfit_test->num_pm = 2; nfit_test->dcr_idx = NUM_DCR; nfit_test->num_dcr = 2; nfit_test->alloc = nfit_test1_alloc; |