diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index e1d5ea6d5e40..71a7d07c28c9 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -886,6 +886,58 @@ static ssize_t revision_show(struct device *dev, } static DEVICE_ATTR_RO(revision); +static ssize_t hw_error_scrub_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); + struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); + + return sprintf(buf, "%d\n", acpi_desc->scrub_mode); +} + +/* + * The 'hw_error_scrub' attribute can have the following values written to it: + * '0': Switch to the default mode where an exception will only insert + * the address of the memory error into the poison and badblocks lists. + * '1': Enable a full scrub to happen if an exception for a memory error is + * received. + */ +static ssize_t hw_error_scrub_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t size) +{ + struct nvdimm_bus_descriptor *nd_desc; + ssize_t rc; + long val; + + rc = kstrtol(buf, 0, &val); + if (rc) + return rc; + + device_lock(dev); + nd_desc = dev_get_drvdata(dev); + if (nd_desc) { + struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); + + switch (val) { + case HW_ERROR_SCRUB_ON: + acpi_desc->scrub_mode = HW_ERROR_SCRUB_ON; + break; + case HW_ERROR_SCRUB_OFF: + acpi_desc->scrub_mode = HW_ERROR_SCRUB_OFF; + break; + default: + rc = -EINVAL; + break; + } + } + device_unlock(dev); + if (rc) + return rc; + return size; +} +static DEVICE_ATTR_RW(hw_error_scrub); + /* * This shows the number of full Address Range Scrubs that have been * completed since driver load time. Userspace can wait on this using @@ -958,6 +1010,7 @@ static umode_t nfit_visible(struct kobject *kobj, struct attribute *a, int n) static struct attribute *acpi_nfit_attributes[] = { &dev_attr_revision.attr, &dev_attr_scrub.attr, + &dev_attr_hw_error_scrub.attr, NULL, }; @@ -1256,6 +1309,44 @@ static struct nvdimm *acpi_nfit_dimm_by_handle(struct acpi_nfit_desc *acpi_desc, return NULL; } +void __acpi_nvdimm_notify(struct device *dev, u32 event) +{ + struct nfit_mem *nfit_mem; + struct acpi_nfit_desc *acpi_desc; + + dev_dbg(dev->parent, "%s: %s: event: %d\n", dev_name(dev), __func__, + event); + + if (event != NFIT_NOTIFY_DIMM_HEALTH) { + dev_dbg(dev->parent, "%s: unknown event: %d\n", dev_name(dev), + event); + return; + } + + acpi_desc = dev_get_drvdata(dev->parent); + if (!acpi_desc) + return; + + /* + * If we successfully retrieved acpi_desc, then we know nfit_mem data + * is still valid. + */ + nfit_mem = dev_get_drvdata(dev); + if (nfit_mem && nfit_mem->flags_attr) + sysfs_notify_dirent(nfit_mem->flags_attr); +} +EXPORT_SYMBOL_GPL(__acpi_nvdimm_notify); + +static void acpi_nvdimm_notify(acpi_handle handle, u32 event, void *data) +{ + struct acpi_device *adev = data; + struct device *dev = &adev->dev; + + device_lock(dev->parent); + __acpi_nvdimm_notify(dev, event); + device_unlock(dev->parent); +} + static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, struct nfit_mem *nfit_mem, u32 device_handle) { @@ -1280,6 +1371,13 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, return force_enable_dimms ? 0 : -ENODEV; } + if (ACPI_FAILURE(acpi_install_notify_handler(adev_dimm->handle, + ACPI_DEVICE_NOTIFY, acpi_nvdimm_notify, adev_dimm))) { + dev_err(dev, "%s: notification registration failed\n", + dev_name(&adev_dimm->dev)); + return -ENXIO; + } + /* * Until standardization materializes we need to consider 4 * different command sets. Note, that checking for function0 (bit0) @@ -1318,18 +1416,41 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, return 0; } +static void shutdown_dimm_notify(void *data) +{ + struct acpi_nfit_desc *acpi_desc = data; + struct nfit_mem *nfit_mem; + + mutex_lock(&acpi_desc->init_mutex); + /* + * Clear out the nfit_mem->flags_attr and shut down dimm event + * notifications. + */ + list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { + struct acpi_device *adev_dimm = nfit_mem->adev; + + if (nfit_mem->flags_attr) { + sysfs_put(nfit_mem->flags_attr); + nfit_mem->flags_attr = NULL; + } + if (adev_dimm) + acpi_remove_notify_handler(adev_dimm->handle, + ACPI_DEVICE_NOTIFY, acpi_nvdimm_notify); + } + mutex_unlock(&acpi_desc->init_mutex); +} + static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) { struct nfit_mem *nfit_mem; - int dimm_count = 0; + int dimm_count = 0, rc; + struct nvdimm *nvdimm; list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { struct acpi_nfit_flush_address *flush; unsigned long flags = 0, cmd_mask; - struct nvdimm *nvdimm; u32 device_handle; u16 mem_flags; - int rc; device_handle = __to_nfit_memdev(nfit_mem)->device_handle; nvdimm = acpi_nfit_dimm_by_handle(acpi_desc, device_handle); @@ -1382,7 +1503,30 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) } - return nvdimm_bus_check_dimm_count(acpi_desc->nvdimm_bus, dimm_count); + rc = nvdimm_bus_check_dimm_count(acpi_desc->nvdimm_bus, dimm_count); + if (rc) + return rc; + + /* + * Now that dimms are successfully registered, and async registration + * is flushed, attempt to enable event notification. + */ + list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { + struct kernfs_node *nfit_kernfs; + + nvdimm = nfit_mem->nvdimm; + nfit_kernfs = sysfs_get_dirent(nvdimm_kobj(nvdimm)->sd, "nfit"); + if (nfit_kernfs) + nfit_mem->flags_attr = sysfs_get_dirent(nfit_kernfs, + "flags"); + sysfs_put(nfit_kernfs); + if (!nfit_mem->flags_attr) + dev_warn(acpi_desc->dev, "%s: notifications disabled\n", + nvdimm_name(nvdimm)); + } + + return devm_add_action_or_reset(acpi_desc->dev, shutdown_dimm_notify, + acpi_desc); } static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) @@ -1491,9 +1635,9 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc, if (!info) return -ENOMEM; for (i = 0; i < nr; i++) { - struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i]; + struct nd_mapping_desc *mapping = &ndr_desc->mapping[i]; struct nfit_set_info_map *map = &info->mapping[i]; - struct nvdimm *nvdimm = nd_mapping->nvdimm; + struct nvdimm *nvdimm = mapping->nvdimm; struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); struct acpi_nfit_memory_map *memdev = memdev_from_spa(acpi_desc, spa->range_index, i); @@ -1917,7 +2061,7 @@ static int acpi_nfit_insert_resource(struct acpi_nfit_desc *acpi_desc, } static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, - struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc, + struct nd_mapping_desc *mapping, struct nd_region_desc *ndr_desc, struct acpi_nfit_memory_map *memdev, struct nfit_spa *nfit_spa) { @@ -1934,12 +2078,12 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, return -ENODEV; } - nd_mapping->nvdimm = nvdimm; + mapping->nvdimm = nvdimm; switch (nfit_spa_type(spa)) { case NFIT_SPA_PM: case NFIT_SPA_VOLATILE: - nd_mapping->start = memdev->address; - nd_mapping->size = memdev->region_size; + mapping->start = memdev->address; + mapping->size = memdev->region_size; break; case NFIT_SPA_DCR: nfit_mem = nvdimm_provider_data(nvdimm); @@ -1947,13 +2091,13 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, dev_dbg(acpi_desc->dev, "spa%d %s missing bdw\n", spa->range_index, nvdimm_name(nvdimm)); } else { - nd_mapping->size = nfit_mem->bdw->capacity; - nd_mapping->start = nfit_mem->bdw->start_address; + mapping->size = nfit_mem->bdw->capacity; + mapping->start = nfit_mem->bdw->start_address; ndr_desc->num_lanes = nfit_mem->bdw->windows; blk_valid = 1; } - ndr_desc->nd_mapping = nd_mapping; + ndr_desc->mapping = mapping; ndr_desc->num_mappings = blk_valid; ndbr_desc = to_blk_region_desc(ndr_desc); ndbr_desc->enable = acpi_nfit_blk_region_enable; @@ -1979,7 +2123,7 @@ static bool nfit_spa_is_virtual(struct acpi_nfit_system_address *spa) static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_spa) { - static struct nd_mapping nd_mappings[ND_MAX_MAPPINGS]; + static struct nd_mapping_desc mappings[ND_MAX_MAPPINGS]; struct acpi_nfit_system_address *spa = nfit_spa->spa; struct nd_blk_region_desc ndbr_desc; struct nd_region_desc *ndr_desc; @@ -1998,7 +2142,7 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, } memset(&res, 0, sizeof(res)); - memset(&nd_mappings, 0, sizeof(nd_mappings)); + memset(&mappings, 0, sizeof(mappings)); memset(&ndbr_desc, 0, sizeof(ndbr_desc)); res.start = spa->address; res.end = res.start + spa->length - 1; @@ -2014,7 +2158,7 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev; - struct nd_mapping *nd_mapping; + struct nd_mapping_desc *mapping; if (memdev->range_index != spa->range_index) continue; @@ -2023,14 +2167,14 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, spa->range_index, ND_MAX_MAPPINGS); return -ENXIO; } - nd_mapping = &nd_mappings[count++]; - rc = acpi_nfit_init_mapping(acpi_desc, nd_mapping, ndr_desc, + mapping = &mappings[count++]; + rc = acpi_nfit_init_mapping(acpi_desc, mapping, ndr_desc, memdev, nfit_spa); if (rc) goto out; } - ndr_desc->nd_mapping = nd_mappings; + ndr_desc->mapping = mappings; ndr_desc->num_mappings = count; rc = acpi_nfit_init_interleave_set(acpi_desc, ndr_desc, spa); if (rc) @@ -2678,29 +2822,30 @@ static int acpi_nfit_remove(struct acpi_device *adev) return 0; } -static void acpi_nfit_notify(struct acpi_device *adev, u32 event) +void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event) { - struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(&adev->dev); + struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(dev); struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; - struct device *dev = &adev->dev; union acpi_object *obj; acpi_status status; int ret; dev_dbg(dev, "%s: event: %d\n", __func__, event); - device_lock(dev); + if (event != NFIT_NOTIFY_UPDATE) + return; + if (!dev->driver) { /* dev->driver may be null if we're being removed */ dev_dbg(dev, "%s: no driver found for dev\n", __func__); - goto out_unlock; + return; } if (!acpi_desc) { acpi_desc = devm_kzalloc(dev, sizeof(*acpi_desc), GFP_KERNEL); if (!acpi_desc) - goto out_unlock; - acpi_nfit_desc_init(acpi_desc, &adev->dev); + return; + acpi_nfit_desc_init(acpi_desc, dev); } else { /* * Finish previous registration before considering new @@ -2710,10 +2855,10 @@ static void acpi_nfit_notify(struct acpi_device *adev, u32 event) } /* Evaluate _FIT */ - status = acpi_evaluate_object(adev->handle, "_FIT", NULL, &buf); + status = acpi_evaluate_object(handle, "_FIT", NULL, &buf); if (ACPI_FAILURE(status)) { dev_err(dev, "failed to evaluate _FIT\n"); - goto out_unlock; + return; } obj = buf.pointer; @@ -2725,9 +2870,14 @@ static void acpi_nfit_notify(struct acpi_device *adev, u32 event) } else dev_err(dev, "Invalid _FIT\n"); kfree(buf.pointer); +} +EXPORT_SYMBOL_GPL(__acpi_nfit_notify); - out_unlock: - device_unlock(dev); +static void acpi_nfit_notify(struct acpi_device *adev, u32 event) +{ + device_lock(&adev->dev); + __acpi_nfit_notify(&adev->dev, adev->handle, event); + device_unlock(&adev->dev); } static const struct acpi_device_id acpi_nfit_ids[] = { diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c index 161f91539ae6..e5ce81c38eed 100644 --- a/drivers/acpi/nfit/mce.c +++ b/drivers/acpi/nfit/mce.c @@ -14,6 +14,7 @@ */ #include #include +#include #include #include "nfit.h" @@ -62,12 +63,25 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val, } mutex_unlock(&acpi_desc->init_mutex); - /* - * We can ignore an -EBUSY here because if an ARS is already - * in progress, just let that be the last authoritative one - */ - if (found_match) + if (!found_match) + continue; + + /* If this fails due to an -ENOMEM, there is little we can do */ + nvdimm_bus_add_poison(acpi_desc->nvdimm_bus, + ALIGN(mce->addr, L1_CACHE_BYTES), + L1_CACHE_BYTES); + nvdimm_region_notify(nfit_spa->nd_region, + NVDIMM_REVALIDATE_POISON); + + if (acpi_desc->scrub_mode == HW_ERROR_SCRUB_ON) { + /* + * We can ignore an -EBUSY here because if an ARS is + * already in progress, just let that be the last + * authoritative one + */ acpi_nfit_ars_rescan(acpi_desc); + } + break; } mutex_unlock(&acpi_desc_lock); diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h index e894ded24d99..14296f5267c8 100644 --- a/drivers/acpi/nfit/nfit.h +++ b/drivers/acpi/nfit/nfit.h @@ -78,6 +78,14 @@ enum { NFIT_ARS_TIMEOUT = 90, }; +enum nfit_root_notifiers { + NFIT_NOTIFY_UPDATE = 0x80, +}; + +enum nfit_dimm_notifiers { + NFIT_NOTIFY_DIMM_HEALTH = 0x81, +}; + struct nfit_spa { struct list_head list; struct nd_region *nd_region; @@ -124,6 +132,7 @@ struct nfit_mem { struct acpi_nfit_system_address *spa_bdw; struct acpi_nfit_interleave *idt_dcr; struct acpi_nfit_interleave *idt_bdw; + struct kernfs_node *flags_attr; struct nfit_flush *nfit_flush; struct list_head list; struct acpi_device *adev; @@ -152,6 +161,7 @@ struct acpi_nfit_desc { struct list_head list; struct kernfs_node *scrub_count_state; unsigned int scrub_count; + unsigned int scrub_mode; unsigned int cancel:1; unsigned long dimm_cmd_force_en; unsigned long bus_cmd_force_en; @@ -159,6 +169,11 @@ struct acpi_nfit_desc { void *iobuf, u64 len, int rw); }; +enum scrub_mode { + HW_ERROR_SCRUB_OFF, + HW_ERROR_SCRUB_ON, +}; + enum nd_blk_mmio_selector { BDW, DCR, @@ -223,5 +238,7 @@ static inline struct acpi_nfit_desc *to_acpi_desc( const u8 *to_nfit_uuid(enum nfit_uuids id); int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *nfit, acpi_size sz); +void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event); +void __acpi_nvdimm_notify(struct device *dev, u32 event); void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev); #endif /* __NFIT_H__ */ diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index cedab7572de3..daadd20aa936 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -23,4 +23,9 @@ config DEV_DAX_PMEM Say Y if unsure +config NR_DEV_DAX + int "Maximum number of Device-DAX instances" + default 32768 + range 256 2147483647 + endif diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 29f600f2c447..0e499bfca41c 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -13,15 +13,25 @@ #include #include #include +#include #include +#include +#include #include #include #include #include +#include "dax.h" -static int dax_major; +static dev_t dax_devt; static struct class *dax_class; static DEFINE_IDA(dax_minor_ida); +static int nr_dax = CONFIG_NR_DEV_DAX; +module_param(nr_dax, int, S_IRUGO); +static struct vfsmount *dax_mnt; +static struct kmem_cache *dax_cache __read_mostly; +static struct super_block *dax_superblock __read_mostly; +MODULE_PARM_DESC(nr_dax, "max number of device-dax instances"); /** * struct dax_region - mapping infrastructure for dax devices @@ -48,7 +58,7 @@ struct dax_region { * struct dax_dev - subdivision of a dax region * @region - parent region * @dev - device backing the character device - * @kref - enable this data to be tracked in filp->private_data + * @cdev - core chardev data * @alive - !alive + rcu grace period == no new mappings can be established * @id - child id in the region * @num_resources - number of physical address extents in this device @@ -56,14 +66,126 @@ struct dax_region { */ struct dax_dev { struct dax_region *region; - struct device *dev; - struct kref kref; + struct inode *inode; + struct device dev; + struct cdev cdev; bool alive; int id; int num_resources; struct resource res[0]; }; +static struct inode *dax_alloc_inode(struct super_block *sb) +{ + return kmem_cache_alloc(dax_cache, GFP_KERNEL); +} + +static void dax_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + + kmem_cache_free(dax_cache, inode); +} + +static void dax_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, dax_i_callback); +} + +static const struct super_operations dax_sops = { + .statfs = simple_statfs, + .alloc_inode = dax_alloc_inode, + .destroy_inode = dax_destroy_inode, + .drop_inode = generic_delete_inode, +}; + +static struct dentry *dax_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC); +} + +static struct file_system_type dax_type = { + .name = "dax", + .mount = dax_mount, + .kill_sb = kill_anon_super, +}; + +static int dax_test(struct inode *inode, void *data) +{ + return inode->i_cdev == data; +} + +static int dax_set(struct inode *inode, void *data) +{ + inode->i_cdev = data; + return 0; +} + +static struct inode *dax_inode_get(struct cdev *cdev, dev_t devt) +{ + struct inode *inode; + + inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), + dax_test, dax_set, cdev); + + if (!inode) + return NULL; + + if (inode->i_state & I_NEW) { + inode->i_mode = S_IFCHR; + inode->i_flags = S_DAX; + inode->i_rdev = devt; + mapping_set_gfp_mask(&inode->i_data, GFP_USER); + unlock_new_inode(inode); + } + return inode; +} + +static void init_once(void *inode) +{ + inode_init_once(inode); +} + +static int dax_inode_init(void) +{ + int rc; + + dax_cache = kmem_cache_create("dax_cache", sizeof(struct inode), 0, + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_ACCOUNT), + init_once); + if (!dax_cache) + return -ENOMEM; + + rc = register_filesystem(&dax_type); + if (rc) + goto err_register_fs; + + dax_mnt = kern_mount(&dax_type); + if (IS_ERR(dax_mnt)) { + rc = PTR_ERR(dax_mnt); + goto err_mount; + } + dax_superblock = dax_mnt->mnt_sb; + + return 0; + + err_mount: + unregister_filesystem(&dax_type); + err_register_fs: + kmem_cache_destroy(dax_cache); + + return rc; +} + +static void dax_inode_exit(void) +{ + kern_unmount(dax_mnt); + unregister_filesystem(&dax_type); + kmem_cache_destroy(dax_cache); +} + static void dax_region_free(struct kref *kref) { struct dax_region *dax_region; @@ -78,28 +200,17 @@ void dax_region_put(struct dax_region *dax_region) } EXPORT_SYMBOL_GPL(dax_region_put); -static void dax_dev_free(struct kref *kref) -{ - struct dax_dev *dax_dev; - - dax_dev = container_of(kref, struct dax_dev, kref); - dax_region_put(dax_dev->region); - kfree(dax_dev); -} - -static void dax_dev_put(struct dax_dev *dax_dev) -{ - kref_put(&dax_dev->kref, dax_dev_free); -} - struct dax_region *alloc_dax_region(struct device *parent, int region_id, struct resource *res, unsigned int align, void *addr, unsigned long pfn_flags) { struct dax_region *dax_region; - dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); + if (!IS_ALIGNED(res->start, align) + || !IS_ALIGNED(resource_size(res), align)) + return NULL; + dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); if (!dax_region) return NULL; @@ -116,10 +227,15 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, } EXPORT_SYMBOL_GPL(alloc_dax_region); +static struct dax_dev *to_dax_dev(struct device *dev) +{ + return container_of(dev, struct dax_dev, dev); +} + static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct dax_dev *dax_dev = dev_get_drvdata(dev); + struct dax_dev *dax_dev = to_dax_dev(dev); unsigned long long size = 0; int i; @@ -144,180 +260,11 @@ static const struct attribute_group *dax_attribute_groups[] = { NULL, }; -static void unregister_dax_dev(void *_dev) -{ - struct device *dev = _dev; - struct dax_dev *dax_dev = dev_get_drvdata(dev); - struct dax_region *dax_region = dax_dev->region; - - dev_dbg(dev, "%s\n", __func__); - - /* - * Note, rcu is not protecting the liveness of dax_dev, rcu is - * ensuring that any fault handlers that might have seen - * dax_dev->alive == true, have completed. Any fault handlers - * that start after synchronize_rcu() has started will abort - * upon seeing dax_dev->alive == false. - */ - dax_dev->alive = false; - synchronize_rcu(); - - get_device(dev); - device_unregister(dev); - ida_simple_remove(&dax_region->ida, dax_dev->id); - ida_simple_remove(&dax_minor_ida, MINOR(dev->devt)); - put_device(dev); - dax_dev_put(dax_dev); -} - -int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, - int count) -{ - struct device *parent = dax_region->dev; - struct dax_dev *dax_dev; - struct device *dev; - int rc, minor; - dev_t dev_t; - - dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL); - if (!dax_dev) - return -ENOMEM; - memcpy(dax_dev->res, res, sizeof(*res) * count); - dax_dev->num_resources = count; - kref_init(&dax_dev->kref); - dax_dev->alive = true; - dax_dev->region = dax_region; - kref_get(&dax_region->kref); - - dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); - if (dax_dev->id < 0) { - rc = dax_dev->id; - goto err_id; - } - - minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL); - if (minor < 0) { - rc = minor; - goto err_minor; - } - - dev_t = MKDEV(dax_major, minor); - dev = device_create_with_groups(dax_class, parent, dev_t, dax_dev, - dax_attribute_groups, "dax%d.%d", dax_region->id, - dax_dev->id); - if (IS_ERR(dev)) { - rc = PTR_ERR(dev); - goto err_create; - } - dax_dev->dev = dev; - - rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); - if (rc) - return rc; - - return 0; - - err_create: - ida_simple_remove(&dax_minor_ida, minor); - err_minor: - ida_simple_remove(&dax_region->ida, dax_dev->id); - err_id: - dax_dev_put(dax_dev); - - return rc; -} -EXPORT_SYMBOL_GPL(devm_create_dax_dev); - -/* return an unmapped area aligned to the dax region specified alignment */ -static unsigned long dax_dev_get_unmapped_area(struct file *filp, - unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - unsigned long off, off_end, off_align, len_align, addr_align, align; - struct dax_dev *dax_dev = filp ? filp->private_data : NULL; - struct dax_region *dax_region; - - if (!dax_dev || addr) - goto out; - - dax_region = dax_dev->region; - align = dax_region->align; - off = pgoff << PAGE_SHIFT; - off_end = off + len; - off_align = round_up(off, align); - - if ((off_end <= off_align) || ((off_end - off_align) < align)) - goto out; - - len_align = len + align; - if ((off + len_align) < off) - goto out; - - addr_align = current->mm->get_unmapped_area(filp, addr, len_align, - pgoff, flags); - if (!IS_ERR_VALUE(addr_align)) { - addr_align += (off - addr_align) & (align - 1); - return addr_align; - } - out: - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); -} - -static int __match_devt(struct device *dev, const void *data) -{ - const dev_t *devt = data; - - return dev->devt == *devt; -} - -static struct device *dax_dev_find(dev_t dev_t) -{ - return class_find_device(dax_class, NULL, &dev_t, __match_devt); -} - -static int dax_dev_open(struct inode *inode, struct file *filp) -{ - struct dax_dev *dax_dev = NULL; - struct device *dev; - - dev = dax_dev_find(inode->i_rdev); - if (!dev) - return -ENXIO; - - device_lock(dev); - dax_dev = dev_get_drvdata(dev); - if (dax_dev) { - dev_dbg(dev, "%s\n", __func__); - filp->private_data = dax_dev; - kref_get(&dax_dev->kref); - inode->i_flags = S_DAX; - } - device_unlock(dev); - - if (!dax_dev) { - put_device(dev); - return -ENXIO; - } - return 0; -} - -static int dax_dev_release(struct inode *inode, struct file *filp) -{ - struct dax_dev *dax_dev = filp->private_data; - struct device *dev = dax_dev->dev; - - dev_dbg(dax_dev->dev, "%s\n", __func__); - dax_dev_put(dax_dev); - put_device(dev); - - return 0; -} - static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma, const char *func) { struct dax_region *dax_region = dax_dev->region; - struct device *dev = dax_dev->dev; + struct device *dev = &dax_dev->dev; unsigned long mask; if (!dax_dev->alive) @@ -382,7 +329,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long) vmf->virtual_address; - struct device *dev = dax_dev->dev; + struct device *dev = &dax_dev->dev; struct dax_region *dax_region; int rc = VM_FAULT_SIGBUS; phys_addr_t phys; @@ -422,7 +369,7 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct file *filp = vma->vm_file; struct dax_dev *dax_dev = filp->private_data; - dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, + dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, current->comm, (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read", vma->vm_start, vma->vm_end); rcu_read_lock(); @@ -437,7 +384,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, unsigned int flags) { unsigned long pmd_addr = addr & PMD_MASK; - struct device *dev = dax_dev->dev; + struct device *dev = &dax_dev->dev; struct dax_region *dax_region; phys_addr_t phys; pgoff_t pgoff; @@ -479,7 +426,7 @@ static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr, struct file *filp = vma->vm_file; struct dax_dev *dax_dev = filp->private_data; - dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, + dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, current->comm, (flags & FAULT_FLAG_WRITE) ? "write" : "read", vma->vm_start, vma->vm_end); @@ -490,81 +437,257 @@ static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr, return rc; } -static void dax_dev_vm_open(struct vm_area_struct *vma) -{ - struct file *filp = vma->vm_file; - struct dax_dev *dax_dev = filp->private_data; - - dev_dbg(dax_dev->dev, "%s\n", __func__); - kref_get(&dax_dev->kref); -} - -static void dax_dev_vm_close(struct vm_area_struct *vma) -{ - struct file *filp = vma->vm_file; - struct dax_dev *dax_dev = filp->private_data; - - dev_dbg(dax_dev->dev, "%s\n", __func__); - dax_dev_put(dax_dev); -} - static const struct vm_operations_struct dax_dev_vm_ops = { .fault = dax_dev_fault, .pmd_fault = dax_dev_pmd_fault, - .open = dax_dev_vm_open, - .close = dax_dev_vm_close, }; -static int dax_dev_mmap(struct file *filp, struct vm_area_struct *vma) +static int dax_mmap(struct file *filp, struct vm_area_struct *vma) { struct dax_dev *dax_dev = filp->private_data; int rc; - dev_dbg(dax_dev->dev, "%s\n", __func__); + dev_dbg(&dax_dev->dev, "%s\n", __func__); rc = check_vma(dax_dev, vma, __func__); if (rc) return rc; - kref_get(&dax_dev->kref); vma->vm_ops = &dax_dev_vm_ops; vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; return 0; +} +/* return an unmapped area aligned to the dax region specified alignment */ +static unsigned long dax_get_unmapped_area(struct file *filp, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + unsigned long off, off_end, off_align, len_align, addr_align, align; + struct dax_dev *dax_dev = filp ? filp->private_data : NULL; + struct dax_region *dax_region; + + if (!dax_dev || addr) + goto out; + + dax_region = dax_dev->region; + align = dax_region->align; + off = pgoff << PAGE_SHIFT; + off_end = off + len; + off_align = round_up(off, align); + + if ((off_end <= off_align) || ((off_end - off_align) < align)) + goto out; + + len_align = len + align; + if ((off + len_align) < off) + goto out; + + addr_align = current->mm->get_unmapped_area(filp, addr, len_align, + pgoff, flags); + if (!IS_ERR_VALUE(addr_align)) { + addr_align += (off - addr_align) & (align - 1); + return addr_align; + } + out: + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); +} + +static int dax_open(struct inode *inode, struct file *filp) +{ + struct dax_dev *dax_dev; + + dax_dev = container_of(inode->i_cdev, struct dax_dev, cdev); + dev_dbg(&dax_dev->dev, "%s\n", __func__); + inode->i_mapping = dax_dev->inode->i_mapping; + inode->i_mapping->host = dax_dev->inode; + filp->f_mapping = inode->i_mapping; + filp->private_data = dax_dev; + inode->i_flags = S_DAX; + + return 0; +} + +static int dax_release(struct inode *inode, struct file *filp) +{ + struct dax_dev *dax_dev = filp->private_data; + + dev_dbg(&dax_dev->dev, "%s\n", __func__); + return 0; } static const struct file_operations dax_fops = { .llseek = noop_llseek, .owner = THIS_MODULE, - .open = dax_dev_open, - .release = dax_dev_release, - .get_unmapped_area = dax_dev_get_unmapped_area, - .mmap = dax_dev_mmap, + .open = dax_open, + .release = dax_release, + .get_unmapped_area = dax_get_unmapped_area, + .mmap = dax_mmap, }; +static void dax_dev_release(struct device *dev) +{ + struct dax_dev *dax_dev = to_dax_dev(dev); + struct dax_region *dax_region = dax_dev->region; + + ida_simple_remove(&dax_region->ida, dax_dev->id); + ida_simple_remove(&dax_minor_ida, MINOR(dev->devt)); + dax_region_put(dax_region); + iput(dax_dev->inode); + kfree(dax_dev); +} + +static void unregister_dax_dev(void *dev) +{ + struct dax_dev *dax_dev = to_dax_dev(dev); + struct cdev *cdev = &dax_dev->cdev; + + dev_dbg(dev, "%s\n", __func__); + + /* + * Note, rcu is not protecting the liveness of dax_dev, rcu is + * ensuring that any fault handlers that might have seen + * dax_dev->alive == true, have completed. Any fault handlers + * that start after synchronize_rcu() has started will abort + * upon seeing dax_dev->alive == false. + */ + dax_dev->alive = false; + synchronize_rcu(); + unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1); + cdev_del(cdev); + device_unregister(dev); +} + +struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, + struct resource *res, int count) +{ + struct device *parent = dax_region->dev; + struct dax_dev *dax_dev; + int rc = 0, minor, i; + struct device *dev; + struct cdev *cdev; + dev_t dev_t; + + dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL); + if (!dax_dev) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + if (!IS_ALIGNED(res[i].start, dax_region->align) + || !IS_ALIGNED(resource_size(&res[i]), + dax_region->align)) { + rc = -EINVAL; + break; + } + dax_dev->res[i].start = res[i].start; + dax_dev->res[i].end = res[i].end; + } + + if (i < count) + goto err_id; + + dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); + if (dax_dev->id < 0) { + rc = dax_dev->id; + goto err_id; + } + + minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL); + if (minor < 0) { + rc = minor; + goto err_minor; + } + + dev_t = MKDEV(MAJOR(dax_devt), minor); + dev = &dax_dev->dev; + dax_dev->inode = dax_inode_get(&dax_dev->cdev, dev_t); + if (!dax_dev->inode) { + rc = -ENOMEM; + goto err_inode; + } + + /* device_initialize() so cdev can reference kobj parent */ + device_initialize(dev); + + cdev = &dax_dev->cdev; + cdev_init(cdev, &dax_fops); + cdev->owner = parent->driver->owner; + cdev->kobj.parent = &dev->kobj; + rc = cdev_add(&dax_dev->cdev, dev_t, 1); + if (rc) + goto err_cdev; + + /* from here on we're committed to teardown via dax_dev_release() */ + dax_dev->num_resources = count; + dax_dev->alive = true; + dax_dev->region = dax_region; + kref_get(&dax_region->kref); + + dev->devt = dev_t; + dev->class = dax_class; + dev->parent = parent; + dev->groups = dax_attribute_groups; + dev->release = dax_dev_release; + dev_set_name(dev, "dax%d.%d", dax_region->id, dax_dev->id); + rc = device_add(dev); + if (rc) { + put_device(dev); + return ERR_PTR(rc); + } + + rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); + if (rc) + return ERR_PTR(rc); + + return dax_dev; + + err_cdev: + iput(dax_dev->inode); + err_inode: + ida_simple_remove(&dax_minor_ida, minor); + err_minor: + ida_simple_remove(&dax_region->ida, dax_dev->id); + err_id: + kfree(dax_dev); + + return ERR_PTR(rc); +} +EXPORT_SYMBOL_GPL(devm_create_dax_dev); + static int __init dax_init(void) { int rc; - rc = register_chrdev(0, "dax", &dax_fops); - if (rc < 0) + rc = dax_inode_init(); + if (rc) return rc; - dax_major = rc; + + nr_dax = max(nr_dax, 256); + rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax"); + if (rc) + goto err_chrdev; dax_class = class_create(THIS_MODULE, "dax"); if (IS_ERR(dax_class)) { - unregister_chrdev(dax_major, "dax"); - return PTR_ERR(dax_class); + rc = PTR_ERR(dax_class); + goto err_class; } return 0; + + err_class: + unregister_chrdev_region(dax_devt, nr_dax); + err_chrdev: + dax_inode_exit(); + return rc; } static void __exit dax_exit(void) { class_destroy(dax_class); - unregister_chrdev(dax_major, "dax"); + unregister_chrdev_region(dax_devt, nr_dax); ida_destroy(&dax_minor_ida); + dax_inode_exit(); } MODULE_AUTHOR("Intel Corporation"); diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h index d8b8f1f25054..ddd829ab58c0 100644 --- a/drivers/dax/dax.h +++ b/drivers/dax/dax.h @@ -13,12 +13,13 @@ #ifndef __DAX_H__ #define __DAX_H__ struct device; +struct dax_dev; struct resource; struct dax_region; void dax_region_put(struct dax_region *dax_region); struct dax_region *alloc_dax_region(struct device *parent, int region_id, struct resource *res, unsigned int align, void *addr, unsigned long flags); -int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, - int count); +struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, + struct resource *res, int count); #endif /* __DAX_H__ */ diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c index 1f01e98c83c7..9630d8837ba9 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem.c @@ -24,7 +24,7 @@ struct dax_pmem { struct completion cmp; }; -struct dax_pmem *to_dax_pmem(struct percpu_ref *ref) +static struct dax_pmem *to_dax_pmem(struct percpu_ref *ref) { return container_of(ref, struct dax_pmem, ref); } @@ -61,6 +61,7 @@ static int dax_pmem_probe(struct device *dev) int rc; void *addr; struct resource res; + struct dax_dev *dax_dev; struct nd_pfn_sb *pfn_sb; struct dax_pmem *dax_pmem; struct nd_region *nd_region; @@ -126,12 +127,12 @@ static int dax_pmem_probe(struct device *dev) return -ENOMEM; /* TODO: support for subdividing a dax region... */ - rc = devm_create_dax_dev(dax_region, &res, 1); + dax_dev = devm_create_dax_dev(dax_region, &res, 1); /* child dax_dev instances now own the lifetime of the dax_region */ dax_region_put(dax_region); - return rc; + return PTR_ERR_OR_ZERO(dax_dev); } static struct nd_device_driver dax_pmem_driver = { diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index 124c2432ac9c..8b2b740d6679 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig @@ -89,7 +89,7 @@ config NVDIMM_PFN Select Y if unsure config NVDIMM_DAX - bool "NVDIMM DAX: Raw access to persistent memory" + tristate "NVDIMM DAX: Raw access to persistent memory" default LIBNVDIMM depends on NVDIMM_PFN help diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 935866fe5ec2..a8b6949a8778 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -217,6 +217,8 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, return rc; if (cmd_rc < 0) return cmd_rc; + + nvdimm_clear_from_poison_list(nvdimm_bus, phys, len); return clear_err.cleared; } EXPORT_SYMBOL_GPL(nvdimm_clear_poison); diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 4d7bbd2df5c0..7ceba08774b6 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -547,11 +547,12 @@ void nvdimm_badblocks_populate(struct nd_region *nd_region, } EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate); -static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) +static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length, + gfp_t flags) { struct nd_poison *pl; - pl = kzalloc(sizeof(*pl), GFP_KERNEL); + pl = kzalloc(sizeof(*pl), flags); if (!pl) return -ENOMEM; @@ -567,7 +568,7 @@ static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) struct nd_poison *pl; if (list_empty(&nvdimm_bus->poison_list)) - return add_poison(nvdimm_bus, addr, length); + return add_poison(nvdimm_bus, addr, length, GFP_KERNEL); /* * There is a chance this is a duplicate, check for those first. @@ -587,7 +588,7 @@ static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) * as any overlapping ranges will get resolved when the list is consumed * and converted to badblocks */ - return add_poison(nvdimm_bus, addr, length); + return add_poison(nvdimm_bus, addr, length, GFP_KERNEL); } int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) @@ -602,6 +603,70 @@ int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) } EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison); +void nvdimm_clear_from_poison_list(struct nvdimm_bus *nvdimm_bus, + phys_addr_t start, unsigned int len) +{ + struct list_head *poison_list = &nvdimm_bus->poison_list; + u64 clr_end = start + len - 1; + struct nd_poison *pl, *next; + + nvdimm_bus_lock(&nvdimm_bus->dev); + WARN_ON_ONCE(list_empty(poison_list)); + + /* + * [start, clr_end] is the poison interval being cleared. + * [pl->start, pl_end] is the poison_list entry we're comparing + * the above interval against. The poison list entry may need + * to be modified (update either start or length), deleted, or + * split into two based on the overlap characteristics + */ + + list_for_each_entry_safe(pl, next, poison_list, list) { + u64 pl_end = pl->start + pl->length - 1; + + /* Skip intervals with no intersection */ + if (pl_end < start) + continue; + if (pl->start > clr_end) + continue; + /* Delete completely overlapped poison entries */ + if ((pl->start >= start) && (pl_end <= clr_end)) { + list_del(&pl->list); + kfree(pl); + continue; + } + /* Adjust start point of partially cleared entries */ + if ((start <= pl->start) && (clr_end > pl->start)) { + pl->length -= clr_end - pl->start + 1; + pl->start = clr_end + 1; + continue; + } + /* Adjust pl->length for partial clearing at the tail end */ + if ((pl->start < start) && (pl_end <= clr_end)) { + /* pl->start remains the same */ + pl->length = start - pl->start; + continue; + } + /* + * If clearing in the middle of an entry, we split it into + * two by modifying the current entry to represent one half of + * the split, and adding a new entry for the second half. + */ + if ((pl->start < start) && (pl_end > clr_end)) { + u64 new_start = clr_end + 1; + u64 new_len = pl_end - new_start + 1; + + /* Add new entry covering the right half */ + add_poison(nvdimm_bus, new_start, new_len, GFP_NOIO); + /* Adjust this entry to cover the left half */ + pl->length = start - pl->start; + continue; + } + } + nvdimm_bus_unlock(&nvdimm_bus->dev); +} +EXPORT_SYMBOL_GPL(nvdimm_clear_from_poison_list); + #ifdef CONFIG_BLK_DEV_INTEGRITY int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) { diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c index 71d12bb67339..619834e144d1 100644 --- a/drivers/nvdimm/dimm.c +++ b/drivers/nvdimm/dimm.c @@ -26,6 +26,14 @@ static int nvdimm_probe(struct device *dev) struct nvdimm_drvdata *ndd; int rc; + rc = nvdimm_check_config_data(dev); + if (rc) { + /* not required for non-aliased nvdimm, ex. NVDIMM-N */ + if (rc == -ENOTTY) + rc = 0; + return rc; + } + ndd = kzalloc(sizeof(*ndd), GFP_KERNEL); if (!ndd) return -ENOMEM; @@ -72,6 +80,9 @@ static int nvdimm_remove(struct device *dev) { struct nvdimm_drvdata *ndd = dev_get_drvdata(dev); + if (!ndd) + return 0; + nvdimm_bus_lock(dev); dev_set_drvdata(dev, NULL); nvdimm_bus_unlock(dev); diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index d9bba5edd8dc..d614493ad5ac 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -28,28 +28,30 @@ static DEFINE_IDA(dimm_ida); * Retrieve bus and dimm handle and return if this bus supports * get_config_data commands */ -static int __validate_dimm(struct nvdimm_drvdata *ndd) +int nvdimm_check_config_data(struct device *dev) { - struct nvdimm *nvdimm; + struct nvdimm *nvdimm = to_nvdimm(dev); - if (!ndd) - return -EINVAL; - - nvdimm = to_nvdimm(ndd->dev); - - if (!nvdimm->cmd_mask) - return -ENXIO; - if (!test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask)) - return -ENXIO; + if (!nvdimm->cmd_mask || + !test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask)) { + if (nvdimm->flags & NDD_ALIASING) + return -ENXIO; + else + return -ENOTTY; + } return 0; } static int validate_dimm(struct nvdimm_drvdata *ndd) { - int rc = __validate_dimm(ndd); + int rc; - if (rc && ndd) + if (!ndd) + return -EINVAL; + + rc = nvdimm_check_config_data(ndd->dev); + if (rc) dev_dbg(ndd->dev, "%pf: %s error: %d\n", __builtin_return_address(0), __func__, rc); return rc; @@ -263,6 +265,12 @@ const char *nvdimm_name(struct nvdimm *nvdimm) } EXPORT_SYMBOL_GPL(nvdimm_name); +struct kobject *nvdimm_kobj(struct nvdimm *nvdimm) +{ + return &nvdimm->dev.kobj; +} +EXPORT_SYMBOL_GPL(nvdimm_kobj); + unsigned long nvdimm_cmd_mask(struct nvdimm *nvdimm) { return nvdimm->cmd_mask; @@ -378,40 +386,166 @@ struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, } EXPORT_SYMBOL_GPL(nvdimm_create); +int alias_dpa_busy(struct device *dev, void *data) +{ + resource_size_t map_end, blk_start, new, busy; + struct blk_alloc_info *info = data; + struct nd_mapping *nd_mapping; + struct nd_region *nd_region; + struct nvdimm_drvdata *ndd; + struct resource *res; + int i; + + if (!is_nd_pmem(dev)) + return 0; + + nd_region = to_nd_region(dev); + for (i = 0; i < nd_region->ndr_mappings; i++) { + nd_mapping = &nd_region->mapping[i]; + if (nd_mapping->nvdimm == info->nd_mapping->nvdimm) + break; + } + + if (i >= nd_region->ndr_mappings) + return 0; + + ndd = to_ndd(nd_mapping); + map_end = nd_mapping->start + nd_mapping->size - 1; + blk_start = nd_mapping->start; + + /* + * In the allocation case ->res is set to free space that we are + * looking to validate against PMEM aliasing collision rules + * (i.e. BLK is allocated after all aliased PMEM). + */ + if (info->res) { + if (info->res->start >= nd_mapping->start + && info->res->start < map_end) + /* pass */; + else + return 0; + } + + retry: + /* + * Find the free dpa from the end of the last pmem allocation to + * the end of the interleave-set mapping that is not already + * covered by a blk allocation. + */ + busy = 0; + for_each_dpa_resource(ndd, res) { + if ((res->start >= blk_start && res->start < map_end) + || (res->end >= blk_start + && res->end <= map_end)) { + if (strncmp(res->name, "pmem", 4) == 0) { + new = max(blk_start, min(map_end + 1, + res->end + 1)); + if (new != blk_start) { + blk_start = new; + goto retry; + } + } else + busy += min(map_end, res->end) + - max(nd_mapping->start, res->start) + 1; + } else if (nd_mapping->start > res->start + && map_end < res->end) { + /* total eclipse of the PMEM region mapping */ + busy += nd_mapping->size; + break; + } + } + + /* update the free space range with the probed blk_start */ + if (info->res && blk_start > info->res->start) { + info->res->start = max(info->res->start, blk_start); + if (info->res->start > info->res->end) + info->res->end = info->res->start - 1; + return 1; + } + + info->available -= blk_start - nd_mapping->start + busy; + + return 0; +} + +static int blk_dpa_busy(struct device *dev, void *data) +{ + struct blk_alloc_info *info = data; + struct nd_mapping *nd_mapping; + struct nd_region *nd_region; + resource_size_t map_end; + int i; + + if (!is_nd_pmem(dev)) + return 0; + + nd_region = to_nd_region(dev); + for (i = 0; i < nd_region->ndr_mappings; i++) { + nd_mapping = &nd_region->mapping[i]; + if (nd_mapping->nvdimm == info->nd_mapping->nvdimm) + break; + } + + if (i >= nd_region->ndr_mappings) + return 0; + + map_end = nd_mapping->start + nd_mapping->size - 1; + if (info->res->start >= nd_mapping->start + && info->res->start < map_end) { + if (info->res->end <= map_end) { + info->busy = 0; + return 1; + } else { + info->busy -= info->res->end - map_end; + return 0; + } + } else if (info->res->end >= nd_mapping->start + && info->res->end <= map_end) { + info->busy -= nd_mapping->start - info->res->start; + return 0; + } else { + info->busy -= nd_mapping->size; + return 0; + } +} + /** * nd_blk_available_dpa - account the unused dpa of BLK region * @nd_mapping: container of dpa-resource-root + labels * - * Unlike PMEM, BLK namespaces can occupy discontiguous DPA ranges. + * Unlike PMEM, BLK namespaces can occupy discontiguous DPA ranges, but + * we arrange for them to never start at an lower dpa than the last + * PMEM allocation in an aliased region. */ -resource_size_t nd_blk_available_dpa(struct nd_mapping *nd_mapping) +resource_size_t nd_blk_available_dpa(struct nd_region *nd_region) { + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); - resource_size_t map_end, busy = 0, available; + struct blk_alloc_info info = { + .nd_mapping = nd_mapping, + .available = nd_mapping->size, + .res = NULL, + }; struct resource *res; if (!ndd) return 0; - map_end = nd_mapping->start + nd_mapping->size - 1; - for_each_dpa_resource(ndd, res) - if (res->start >= nd_mapping->start && res->start < map_end) { - resource_size_t end = min(map_end, res->end); + device_for_each_child(&nvdimm_bus->dev, &info, alias_dpa_busy); - busy += end - res->start + 1; - } else if (res->end >= nd_mapping->start - && res->end <= map_end) { - busy += res->end - nd_mapping->start; - } else if (nd_mapping->start > res->start - && nd_mapping->start < res->end) { - /* total eclipse of the BLK region mapping */ - busy += nd_mapping->size; - } + /* now account for busy blk allocations in unaliased dpa */ + for_each_dpa_resource(ndd, res) { + if (strncmp(res->name, "blk", 3) != 0) + continue; - available = map_end - nd_mapping->start + 1; - if (busy < available) - return available - busy; - return 0; + info.res = res; + info.busy = resource_size(res); + device_for_each_child(&nvdimm_bus->dev, &info, blk_dpa_busy); + info.available -= info.busy; + } + + return info.available; } /** @@ -443,21 +577,16 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, map_start = nd_mapping->start; map_end = map_start + nd_mapping->size - 1; blk_start = max(map_start, map_end + 1 - *overlap); - for_each_dpa_resource(ndd, res) + for_each_dpa_resource(ndd, res) { if (res->start >= map_start && res->start < map_end) { if (strncmp(res->name, "blk", 3) == 0) - blk_start = min(blk_start, res->start); - else if (res->start != map_start) { + blk_start = min(blk_start, + max(map_start, res->start)); + else if (res->end > map_end) { reason = "misaligned to iset"; goto err; - } else { - if (busy) { - reason = "duplicate overlapping PMEM reservations?"; - goto err; - } + } else busy += resource_size(res); - continue; - } } else if (res->end >= map_start && res->end <= map_end) { if (strncmp(res->name, "blk", 3) == 0) { /* @@ -466,15 +595,14 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, * be used for BLK. */ blk_start = map_start; - } else { - reason = "misaligned to iset"; - goto err; - } + } else + busy += resource_size(res); } else if (map_start > res->start && map_start < res->end) { /* total eclipse of the mapping */ busy += nd_mapping->size; blk_start = map_start; } + } *overlap = map_end + 1 - blk_start; available = blk_start - map_start; @@ -483,10 +611,6 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, return 0; err: - /* - * Something is wrong, PMEM must align with the start of the - * interleave set, and there can only be one allocation per set. - */ nd_dbg_dpa(nd_region, ndd, res, "%s\n", reason); return 0; } diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c index 96526dcfdd37..fac7cabe8f56 100644 --- a/drivers/nvdimm/label.c +++ b/drivers/nvdimm/label.c @@ -494,11 +494,13 @@ static int __pmem_label_update(struct nd_region *nd_region, struct nd_mapping *nd_mapping, struct nd_namespace_pmem *nspm, int pos) { - u64 cookie = nd_region_interleave_set_cookie(nd_region), rawsize; + u64 cookie = nd_region_interleave_set_cookie(nd_region); struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); - struct nd_namespace_label *victim_label; + struct nd_label_ent *label_ent, *victim = NULL; struct nd_namespace_label *nd_label; struct nd_namespace_index *nsindex; + struct nd_label_id label_id; + struct resource *res; unsigned long *free; u32 nslot, slot; size_t offset; @@ -507,6 +509,16 @@ static int __pmem_label_update(struct nd_region *nd_region, if (!preamble_next(ndd, &nsindex, &free, &nslot)) return -ENXIO; + nd_label_gen_id(&label_id, nspm->uuid, 0); + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id.id) == 0) + break; + + if (!res) { + WARN_ON_ONCE(1); + return -ENXIO; + } + /* allocate and write the label to the staging (next) index */ slot = nd_label_alloc_slot(ndd); if (slot == UINT_MAX) @@ -522,11 +534,10 @@ static int __pmem_label_update(struct nd_region *nd_region, nd_label->nlabel = __cpu_to_le16(nd_region->ndr_mappings); nd_label->position = __cpu_to_le16(pos); nd_label->isetcookie = __cpu_to_le64(cookie); - rawsize = div_u64(resource_size(&nspm->nsio.res), - nd_region->ndr_mappings); - nd_label->rawsize = __cpu_to_le64(rawsize); - nd_label->dpa = __cpu_to_le64(nd_mapping->start); + nd_label->rawsize = __cpu_to_le64(resource_size(res)); + nd_label->dpa = __cpu_to_le64(res->start); nd_label->slot = __cpu_to_le32(slot); + nd_dbg_dpa(nd_region, ndd, res, "%s\n", __func__); /* update label */ offset = nd_label_offset(ndd, nd_label); @@ -536,38 +547,43 @@ static int __pmem_label_update(struct nd_region *nd_region, return rc; /* Garbage collect the previous label */ - victim_label = nd_mapping->labels[0]; - if (victim_label) { - slot = to_slot(ndd, victim_label); - nd_label_free_slot(ndd, slot); + mutex_lock(&nd_mapping->lock); + list_for_each_entry(label_ent, &nd_mapping->labels, list) { + if (!label_ent->label) + continue; + if (memcmp(nspm->uuid, label_ent->label->uuid, + NSLABEL_UUID_LEN) != 0) + continue; + victim = label_ent; + list_move_tail(&victim->list, &nd_mapping->labels); + break; + } + if (victim) { dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot); + slot = to_slot(ndd, victim->label); + nd_label_free_slot(ndd, slot); + victim->label = NULL; } /* update index */ rc = nd_label_write_index(ndd, ndd->ns_next, nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0); - if (rc < 0) - return rc; + if (rc == 0) { + list_for_each_entry(label_ent, &nd_mapping->labels, list) + if (!label_ent->label) { + label_ent->label = nd_label; + nd_label = NULL; + break; + } + dev_WARN_ONCE(&nspm->nsio.common.dev, nd_label, + "failed to track label: %d\n", + to_slot(ndd, nd_label)); + if (nd_label) + rc = -ENXIO; + } + mutex_unlock(&nd_mapping->lock); - nd_mapping->labels[0] = nd_label; - - return 0; -} - -static void del_label(struct nd_mapping *nd_mapping, int l) -{ - struct nd_namespace_label *next_label, *nd_label; - struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); - unsigned int slot; - int j; - - nd_label = nd_mapping->labels[l]; - slot = to_slot(ndd, nd_label); - dev_vdbg(ndd->dev, "%s: clear: %d\n", __func__, slot); - - for (j = l; (next_label = nd_mapping->labels[j + 1]); j++) - nd_mapping->labels[j] = next_label; - nd_mapping->labels[j] = NULL; + return rc; } static bool is_old_resource(struct resource *res, struct resource **list, int n) @@ -607,14 +623,16 @@ static int __blk_label_update(struct nd_region *nd_region, struct nd_mapping *nd_mapping, struct nd_namespace_blk *nsblk, int num_labels) { - int i, l, alloc, victims, nfree, old_num_resources, nlabel, rc = -ENXIO; + int i, alloc, victims, nfree, old_num_resources, nlabel, rc = -ENXIO; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); struct nd_namespace_label *nd_label; + struct nd_label_ent *label_ent, *e; struct nd_namespace_index *nsindex; unsigned long *free, *victim_map = NULL; struct resource *res, **old_res_list; struct nd_label_id label_id; u8 uuid[NSLABEL_UUID_LEN]; + LIST_HEAD(list); u32 nslot, slot; if (!preamble_next(ndd, &nsindex, &free, &nslot)) @@ -736,15 +754,22 @@ static int __blk_label_update(struct nd_region *nd_region, * entries in nd_mapping->labels */ nlabel = 0; - for_each_label(l, nd_label, nd_mapping->labels) { + mutex_lock(&nd_mapping->lock); + list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) { + nd_label = label_ent->label; + if (!nd_label) + continue; nlabel++; memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN); if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0) continue; nlabel--; - del_label(nd_mapping, l); - l--; /* retry with the new label at this index */ + list_move(&label_ent->list, &list); + label_ent->label = NULL; } + list_splice_tail_init(&list, &nd_mapping->labels); + mutex_unlock(&nd_mapping->lock); + if (nlabel + nsblk->num_resources > num_labels) { /* * Bug, we can't end up with more resources than @@ -755,6 +780,15 @@ static int __blk_label_update(struct nd_region *nd_region, goto out; } + mutex_lock(&nd_mapping->lock); + label_ent = list_first_entry_or_null(&nd_mapping->labels, + typeof(*label_ent), list); + if (!label_ent) { + WARN_ON(1); + mutex_unlock(&nd_mapping->lock); + rc = -ENXIO; + goto out; + } for_each_clear_bit_le(slot, free, nslot) { nd_label = nd_label_base(ndd) + slot; memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN); @@ -762,11 +796,19 @@ static int __blk_label_update(struct nd_region *nd_region, continue; res = to_resource(ndd, nd_label); res->flags &= ~DPA_RESOURCE_ADJUSTED; - dev_vdbg(&nsblk->common.dev, "assign label[%d] slot: %d\n", - l, slot); - nd_mapping->labels[l++] = nd_label; + dev_vdbg(&nsblk->common.dev, "assign label slot: %d\n", slot); + list_for_each_entry_from(label_ent, &nd_mapping->labels, list) { + if (label_ent->label) + continue; + label_ent->label = nd_label; + nd_label = NULL; + break; + } + if (nd_label) + dev_WARN(&nsblk->common.dev, + "failed to track label slot%d\n", slot); } - nd_mapping->labels[l] = NULL; + mutex_unlock(&nd_mapping->lock); out: kfree(old_res_list); @@ -788,32 +830,28 @@ static int __blk_label_update(struct nd_region *nd_region, static int init_labels(struct nd_mapping *nd_mapping, int num_labels) { - int i, l, old_num_labels = 0; + int i, old_num_labels = 0; + struct nd_label_ent *label_ent; struct nd_namespace_index *nsindex; - struct nd_namespace_label *nd_label; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); - size_t size = (num_labels + 1) * sizeof(struct nd_namespace_label *); - for_each_label(l, nd_label, nd_mapping->labels) + mutex_lock(&nd_mapping->lock); + list_for_each_entry(label_ent, &nd_mapping->labels, list) old_num_labels++; + mutex_unlock(&nd_mapping->lock); /* * We need to preserve all the old labels for the mapping so * they can be garbage collected after writing the new labels. */ - if (num_labels > old_num_labels) { - struct nd_namespace_label **labels; - - labels = krealloc(nd_mapping->labels, size, GFP_KERNEL); - if (!labels) + for (i = old_num_labels; i < num_labels; i++) { + label_ent = kzalloc(sizeof(*label_ent), GFP_KERNEL); + if (!label_ent) return -ENOMEM; - nd_mapping->labels = labels; + mutex_lock(&nd_mapping->lock); + list_add_tail(&label_ent->list, &nd_mapping->labels); + mutex_unlock(&nd_mapping->lock); } - if (!nd_mapping->labels) - return -ENOMEM; - - for (i = old_num_labels; i <= num_labels; i++) - nd_mapping->labels[i] = NULL; if (ndd->ns_current == -1 || ndd->ns_next == -1) /* pass */; @@ -837,42 +875,45 @@ static int init_labels(struct nd_mapping *nd_mapping, int num_labels) static int del_labels(struct nd_mapping *nd_mapping, u8 *uuid) { struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); - struct nd_namespace_label *nd_label; + struct nd_label_ent *label_ent, *e; struct nd_namespace_index *nsindex; u8 label_uuid[NSLABEL_UUID_LEN]; - int l, num_freed = 0; unsigned long *free; + LIST_HEAD(list); u32 nslot, slot; + int active = 0; if (!uuid) return 0; /* no index || no labels == nothing to delete */ - if (!preamble_next(ndd, &nsindex, &free, &nslot) - || !nd_mapping->labels) + if (!preamble_next(ndd, &nsindex, &free, &nslot)) return 0; - for_each_label(l, nd_label, nd_mapping->labels) { + mutex_lock(&nd_mapping->lock); + list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) { + struct nd_namespace_label *nd_label = label_ent->label; + + if (!nd_label) + continue; + active++; memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN); if (memcmp(label_uuid, uuid, NSLABEL_UUID_LEN) != 0) continue; + active--; slot = to_slot(ndd, nd_label); nd_label_free_slot(ndd, slot); dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot); - del_label(nd_mapping, l); - num_freed++; - l--; /* retry with new label at this index */ + list_move_tail(&label_ent->list, &list); + label_ent->label = NULL; } + list_splice_tail_init(&list, &nd_mapping->labels); - if (num_freed > l) { - /* - * num_freed will only ever be > l when we delete the last - * label - */ - kfree(nd_mapping->labels); - nd_mapping->labels = NULL; - dev_dbg(ndd->dev, "%s: no more labels\n", __func__); + if (active == 0) { + nd_mapping_free_labels(nd_mapping); + dev_dbg(ndd->dev, "%s: no more active labels\n", __func__); } + mutex_unlock(&nd_mapping->lock); return nd_label_write_index(ndd, ndd->ns_next, nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0); @@ -885,7 +926,9 @@ int nd_pmem_namespace_label_update(struct nd_region *nd_region, for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - int rc; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + int rc, count = 0; if (size == 0) { rc = del_labels(nd_mapping, nspm->uuid); @@ -894,7 +937,12 @@ int nd_pmem_namespace_label_update(struct nd_region *nd_region, continue; } - rc = init_labels(nd_mapping, 1); + for_each_dpa_resource(ndd, res) + if (strncmp(res->name, "pmem", 3) == 0) + count++; + WARN_ON_ONCE(!count); + + rc = init_labels(nd_mapping, count); if (rc < 0) return rc; diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index c5e3196c45b0..3509cff68ef9 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -12,8 +12,10 @@ */ #include #include +#include #include #include +#include #include #include "nd-core.h" #include "nd.h" @@ -28,7 +30,10 @@ static void namespace_io_release(struct device *dev) static void namespace_pmem_release(struct device *dev) { struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + struct nd_region *nd_region = to_nd_region(dev->parent); + if (nspm->id >= 0) + ida_simple_remove(&nd_region->ns_ida, nspm->id); kfree(nspm->alt_name); kfree(nspm->uuid); kfree(nspm); @@ -62,17 +67,17 @@ static struct device_type namespace_blk_device_type = { .release = namespace_blk_release, }; -static bool is_namespace_pmem(struct device *dev) +static bool is_namespace_pmem(const struct device *dev) { return dev ? dev->type == &namespace_pmem_device_type : false; } -static bool is_namespace_blk(struct device *dev) +static bool is_namespace_blk(const struct device *dev) { return dev ? dev->type == &namespace_blk_device_type : false; } -static bool is_namespace_io(struct device *dev) +static bool is_namespace_io(const struct device *dev) { return dev ? dev->type == &namespace_io_device_type : false; } @@ -168,7 +173,21 @@ const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, suffix = "s"; if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev)) { - sprintf(name, "pmem%d%s", nd_region->id, suffix ? suffix : ""); + int nsidx = 0; + + if (is_namespace_pmem(&ndns->dev)) { + struct nd_namespace_pmem *nspm; + + nspm = to_nd_namespace_pmem(&ndns->dev); + nsidx = nspm->id; + } + + if (nsidx) + sprintf(name, "pmem%d.%d%s", nd_region->id, nsidx, + suffix ? suffix : ""); + else + sprintf(name, "pmem%d%s", nd_region->id, + suffix ? suffix : ""); } else if (is_namespace_blk(&ndns->dev)) { struct nd_namespace_blk *nsblk; @@ -294,7 +313,7 @@ static bool __nd_namespace_blk_validate(struct nd_namespace_blk *nsblk) if (strcmp(res->name, label_id.id) != 0) continue; /* - * Resources with unacknoweldged adjustments indicate a + * Resources with unacknowledged adjustments indicate a * failure to update labels */ if (res->flags & DPA_RESOURCE_ADJUSTED) @@ -510,19 +529,68 @@ static resource_size_t init_dpa_allocation(struct nd_label_id *label_id, return rc ? n : 0; } -static bool space_valid(bool is_pmem, bool is_reserve, - struct nd_label_id *label_id, struct resource *res) + +/** + * space_valid() - validate free dpa space against constraints + * @nd_region: hosting region of the free space + * @ndd: dimm device data for debug + * @label_id: namespace id to allocate space + * @prev: potential allocation that precedes free space + * @next: allocation that follows the given free space range + * @exist: first allocation with same id in the mapping + * @n: range that must satisfied for pmem allocations + * @valid: free space range to validate + * + * BLK-space is valid as long as it does not precede a PMEM + * allocation in a given region. PMEM-space must be contiguous + * and adjacent to an existing existing allocation (if one + * exists). If reserving PMEM any space is valid. + */ +static void space_valid(struct nd_region *nd_region, struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id, struct resource *prev, + struct resource *next, struct resource *exist, + resource_size_t n, struct resource *valid) { - /* - * For BLK-space any space is valid, for PMEM-space, it must be - * contiguous with an existing allocation unless we are - * reserving pmem. - */ - if (is_reserve || !is_pmem) - return true; - if (!res || strcmp(res->name, label_id->id) == 0) - return true; - return false; + bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0; + bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0; + + if (valid->start >= valid->end) + goto invalid; + + if (is_reserve) + return; + + if (!is_pmem) { + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_bus *nvdimm_bus; + struct blk_alloc_info info = { + .nd_mapping = nd_mapping, + .available = nd_mapping->size, + .res = valid, + }; + + WARN_ON(!is_nd_blk(&nd_region->dev)); + nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); + device_for_each_child(&nvdimm_bus->dev, &info, alias_dpa_busy); + return; + } + + /* allocation needs to be contiguous, so this is all or nothing */ + if (resource_size(valid) < n) + goto invalid; + + /* we've got all the space we need and no existing allocation */ + if (!exist) + return; + + /* allocation needs to be contiguous with the existing namespace */ + if (valid->start == exist->end + 1 + || valid->end == exist->start - 1) + return; + + invalid: + /* truncate @valid size to 0 */ + valid->end = valid->start - 1; } enum alloc_loc { @@ -534,18 +602,24 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, resource_size_t n) { resource_size_t mapping_end = nd_mapping->start + nd_mapping->size - 1; - bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0; bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res, *exist = NULL, valid; const resource_size_t to_allocate = n; - struct resource *res; int first; + for_each_dpa_resource(ndd, res) + if (strcmp(label_id->id, res->name) == 0) + exist = res; + + valid.start = nd_mapping->start; + valid.end = mapping_end; + valid.name = "free space"; retry: first = 0; for_each_dpa_resource(ndd, res) { - resource_size_t allocate, available = 0, free_start, free_end; struct resource *next = res->sibling, *new_res = NULL; + resource_size_t allocate, available = 0; enum alloc_loc loc = ALLOC_ERR; const char *action; int rc = 0; @@ -558,32 +632,35 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, /* space at the beginning of the mapping */ if (!first++ && res->start > nd_mapping->start) { - free_start = nd_mapping->start; - available = res->start - free_start; - if (space_valid(is_pmem, is_reserve, label_id, NULL)) + valid.start = nd_mapping->start; + valid.end = res->start - 1; + space_valid(nd_region, ndd, label_id, NULL, next, exist, + to_allocate, &valid); + available = resource_size(&valid); + if (available) loc = ALLOC_BEFORE; } /* space between allocations */ if (!loc && next) { - free_start = res->start + resource_size(res); - free_end = min(mapping_end, next->start - 1); - if (space_valid(is_pmem, is_reserve, label_id, res) - && free_start < free_end) { - available = free_end + 1 - free_start; + valid.start = res->start + resource_size(res); + valid.end = min(mapping_end, next->start - 1); + space_valid(nd_region, ndd, label_id, res, next, exist, + to_allocate, &valid); + available = resource_size(&valid); + if (available) loc = ALLOC_MID; - } } /* space at the end of the mapping */ if (!loc && !next) { - free_start = res->start + resource_size(res); - free_end = mapping_end; - if (space_valid(is_pmem, is_reserve, label_id, res) - && free_start < free_end) { - available = free_end + 1 - free_start; + valid.start = res->start + resource_size(res); + valid.end = mapping_end; + space_valid(nd_region, ndd, label_id, res, next, exist, + to_allocate, &valid); + available = resource_size(&valid); + if (available) loc = ALLOC_AFTER; - } } if (!loc || !available) @@ -593,8 +670,6 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, case ALLOC_BEFORE: if (strcmp(res->name, label_id->id) == 0) { /* adjust current resource up */ - if (is_pmem && !is_reserve) - return n; rc = adjust_resource(res, res->start - allocate, resource_size(res) + allocate); action = "cur grow up"; @@ -604,8 +679,6 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, case ALLOC_MID: if (strcmp(next->name, label_id->id) == 0) { /* adjust next resource up */ - if (is_pmem && !is_reserve) - return n; rc = adjust_resource(next, next->start - allocate, resource_size(next) + allocate); @@ -629,12 +702,10 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, if (strcmp(action, "allocate") == 0) { /* BLK allocate bottom up */ if (!is_pmem) - free_start += available - allocate; - else if (!is_reserve && free_start != nd_mapping->start) - return n; + valid.start += available - allocate; new_res = nvdimm_allocate_dpa(ndd, label_id, - free_start, allocate); + valid.start, allocate); if (!new_res) rc = -EBUSY; } else if (strcmp(action, "grow down") == 0) { @@ -832,13 +903,45 @@ static int grow_dpa_allocation(struct nd_region *nd_region, return 0; } -static void nd_namespace_pmem_set_size(struct nd_region *nd_region, +static void nd_namespace_pmem_set_resource(struct nd_region *nd_region, struct nd_namespace_pmem *nspm, resource_size_t size) { struct resource *res = &nspm->nsio.res; + resource_size_t offset = 0; - res->start = nd_region->ndr_start; - res->end = nd_region->ndr_start + size - 1; + if (size && !nspm->uuid) { + WARN_ON_ONCE(1); + size = 0; + } + + if (size && nspm->uuid) { + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_label_id label_id; + struct resource *res; + + if (!ndd) { + size = 0; + goto out; + } + + nd_label_gen_id(&label_id, nspm->uuid, 0); + + /* calculate a spa offset from the dpa allocation offset */ + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id.id) == 0) { + offset = (res->start - nd_mapping->start) + * nd_region->ndr_mappings; + goto out; + } + + WARN_ON_ONCE(1); + size = 0; + } + + out: + res->start = nd_region->ndr_start + offset; + res->end = res->start + size - 1; } static bool uuid_not_set(const u8 *uuid, struct device *dev, const char *where) @@ -929,7 +1032,7 @@ static ssize_t __size_store(struct device *dev, unsigned long long val) if (is_namespace_pmem(dev)) { struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); - nd_namespace_pmem_set_size(nd_region, nspm, + nd_namespace_pmem_set_resource(nd_region, nspm, val * nd_region->ndr_mappings); } else if (is_namespace_blk(dev)) { struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); @@ -1031,22 +1134,27 @@ static ssize_t size_show(struct device *dev, } static DEVICE_ATTR(size, S_IRUGO, size_show, size_store); -static ssize_t uuid_show(struct device *dev, - struct device_attribute *attr, char *buf) +static u8 *namespace_to_uuid(struct device *dev) { - u8 *uuid; - if (is_namespace_pmem(dev)) { struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); - uuid = nspm->uuid; + return nspm->uuid; } else if (is_namespace_blk(dev)) { struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); - uuid = nsblk->uuid; + return nsblk->uuid; } else - return -ENXIO; + return ERR_PTR(-ENXIO); +} +static ssize_t uuid_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u8 *uuid = namespace_to_uuid(dev); + + if (IS_ERR(uuid)) + return PTR_ERR(uuid); if (uuid) return sprintf(buf, "%pUb\n", uuid); return sprintf(buf, "\n"); @@ -1089,7 +1197,7 @@ static int namespace_update_uuid(struct nd_region *nd_region, * * FIXME: can we delete uuid with zero dpa allocated? */ - if (nd_mapping->labels) + if (list_empty(&nd_mapping->labels)) return -EBUSY; } @@ -1491,14 +1599,19 @@ static bool has_uuid_at_pos(struct nd_region *nd_region, u8 *uuid, for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - struct nd_namespace_label *nd_label; + struct nd_label_ent *label_ent; bool found_uuid = false; - int l; - for_each_label(l, nd_label, nd_mapping->labels) { - u64 isetcookie = __le64_to_cpu(nd_label->isetcookie); - u16 position = __le16_to_cpu(nd_label->position); - u16 nlabel = __le16_to_cpu(nd_label->nlabel); + list_for_each_entry(label_ent, &nd_mapping->labels, list) { + struct nd_namespace_label *nd_label = label_ent->label; + u16 position, nlabel; + u64 isetcookie; + + if (!nd_label) + continue; + isetcookie = __le64_to_cpu(nd_label->isetcookie); + position = __le16_to_cpu(nd_label->position); + nlabel = __le16_to_cpu(nd_label->nlabel); if (isetcookie != cookie) continue; @@ -1528,7 +1641,6 @@ static bool has_uuid_at_pos(struct nd_region *nd_region, u8 *uuid, static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id) { - struct nd_namespace_label *select = NULL; int i; if (!pmem_id) @@ -1536,90 +1648,106 @@ static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id) for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - struct nd_namespace_label *nd_label; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_namespace_label *nd_label = NULL; u64 hw_start, hw_end, pmem_start, pmem_end; - int l; + struct nd_label_ent *label_ent; - for_each_label(l, nd_label, nd_mapping->labels) + WARN_ON(!mutex_is_locked(&nd_mapping->lock)); + list_for_each_entry(label_ent, &nd_mapping->labels, list) { + nd_label = label_ent->label; + if (!nd_label) + continue; if (memcmp(nd_label->uuid, pmem_id, NSLABEL_UUID_LEN) == 0) break; + nd_label = NULL; + } if (!nd_label) { WARN_ON(1); return -EINVAL; } - select = nd_label; /* * Check that this label is compliant with the dpa * range published in NFIT */ hw_start = nd_mapping->start; hw_end = hw_start + nd_mapping->size; - pmem_start = __le64_to_cpu(select->dpa); - pmem_end = pmem_start + __le64_to_cpu(select->rawsize); - if (pmem_start == hw_start && pmem_end <= hw_end) + pmem_start = __le64_to_cpu(nd_label->dpa); + pmem_end = pmem_start + __le64_to_cpu(nd_label->rawsize); + if (pmem_start >= hw_start && pmem_start < hw_end + && pmem_end <= hw_end && pmem_end > hw_start) /* pass */; - else + else { + dev_dbg(&nd_region->dev, "%s invalid label for %pUb\n", + dev_name(ndd->dev), nd_label->uuid); return -EINVAL; + } - nd_mapping->labels[0] = select; - nd_mapping->labels[1] = NULL; + /* move recently validated label to the front of the list */ + list_move(&label_ent->list, &nd_mapping->labels); } return 0; } /** - * find_pmem_label_set - validate interleave set labelling, retrieve label0 + * create_namespace_pmem - validate interleave set labelling, retrieve label0 * @nd_region: region with mappings to validate + * @nspm: target namespace to create + * @nd_label: target pmem namespace label to evaluate */ -static int find_pmem_label_set(struct nd_region *nd_region, - struct nd_namespace_pmem *nspm) +struct device *create_namespace_pmem(struct nd_region *nd_region, + struct nd_namespace_label *nd_label) { u64 cookie = nd_region_interleave_set_cookie(nd_region); - struct nd_namespace_label *nd_label; - u8 select_id[NSLABEL_UUID_LEN]; + struct nd_label_ent *label_ent; + struct nd_namespace_pmem *nspm; + struct nd_mapping *nd_mapping; resource_size_t size = 0; - u8 *pmem_id = NULL; - int rc = -ENODEV, l; + struct resource *res; + struct device *dev; + int rc = 0; u16 i; - if (cookie == 0) - return -ENXIO; + if (cookie == 0) { + dev_dbg(&nd_region->dev, "invalid interleave-set-cookie\n"); + return ERR_PTR(-ENXIO); + } - /* - * Find a complete set of labels by uuid. By definition we can start - * with any mapping as the reference label - */ - for_each_label(l, nd_label, nd_region->mapping[0].labels) { - u64 isetcookie = __le64_to_cpu(nd_label->isetcookie); + if (__le64_to_cpu(nd_label->isetcookie) != cookie) { + dev_dbg(&nd_region->dev, "invalid cookie in label: %pUb\n", + nd_label->uuid); + return ERR_PTR(-EAGAIN); + } - if (isetcookie != cookie) - continue; + nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); + if (!nspm) + return ERR_PTR(-ENOMEM); - for (i = 0; nd_region->ndr_mappings; i++) - if (!has_uuid_at_pos(nd_region, nd_label->uuid, - cookie, i)) - break; - if (i < nd_region->ndr_mappings) { - /* - * Give up if we don't find an instance of a - * uuid at each position (from 0 to - * nd_region->ndr_mappings - 1), or if we find a - * dimm with two instances of the same uuid. - */ - rc = -EINVAL; - goto err; - } else if (pmem_id) { - /* - * If there is more than one valid uuid set, we - * need userspace to clean this up. - */ - rc = -EBUSY; - goto err; - } - memcpy(select_id, nd_label->uuid, NSLABEL_UUID_LEN); - pmem_id = select_id; + nspm->id = -1; + dev = &nspm->nsio.common.dev; + dev->type = &namespace_pmem_device_type; + dev->parent = &nd_region->dev; + res = &nspm->nsio.res; + res->name = dev_name(&nd_region->dev); + res->flags = IORESOURCE_MEM; + + for (i = 0; i < nd_region->ndr_mappings; i++) + if (!has_uuid_at_pos(nd_region, nd_label->uuid, cookie, i)) + break; + if (i < nd_region->ndr_mappings) { + struct nvdimm_drvdata *ndd = to_ndd(&nd_region->mapping[i]); + + /* + * Give up if we don't find an instance of a uuid at each + * position (from 0 to nd_region->ndr_mappings - 1), or if we + * find a dimm with two instances of the same uuid. + */ + dev_err(&nd_region->dev, "%s missing label for %pUb\n", + dev_name(ndd->dev), nd_label->uuid); + rc = -EINVAL; + goto err; } /* @@ -1630,14 +1758,23 @@ static int find_pmem_label_set(struct nd_region *nd_region, * the dimm being enabled (i.e. nd_label_reserve_dpa() * succeeded). */ - rc = select_pmem_id(nd_region, pmem_id); + rc = select_pmem_id(nd_region, nd_label->uuid); if (rc) goto err; /* Calculate total size and populate namespace properties from label0 */ for (i = 0; i < nd_region->ndr_mappings; i++) { - struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - struct nd_namespace_label *label0 = nd_mapping->labels[0]; + struct nd_namespace_label *label0; + + nd_mapping = &nd_region->mapping[i]; + label_ent = list_first_entry_or_null(&nd_mapping->labels, + typeof(*label_ent), list); + label0 = label_ent ? label_ent->label : 0; + + if (!label0) { + WARN_ON(1); + continue; + } size += __le64_to_cpu(label0->rawsize); if (__le16_to_cpu(label0->position) != 0) @@ -1654,10 +1791,11 @@ static int find_pmem_label_set(struct nd_region *nd_region, goto err; } - nd_namespace_pmem_set_size(nd_region, nspm, size); + nd_namespace_pmem_set_resource(nd_region, nspm, size); - return 0; + return dev; err: + namespace_pmem_release(dev); switch (rc) { case -EINVAL: dev_dbg(&nd_region->dev, "%s: invalid label(s)\n", __func__); @@ -1670,55 +1808,7 @@ static int find_pmem_label_set(struct nd_region *nd_region, __func__, rc); break; } - return rc; -} - -static struct device **create_namespace_pmem(struct nd_region *nd_region) -{ - struct nd_namespace_pmem *nspm; - struct device *dev, **devs; - struct resource *res; - int rc; - - nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); - if (!nspm) - return NULL; - - dev = &nspm->nsio.common.dev; - dev->type = &namespace_pmem_device_type; - dev->parent = &nd_region->dev; - res = &nspm->nsio.res; - res->name = dev_name(&nd_region->dev); - res->flags = IORESOURCE_MEM; - rc = find_pmem_label_set(nd_region, nspm); - if (rc == -ENODEV) { - int i; - - /* Pass, try to permit namespace creation... */ - for (i = 0; i < nd_region->ndr_mappings; i++) { - struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - - kfree(nd_mapping->labels); - nd_mapping->labels = NULL; - } - - /* Publish a zero-sized namespace for userspace to configure. */ - nd_namespace_pmem_set_size(nd_region, nspm, 0); - - rc = 0; - } else if (rc) - goto err; - - devs = kcalloc(2, sizeof(struct device *), GFP_KERNEL); - if (!devs) - goto err; - - devs[0] = dev; - return devs; - - err: - namespace_pmem_release(&nspm->nsio.common.dev); - return NULL; + return ERR_PTR(rc); } struct resource *nsblk_add_resource(struct nd_region *nd_region, @@ -1770,16 +1860,58 @@ static struct device *nd_namespace_blk_create(struct nd_region *nd_region) return &nsblk->common.dev; } -void nd_region_create_blk_seed(struct nd_region *nd_region) +static struct device *nd_namespace_pmem_create(struct nd_region *nd_region) +{ + struct nd_namespace_pmem *nspm; + struct resource *res; + struct device *dev; + + if (!is_nd_pmem(&nd_region->dev)) + return NULL; + + nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); + if (!nspm) + return NULL; + + dev = &nspm->nsio.common.dev; + dev->type = &namespace_pmem_device_type; + dev->parent = &nd_region->dev; + res = &nspm->nsio.res; + res->name = dev_name(&nd_region->dev); + res->flags = IORESOURCE_MEM; + + nspm->id = ida_simple_get(&nd_region->ns_ida, 0, 0, GFP_KERNEL); + if (nspm->id < 0) { + kfree(nspm); + return NULL; + } + dev_set_name(dev, "namespace%d.%d", nd_region->id, nspm->id); + dev->parent = &nd_region->dev; + dev->groups = nd_namespace_attribute_groups; + nd_namespace_pmem_set_resource(nd_region, nspm, 0); + + return dev; +} + +void nd_region_create_ns_seed(struct nd_region *nd_region) { WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); - nd_region->ns_seed = nd_namespace_blk_create(nd_region); + + if (nd_region_to_nstype(nd_region) == ND_DEVICE_NAMESPACE_IO) + return; + + if (is_nd_blk(&nd_region->dev)) + nd_region->ns_seed = nd_namespace_blk_create(nd_region); + else + nd_region->ns_seed = nd_namespace_pmem_create(nd_region); + /* * Seed creation failures are not fatal, provisioning is simply * disabled until memory becomes available */ if (!nd_region->ns_seed) - dev_err(&nd_region->dev, "failed to create blk namespace\n"); + dev_err(&nd_region->dev, "failed to create %s namespace\n", + is_nd_blk(&nd_region->dev) ? "blk" : "pmem"); else nd_device_register(nd_region->ns_seed); } @@ -1820,43 +1952,137 @@ void nd_region_create_btt_seed(struct nd_region *nd_region) dev_err(&nd_region->dev, "failed to create btt namespace\n"); } -static struct device **create_namespace_blk(struct nd_region *nd_region) +static int add_namespace_resource(struct nd_region *nd_region, + struct nd_namespace_label *nd_label, struct device **devs, + int count) { struct nd_mapping *nd_mapping = &nd_region->mapping[0]; - struct nd_namespace_label *nd_label; - struct device *dev, **devs = NULL; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + int i; + + for (i = 0; i < count; i++) { + u8 *uuid = namespace_to_uuid(devs[i]); + struct resource *res; + + if (IS_ERR_OR_NULL(uuid)) { + WARN_ON(1); + continue; + } + + if (memcmp(uuid, nd_label->uuid, NSLABEL_UUID_LEN) != 0) + continue; + if (is_namespace_blk(devs[i])) { + res = nsblk_add_resource(nd_region, ndd, + to_nd_namespace_blk(devs[i]), + __le64_to_cpu(nd_label->dpa)); + if (!res) + return -ENXIO; + nd_dbg_dpa(nd_region, ndd, res, "%d assign\n", count); + } else { + dev_err(&nd_region->dev, + "error: conflicting extents for uuid: %pUb\n", + nd_label->uuid); + return -ENXIO; + } + break; + } + + return i; +} + +struct device *create_namespace_blk(struct nd_region *nd_region, + struct nd_namespace_label *nd_label, int count) +{ + + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); struct nd_namespace_blk *nsblk; - struct nvdimm_drvdata *ndd; - int i, l, count = 0; + char *name[NSLABEL_NAME_LEN]; + struct device *dev = NULL; struct resource *res; - if (nd_region->ndr_mappings == 0) - return NULL; + nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); + if (!nsblk) + return ERR_PTR(-ENOMEM); + dev = &nsblk->common.dev; + dev->type = &namespace_blk_device_type; + dev->parent = &nd_region->dev; + nsblk->id = -1; + nsblk->lbasize = __le64_to_cpu(nd_label->lbasize); + nsblk->uuid = kmemdup(nd_label->uuid, NSLABEL_UUID_LEN, + GFP_KERNEL); + if (!nsblk->uuid) + goto blk_err; + memcpy(name, nd_label->name, NSLABEL_NAME_LEN); + if (name[0]) + nsblk->alt_name = kmemdup(name, NSLABEL_NAME_LEN, + GFP_KERNEL); + res = nsblk_add_resource(nd_region, ndd, nsblk, + __le64_to_cpu(nd_label->dpa)); + if (!res) + goto blk_err; + nd_dbg_dpa(nd_region, ndd, res, "%d: assign\n", count); + return dev; + blk_err: + namespace_blk_release(dev); + return ERR_PTR(-ENXIO); +} - ndd = to_ndd(nd_mapping); - for_each_label(l, nd_label, nd_mapping->labels) { - u32 flags = __le32_to_cpu(nd_label->flags); - char *name[NSLABEL_NAME_LEN]; +static int cmp_dpa(const void *a, const void *b) +{ + const struct device *dev_a = *(const struct device **) a; + const struct device *dev_b = *(const struct device **) b; + struct nd_namespace_blk *nsblk_a, *nsblk_b; + struct nd_namespace_pmem *nspm_a, *nspm_b; + + if (is_namespace_io(dev_a)) + return 0; + + if (is_namespace_blk(dev_a)) { + nsblk_a = to_nd_namespace_blk(dev_a); + nsblk_b = to_nd_namespace_blk(dev_b); + + return memcmp(&nsblk_a->res[0]->start, &nsblk_b->res[0]->start, + sizeof(resource_size_t)); + } + + nspm_a = to_nd_namespace_pmem(dev_a); + nspm_b = to_nd_namespace_pmem(dev_b); + + return memcmp(&nspm_a->nsio.res.start, &nspm_b->nsio.res.start, + sizeof(resource_size_t)); +} + +static struct device **scan_labels(struct nd_region *nd_region) +{ + int i, count = 0; + struct device *dev, **devs = NULL; + struct nd_label_ent *label_ent, *e; + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + resource_size_t map_end = nd_mapping->start + nd_mapping->size - 1; + + /* "safe" because create_namespace_pmem() might list_move() label_ent */ + list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) { + struct nd_namespace_label *nd_label = label_ent->label; struct device **__devs; + u32 flags; - if (flags & NSLABEL_FLAG_LOCAL) - /* pass */; + if (!nd_label) + continue; + flags = __le32_to_cpu(nd_label->flags); + if (is_nd_blk(&nd_region->dev) + == !!(flags & NSLABEL_FLAG_LOCAL)) + /* pass, region matches label type */; else continue; - for (i = 0; i < count; i++) { - nsblk = to_nd_namespace_blk(devs[i]); - if (memcmp(nsblk->uuid, nd_label->uuid, - NSLABEL_UUID_LEN) == 0) { - res = nsblk_add_resource(nd_region, ndd, nsblk, - __le64_to_cpu(nd_label->dpa)); - if (!res) - goto err; - nd_dbg_dpa(nd_region, ndd, res, "%s assign\n", - dev_name(&nsblk->common.dev)); - break; - } - } + /* skip labels that describe extents outside of the region */ + if (nd_label->dpa < nd_mapping->start || nd_label->dpa > map_end) + continue; + + i = add_namespace_resource(nd_region, nd_label, devs, count); + if (i < 0) + goto err; if (i < count) continue; __devs = kcalloc(count + 2, sizeof(dev), GFP_KERNEL); @@ -1866,67 +2092,126 @@ static struct device **create_namespace_blk(struct nd_region *nd_region) kfree(devs); devs = __devs; - nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); - if (!nsblk) - goto err; - dev = &nsblk->common.dev; - dev->type = &namespace_blk_device_type; - dev->parent = &nd_region->dev; - dev_set_name(dev, "namespace%d.%d", nd_region->id, count); - devs[count++] = dev; - nsblk->id = -1; - nsblk->lbasize = __le64_to_cpu(nd_label->lbasize); - nsblk->uuid = kmemdup(nd_label->uuid, NSLABEL_UUID_LEN, - GFP_KERNEL); - if (!nsblk->uuid) - goto err; - memcpy(name, nd_label->name, NSLABEL_NAME_LEN); - if (name[0]) - nsblk->alt_name = kmemdup(name, NSLABEL_NAME_LEN, - GFP_KERNEL); - res = nsblk_add_resource(nd_region, ndd, nsblk, - __le64_to_cpu(nd_label->dpa)); - if (!res) - goto err; - nd_dbg_dpa(nd_region, ndd, res, "%s assign\n", - dev_name(&nsblk->common.dev)); + if (is_nd_blk(&nd_region->dev)) { + dev = create_namespace_blk(nd_region, nd_label, count); + if (IS_ERR(dev)) + goto err; + devs[count++] = dev; + } else { + dev = create_namespace_pmem(nd_region, nd_label); + if (IS_ERR(dev)) { + switch (PTR_ERR(dev)) { + case -EAGAIN: + /* skip invalid labels */ + continue; + case -ENODEV: + /* fallthrough to seed creation */ + break; + default: + goto err; + } + } else + devs[count++] = dev; + } } - dev_dbg(&nd_region->dev, "%s: discovered %d blk namespace%s\n", - __func__, count, count == 1 ? "" : "s"); + dev_dbg(&nd_region->dev, "%s: discovered %d %s namespace%s\n", + __func__, count, is_nd_blk(&nd_region->dev) + ? "blk" : "pmem", count == 1 ? "" : "s"); if (count == 0) { /* Publish a zero-sized namespace for userspace to configure. */ - for (i = 0; i < nd_region->ndr_mappings; i++) { - struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - - kfree(nd_mapping->labels); - nd_mapping->labels = NULL; - } + nd_mapping_free_labels(nd_mapping); devs = kcalloc(2, sizeof(dev), GFP_KERNEL); if (!devs) goto err; - nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); - if (!nsblk) - goto err; - dev = &nsblk->common.dev; - dev->type = &namespace_blk_device_type; + if (is_nd_blk(&nd_region->dev)) { + struct nd_namespace_blk *nsblk; + + nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); + if (!nsblk) + goto err; + dev = &nsblk->common.dev; + dev->type = &namespace_blk_device_type; + } else { + struct nd_namespace_pmem *nspm; + + nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); + if (!nspm) + goto err; + dev = &nspm->nsio.common.dev; + dev->type = &namespace_pmem_device_type; + nd_namespace_pmem_set_resource(nd_region, nspm, 0); + } dev->parent = &nd_region->dev; devs[count++] = dev; + } else if (is_nd_pmem(&nd_region->dev)) { + /* clean unselected labels */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct list_head *l, *e; + LIST_HEAD(list); + int j; + + nd_mapping = &nd_region->mapping[i]; + if (list_empty(&nd_mapping->labels)) { + WARN_ON(1); + continue; + } + + j = count; + list_for_each_safe(l, e, &nd_mapping->labels) { + if (!j--) + break; + list_move_tail(l, &list); + } + nd_mapping_free_labels(nd_mapping); + list_splice_init(&list, &nd_mapping->labels); + } } + if (count > 1) + sort(devs, count, sizeof(struct device *), cmp_dpa, NULL); + return devs; -err: - for (i = 0; i < count; i++) { - nsblk = to_nd_namespace_blk(devs[i]); - namespace_blk_release(&nsblk->common.dev); - } + err: + for (i = 0; devs[i]; i++) + if (is_nd_blk(&nd_region->dev)) + namespace_blk_release(devs[i]); + else + namespace_pmem_release(devs[i]); kfree(devs); return NULL; } +static struct device **create_namespaces(struct nd_region *nd_region) +{ + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct device **devs; + int i; + + if (nd_region->ndr_mappings == 0) + return NULL; + + /* lock down all mappings while we scan labels */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + nd_mapping = &nd_region->mapping[i]; + mutex_lock_nested(&nd_mapping->lock, i); + } + + devs = scan_labels(nd_region); + + for (i = 0; i < nd_region->ndr_mappings; i++) { + int reverse = nd_region->ndr_mappings - 1 - i; + + nd_mapping = &nd_region->mapping[reverse]; + mutex_unlock(&nd_mapping->lock); + } + + return devs; +} + static int init_active_labels(struct nd_region *nd_region) { int i; @@ -1935,6 +2220,7 @@ static int init_active_labels(struct nd_region *nd_region) struct nd_mapping *nd_mapping = &nd_region->mapping[i]; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); struct nvdimm *nvdimm = nd_mapping->nvdimm; + struct nd_label_ent *label_ent; int count, j; /* @@ -1956,16 +2242,27 @@ static int init_active_labels(struct nd_region *nd_region) dev_dbg(ndd->dev, "%s: %d\n", __func__, count); if (!count) continue; - nd_mapping->labels = kcalloc(count + 1, sizeof(void *), - GFP_KERNEL); - if (!nd_mapping->labels) - return -ENOMEM; for (j = 0; j < count; j++) { struct nd_namespace_label *label; + label_ent = kzalloc(sizeof(*label_ent), GFP_KERNEL); + if (!label_ent) + break; label = nd_label_active(ndd, j); - nd_mapping->labels[j] = label; + label_ent->label = label; + + mutex_lock(&nd_mapping->lock); + list_add_tail(&label_ent->list, &nd_mapping->labels); + mutex_unlock(&nd_mapping->lock); } + + if (j >= count) + continue; + + mutex_lock(&nd_mapping->lock); + nd_mapping_free_labels(nd_mapping); + mutex_unlock(&nd_mapping->lock); + return -ENOMEM; } return 0; @@ -1990,10 +2287,8 @@ int nd_region_register_namespaces(struct nd_region *nd_region, int *err) devs = create_namespace_io(nd_region); break; case ND_DEVICE_NAMESPACE_PMEM: - devs = create_namespace_pmem(nd_region); - break; case ND_DEVICE_NAMESPACE_BLK: - devs = create_namespace_blk(nd_region); + devs = create_namespaces(nd_region); break; default: break; @@ -2014,6 +2309,13 @@ int nd_region_register_namespaces(struct nd_region *nd_region, int *err) id = ida_simple_get(&nd_region->ns_ida, 0, 0, GFP_KERNEL); nsblk->id = id; + } else if (type == ND_DEVICE_NAMESPACE_PMEM) { + struct nd_namespace_pmem *nspm; + + nspm = to_nd_namespace_pmem(dev); + id = ida_simple_get(&nd_region->ns_ida, 0, 0, + GFP_KERNEL); + nspm->id = id; } else id = i; diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 38ce6bbbc170..8623e57c2ce3 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -44,6 +44,23 @@ struct nvdimm { struct resource *flush_wpq; }; +/** + * struct blk_alloc_info - tracking info for BLK dpa scanning + * @nd_mapping: blk region mapping boundaries + * @available: decremented in alias_dpa_busy as aliased PMEM is scanned + * @busy: decremented in blk_dpa_busy to account for ranges already + * handled by alias_dpa_busy + * @res: alias_dpa_busy interprets this a free space range that needs to + * be truncated to the valid BLK allocation starting DPA, blk_dpa_busy + * treats it as a busy range that needs the aliased PMEM ranges + * truncated. + */ +struct blk_alloc_info { + struct nd_mapping *nd_mapping; + resource_size_t available, busy; + struct resource *res; +}; + bool is_nvdimm(struct device *dev); bool is_nd_pmem(struct device *dev); bool is_nd_blk(struct device *dev); @@ -54,7 +71,7 @@ void nvdimm_devs_exit(void); void nd_region_devs_exit(void); void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev); struct nd_region; -void nd_region_create_blk_seed(struct nd_region *nd_region); +void nd_region_create_ns_seed(struct nd_region *nd_region); void nd_region_create_btt_seed(struct nd_region *nd_region); void nd_region_create_pfn_seed(struct nd_region *nd_region); void nd_region_create_dax_seed(struct nd_region *nd_region); @@ -73,13 +90,14 @@ bool nd_is_uuid_unique(struct device *dev, u8 *uuid); struct nd_region; struct nvdimm_drvdata; struct nd_mapping; +void nd_mapping_free_labels(struct nd_mapping *nd_mapping); resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, struct nd_mapping *nd_mapping, resource_size_t *overlap); -resource_size_t nd_blk_available_dpa(struct nd_mapping *nd_mapping); +resource_size_t nd_blk_available_dpa(struct nd_region *nd_region); resource_size_t nd_region_available_dpa(struct nd_region *nd_region); resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd, struct nd_label_id *label_id); -struct nd_mapping; +int alias_dpa_busy(struct device *dev, void *data); struct resource *nsblk_add_resource(struct nd_region *nd_region, struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk, resource_size_t start); diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 0b78a8211f4a..d3b2fca8deec 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -101,9 +101,6 @@ static inline struct nd_namespace_index *to_next_namespace_index( (unsigned long long) (res ? resource_size(res) : 0), \ (unsigned long long) (res ? res->start : 0), ##arg) -#define for_each_label(l, label, labels) \ - for (l = 0; (label = labels ? labels[l] : NULL); l++) - #define for_each_dpa_resource(ndd, res) \ for (res = (ndd)->dpa.child; res; res = res->sibling) @@ -116,6 +113,31 @@ struct nd_percpu_lane { spinlock_t lock; }; +struct nd_label_ent { + struct list_head list; + struct nd_namespace_label *label; +}; + +enum nd_mapping_lock_class { + ND_MAPPING_CLASS0, + ND_MAPPING_UUID_SCAN, +}; + +struct nd_mapping { + struct nvdimm *nvdimm; + u64 start; + u64 size; + struct list_head labels; + struct mutex lock; + /* + * @ndd is for private use at region enable / disable time for + * get_ndd() + put_ndd(), all other nd_mapping to ndd + * conversions use to_ndd() which respects enabled state of the + * nvdimm. + */ + struct nvdimm_drvdata *ndd; +}; + struct nd_region { struct device dev; struct ida ns_ida; @@ -209,6 +231,7 @@ void nvdimm_exit(void); void nd_region_exit(void); struct nvdimm; struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping); +int nvdimm_check_config_data(struct device *dev); int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd); int nvdimm_init_config_data(struct nvdimm_drvdata *ndd); int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset, diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 571a6c7ee2fc..42b3a8217073 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -66,13 +66,32 @@ static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset, invalidate_pmem(pmem->virt_addr + offset, len); } +static void write_pmem(void *pmem_addr, struct page *page, + unsigned int off, unsigned int len) +{ + void *mem = kmap_atomic(page); + + memcpy_to_pmem(pmem_addr, mem + off, len); + kunmap_atomic(mem); +} + +static int read_pmem(struct page *page, unsigned int off, + void *pmem_addr, unsigned int len) +{ + int rc; + void *mem = kmap_atomic(page); + + rc = memcpy_from_pmem(mem + off, pmem_addr, len); + kunmap_atomic(mem); + return rc; +} + static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, unsigned int len, unsigned int off, bool is_write, sector_t sector) { int rc = 0; bool bad_pmem = false; - void *mem = kmap_atomic(page); phys_addr_t pmem_off = sector * 512 + pmem->data_offset; void *pmem_addr = pmem->virt_addr + pmem_off; @@ -83,7 +102,7 @@ static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, if (unlikely(bad_pmem)) rc = -EIO; else { - rc = memcpy_from_pmem(mem + off, pmem_addr, len); + rc = read_pmem(page, off, pmem_addr, len); flush_dcache_page(page); } } else { @@ -102,14 +121,13 @@ static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, * after clear poison. */ flush_dcache_page(page); - memcpy_to_pmem(pmem_addr, mem + off, len); + write_pmem(pmem_addr, page, off, len); if (unlikely(bad_pmem)) { pmem_clear_poison(pmem, pmem_off, len); - memcpy_to_pmem(pmem_addr, mem + off, len); + write_pmem(pmem_addr, page, off, len); } } - kunmap_atomic(mem); return rc; } diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 4c0ac4abb629..6af5e629140c 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -70,7 +70,7 @@ static int nvdimm_map_flush(struct device *dev, struct nvdimm *nvdimm, int dimm, int nd_region_activate(struct nd_region *nd_region) { - int i, num_flush = 0; + int i, j, num_flush = 0; struct nd_region_data *ndrd; struct device *dev = &nd_region->dev; size_t flush_data_size = sizeof(void *); @@ -107,6 +107,21 @@ int nd_region_activate(struct nd_region *nd_region) return rc; } + /* + * Clear out entries that are duplicates. This should prevent the + * extra flushings. + */ + for (i = 0; i < nd_region->ndr_mappings - 1; i++) { + /* ignore if NULL already */ + if (!ndrd_get_flush_wpq(ndrd, i, 0)) + continue; + + for (j = i + 1; j < nd_region->ndr_mappings; j++) + if (ndrd_get_flush_wpq(ndrd, i, 0) == + ndrd_get_flush_wpq(ndrd, j, 0)) + ndrd_set_flush_wpq(ndrd, j, 0, NULL); + } + return 0; } @@ -298,9 +313,8 @@ resource_size_t nd_region_available_dpa(struct nd_region *nd_region) blk_max_overlap = overlap; goto retry; } - } else if (is_nd_blk(&nd_region->dev)) { - available += nd_blk_available_dpa(nd_mapping); - } + } else if (is_nd_blk(&nd_region->dev)) + available += nd_blk_available_dpa(nd_region); } return available; @@ -491,6 +505,17 @@ u64 nd_region_interleave_set_cookie(struct nd_region *nd_region) return 0; } +void nd_mapping_free_labels(struct nd_mapping *nd_mapping) +{ + struct nd_label_ent *label_ent, *e; + + WARN_ON(!mutex_is_locked(&nd_mapping->lock)); + list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) { + list_del(&label_ent->list); + kfree(label_ent); + } +} + /* * Upon successful probe/remove, take/release a reference on the * associated interleave set (if present), and plant new btt + namespace @@ -511,8 +536,10 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, struct nvdimm_drvdata *ndd = nd_mapping->ndd; struct nvdimm *nvdimm = nd_mapping->nvdimm; - kfree(nd_mapping->labels); - nd_mapping->labels = NULL; + mutex_lock(&nd_mapping->lock); + nd_mapping_free_labels(nd_mapping); + mutex_unlock(&nd_mapping->lock); + put_ndd(ndd); nd_mapping->ndd = NULL; if (ndd) @@ -522,11 +549,12 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, if (is_nd_pmem(dev)) return; } - if (dev->parent && is_nd_blk(dev->parent) && probe) { + if (dev->parent && (is_nd_blk(dev->parent) || is_nd_pmem(dev->parent)) + && probe) { nd_region = to_nd_region(dev->parent); nvdimm_bus_lock(dev); if (nd_region->ns_seed == dev) - nd_region_create_blk_seed(nd_region); + nd_region_create_ns_seed(nd_region); nvdimm_bus_unlock(dev); } if (is_nd_btt(dev) && probe) { @@ -536,23 +564,30 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, nvdimm_bus_lock(dev); if (nd_region->btt_seed == dev) nd_region_create_btt_seed(nd_region); - if (nd_region->ns_seed == &nd_btt->ndns->dev && - is_nd_blk(dev->parent)) - nd_region_create_blk_seed(nd_region); + if (nd_region->ns_seed == &nd_btt->ndns->dev) + nd_region_create_ns_seed(nd_region); nvdimm_bus_unlock(dev); } if (is_nd_pfn(dev) && probe) { + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + nd_region = to_nd_region(dev->parent); nvdimm_bus_lock(dev); if (nd_region->pfn_seed == dev) nd_region_create_pfn_seed(nd_region); + if (nd_region->ns_seed == &nd_pfn->ndns->dev) + nd_region_create_ns_seed(nd_region); nvdimm_bus_unlock(dev); } if (is_nd_dax(dev) && probe) { + struct nd_dax *nd_dax = to_nd_dax(dev); + nd_region = to_nd_region(dev->parent); nvdimm_bus_lock(dev); if (nd_region->dax_seed == dev) nd_region_create_dax_seed(nd_region); + if (nd_region->ns_seed == &nd_dax->nd_pfn.ndns->dev) + nd_region_create_ns_seed(nd_region); nvdimm_bus_unlock(dev); } } @@ -759,10 +794,10 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, int ro = 0; for (i = 0; i < ndr_desc->num_mappings; i++) { - struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i]; - struct nvdimm *nvdimm = nd_mapping->nvdimm; + struct nd_mapping_desc *mapping = &ndr_desc->mapping[i]; + struct nvdimm *nvdimm = mapping->nvdimm; - if ((nd_mapping->start | nd_mapping->size) % SZ_4K) { + if ((mapping->start | mapping->size) % SZ_4K) { dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not 4K aligned\n", caller, dev_name(&nvdimm->dev), i); @@ -813,11 +848,15 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, ndl->count = 0; } - memcpy(nd_region->mapping, ndr_desc->nd_mapping, - sizeof(struct nd_mapping) * ndr_desc->num_mappings); for (i = 0; i < ndr_desc->num_mappings; i++) { - struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i]; - struct nvdimm *nvdimm = nd_mapping->nvdimm; + struct nd_mapping_desc *mapping = &ndr_desc->mapping[i]; + struct nvdimm *nvdimm = mapping->nvdimm; + + nd_region->mapping[i].nvdimm = nvdimm; + nd_region->mapping[i].start = mapping->start; + nd_region->mapping[i].size = mapping->size; + INIT_LIST_HEAD(&nd_region->mapping[i].labels); + mutex_init(&nd_region->mapping[i].lock); get_device(&nvdimm->dev); } diff --git a/fs/char_dev.c b/fs/char_dev.c index 6edd825231c5..44a240c4bb65 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -406,6 +406,7 @@ void cd_forget(struct inode *inode) spin_lock(&cdev_lock); list_del_init(&inode->i_devices); inode->i_cdev = NULL; + inode->i_mapping = &inode->i_data; spin_unlock(&cdev_lock); } diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index b519e137b9b7..f4947fda11e7 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -50,23 +50,6 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf, unsigned int buf_len, int *cmd_rc); -struct nd_namespace_label; -struct nvdimm_drvdata; - -struct nd_mapping { - struct nvdimm *nvdimm; - struct nd_namespace_label **labels; - u64 start; - u64 size; - /* - * @ndd is for private use at region enable / disable time for - * get_ndd() + put_ndd(), all other nd_mapping to ndd - * conversions use to_ndd() which respects enabled state of the - * nvdimm. - */ - struct nvdimm_drvdata *ndd; -}; - struct nvdimm_bus_descriptor { const struct attribute_group **attr_groups; unsigned long cmd_mask; @@ -89,9 +72,15 @@ struct nd_interleave_set { u64 cookie; }; +struct nd_mapping_desc { + struct nvdimm *nvdimm; + u64 start; + u64 size; +}; + struct nd_region_desc { struct resource *res; - struct nd_mapping *nd_mapping; + struct nd_mapping_desc *mapping; u16 num_mappings; const struct attribute_group **attr_groups; struct nd_interleave_set *nd_set; @@ -129,6 +118,8 @@ static inline struct nd_blk_region_desc *to_blk_region_desc( } int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length); +void nvdimm_clear_from_poison_list(struct nvdimm_bus *nvdimm_bus, + phys_addr_t start, unsigned int len); struct nvdimm_bus *nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc); void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); @@ -139,6 +130,7 @@ struct nd_blk_region *to_nd_blk_region(struct device *dev); struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); struct device *to_nvdimm_bus_dev(struct nvdimm_bus *nvdimm_bus); const char *nvdimm_name(struct nvdimm *nvdimm); +struct kobject *nvdimm_kobj(struct nvdimm *nvdimm); unsigned long nvdimm_cmd_mask(struct nvdimm *nvdimm); void *nvdimm_provider_data(struct nvdimm *nvdimm); struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, diff --git a/include/linux/nd.h b/include/linux/nd.h index f1ea426d6a5e..fa66aeed441a 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -77,11 +77,13 @@ struct nd_namespace_io { * @nsio: device and system physical address range to drive * @alt_name: namespace name supplied in the dimm label * @uuid: namespace name supplied in the dimm label + * @id: ida allocated id */ struct nd_namespace_pmem { struct nd_namespace_io nsio; char *alt_name; u8 *uuid; + int id; }; /** @@ -105,19 +107,19 @@ struct nd_namespace_blk { struct resource **res; }; -static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev) +static inline struct nd_namespace_io *to_nd_namespace_io(const struct device *dev) { return container_of(dev, struct nd_namespace_io, common.dev); } -static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev) +static inline struct nd_namespace_pmem *to_nd_namespace_pmem(const struct device *dev) { struct nd_namespace_io *nsio = to_nd_namespace_io(dev); return container_of(nsio, struct nd_namespace_pmem, nsio); } -static inline struct nd_namespace_blk *to_nd_namespace_blk(struct device *dev) +static inline struct nd_namespace_blk *to_nd_namespace_blk(const struct device *dev) { return container_of(dev, struct nd_namespace_blk, common.dev); } diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index e398beac67b8..9bd559472c92 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -65,6 +65,7 @@ #define V9FS_MAGIC 0x01021997 #define BDEVFS_MAGIC 0x62646576 +#define DAXFS_MAGIC 0x64646178 #define BINFMTFS_MAGIC 0x42494e4d #define DEVPTS_SUPER_MAGIC 0x1cd1 #define FUTEXFS_SUPER_MAGIC 0xBAD1DEA diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index ba5a8c79652a..ede5c6a62164 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -21,14 +21,16 @@ struct nd_cmd_smart { } __packed; #define ND_SMART_HEALTH_VALID (1 << 0) -#define ND_SMART_TEMP_VALID (1 << 1) -#define ND_SMART_SPARES_VALID (1 << 2) -#define ND_SMART_ALARM_VALID (1 << 3) -#define ND_SMART_USED_VALID (1 << 4) -#define ND_SMART_SHUTDOWN_VALID (1 << 5) -#define ND_SMART_VENDOR_VALID (1 << 6) -#define ND_SMART_TEMP_TRIP (1 << 0) -#define ND_SMART_SPARE_TRIP (1 << 1) +#define ND_SMART_SPARES_VALID (1 << 1) +#define ND_SMART_USED_VALID (1 << 2) +#define ND_SMART_TEMP_VALID (1 << 3) +#define ND_SMART_CTEMP_VALID (1 << 4) +#define ND_SMART_ALARM_VALID (1 << 9) +#define ND_SMART_SHUTDOWN_VALID (1 << 10) +#define ND_SMART_VENDOR_VALID (1 << 11) +#define ND_SMART_SPARE_TRIP (1 << 0) +#define ND_SMART_TEMP_TRIP (1 << 1) +#define ND_SMART_CTEMP_TRIP (1 << 2) #define ND_SMART_NON_CRITICAL_HEALTH (1 << 0) #define ND_SMART_CRITICAL_HEALTH (1 << 1) #define ND_SMART_FATAL_HEALTH (1 << 2) @@ -37,14 +39,15 @@ struct nd_smart_payload { __u32 flags; __u8 reserved0[4]; __u8 health; - __u16 temperature; __u8 spares; - __u8 alarm_flags; __u8 life_used; + __u8 alarm_flags; + __u16 temperature; + __u16 ctrl_temperature; + __u8 reserved1[15]; __u8 shutdown_state; - __u8 reserved1; __u32 vendor_size; - __u8 vendor_data[108]; + __u8 vendor_data[92]; } __packed; struct nd_cmd_smart_threshold { @@ -53,7 +56,8 @@ struct nd_cmd_smart_threshold { } __packed; struct nd_smart_threshold_payload { - __u16 alarm_control; + __u8 alarm_control; + __u8 reserved0; __u16 temperature; __u8 spares; __u8 reserved[3]; diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild index ad6dd0543019..582db95127ed 100644 --- a/tools/testing/nvdimm/Kbuild +++ b/tools/testing/nvdimm/Kbuild @@ -13,6 +13,7 @@ ldflags-y += --wrap=__release_region ldflags-y += --wrap=devm_memremap_pages ldflags-y += --wrap=insert_resource ldflags-y += --wrap=remove_resource +ldflags-y += --wrap=acpi_evaluate_object DRIVERS := ../../../drivers NVDIMM_SRC := $(DRIVERS)/nvdimm diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index c29f8dca9e67..3ccef732fce9 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include "nfit_test.h" @@ -73,7 +74,7 @@ void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size, if (nfit_res) return (void __iomem *) nfit_res->buf + offset - - nfit_res->res->start; + - nfit_res->res.start; return fallback_fn(offset, size); } @@ -84,7 +85,7 @@ void __iomem *__wrap_devm_ioremap_nocache(struct device *dev, if (nfit_res) return (void __iomem *) nfit_res->buf + offset - - nfit_res->res->start; + - nfit_res->res.start; return devm_ioremap_nocache(dev, offset, size); } EXPORT_SYMBOL(__wrap_devm_ioremap_nocache); @@ -95,7 +96,7 @@ void *__wrap_devm_memremap(struct device *dev, resource_size_t offset, struct nfit_test_resource *nfit_res = get_nfit_res(offset); if (nfit_res) - return nfit_res->buf + offset - nfit_res->res->start; + return nfit_res->buf + offset - nfit_res->res.start; return devm_memremap(dev, offset, size, flags); } EXPORT_SYMBOL(__wrap_devm_memremap); @@ -107,7 +108,7 @@ void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res, struct nfit_test_resource *nfit_res = get_nfit_res(offset); if (nfit_res) - return nfit_res->buf + offset - nfit_res->res->start; + return nfit_res->buf + offset - nfit_res->res.start; return devm_memremap_pages(dev, res, ref, altmap); } EXPORT_SYMBOL(__wrap_devm_memremap_pages); @@ -128,7 +129,7 @@ void *__wrap_memremap(resource_size_t offset, size_t size, struct nfit_test_resource *nfit_res = get_nfit_res(offset); if (nfit_res) - return nfit_res->buf + offset - nfit_res->res->start; + return nfit_res->buf + offset - nfit_res->res.start; return memremap(offset, size, flags); } EXPORT_SYMBOL(__wrap_memremap); @@ -174,6 +175,63 @@ void __wrap_memunmap(void *addr) } EXPORT_SYMBOL(__wrap_memunmap); +static bool nfit_test_release_region(struct device *dev, + struct resource *parent, resource_size_t start, + resource_size_t n); + +static void nfit_devres_release(struct device *dev, void *data) +{ + struct resource *res = *((struct resource **) data); + + WARN_ON(!nfit_test_release_region(NULL, &iomem_resource, res->start, + resource_size(res))); +} + +static int match(struct device *dev, void *__res, void *match_data) +{ + struct resource *res = *((struct resource **) __res); + resource_size_t start = *((resource_size_t *) match_data); + + return res->start == start; +} + +static bool nfit_test_release_region(struct device *dev, + struct resource *parent, resource_size_t start, + resource_size_t n) +{ + if (parent == &iomem_resource) { + struct nfit_test_resource *nfit_res = get_nfit_res(start); + + if (nfit_res) { + struct nfit_test_request *req; + struct resource *res = NULL; + + if (dev) { + devres_release(dev, nfit_devres_release, match, + &start); + return true; + } + + spin_lock(&nfit_res->lock); + list_for_each_entry(req, &nfit_res->requests, list) + if (req->res.start == start) { + res = &req->res; + list_del(&req->list); + break; + } + spin_unlock(&nfit_res->lock); + + WARN(!res || resource_size(res) != n, + "%s: start: %llx n: %llx mismatch: %pr\n", + __func__, start, n, res); + if (res) + kfree(req); + return true; + } + } + return false; +} + static struct resource *nfit_test_request_region(struct device *dev, struct resource *parent, resource_size_t start, resource_size_t n, const char *name, int flags) @@ -183,21 +241,57 @@ static struct resource *nfit_test_request_region(struct device *dev, if (parent == &iomem_resource) { nfit_res = get_nfit_res(start); if (nfit_res) { - struct resource *res = nfit_res->res + 1; + struct nfit_test_request *req; + struct resource *res = NULL; - if (start + n > nfit_res->res->start - + resource_size(nfit_res->res)) { + if (start + n > nfit_res->res.start + + resource_size(&nfit_res->res)) { pr_debug("%s: start: %llx n: %llx overflow: %pr\n", __func__, start, n, - nfit_res->res); + &nfit_res->res); return NULL; } + spin_lock(&nfit_res->lock); + list_for_each_entry(req, &nfit_res->requests, list) + if (start == req->res.start) { + res = &req->res; + break; + } + spin_unlock(&nfit_res->lock); + + if (res) { + WARN(1, "%pr already busy\n", res); + return NULL; + } + + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return NULL; + INIT_LIST_HEAD(&req->list); + res = &req->res; + res->start = start; res->end = start + n - 1; res->name = name; res->flags = resource_type(parent); res->flags |= IORESOURCE_BUSY | flags; + spin_lock(&nfit_res->lock); + list_add(&req->list, &nfit_res->requests); + spin_unlock(&nfit_res->lock); + + if (dev) { + struct resource **d; + + d = devres_alloc(nfit_devres_release, + sizeof(struct resource *), + GFP_KERNEL); + if (!d) + return NULL; + *d = res; + devres_add(dev, d); + } + pr_debug("%s: %pr\n", __func__, res); return res; } @@ -241,29 +335,10 @@ struct resource *__wrap___devm_request_region(struct device *dev, } EXPORT_SYMBOL(__wrap___devm_request_region); -static bool nfit_test_release_region(struct resource *parent, - resource_size_t start, resource_size_t n) -{ - if (parent == &iomem_resource) { - struct nfit_test_resource *nfit_res = get_nfit_res(start); - if (nfit_res) { - struct resource *res = nfit_res->res + 1; - - if (start != res->start || resource_size(res) != n) - pr_info("%s: start: %llx n: %llx mismatch: %pr\n", - __func__, start, n, res); - else - memset(res, 0, sizeof(*res)); - return true; - } - } - return false; -} - void __wrap___release_region(struct resource *parent, resource_size_t start, resource_size_t n) { - if (!nfit_test_release_region(parent, start, n)) + if (!nfit_test_release_region(NULL, parent, start, n)) __release_region(parent, start, n); } EXPORT_SYMBOL(__wrap___release_region); @@ -271,9 +346,25 @@ EXPORT_SYMBOL(__wrap___release_region); void __wrap___devm_release_region(struct device *dev, struct resource *parent, resource_size_t start, resource_size_t n) { - if (!nfit_test_release_region(parent, start, n)) + if (!nfit_test_release_region(dev, parent, start, n)) __devm_release_region(dev, parent, start, n); } EXPORT_SYMBOL(__wrap___devm_release_region); +acpi_status __wrap_acpi_evaluate_object(acpi_handle handle, acpi_string path, + struct acpi_object_list *p, struct acpi_buffer *buf) +{ + struct nfit_test_resource *nfit_res = get_nfit_res((long) handle); + union acpi_object **obj; + + if (!nfit_res || strcmp(path, "_FIT") || !buf) + return acpi_evaluate_object(handle, path, p, buf); + + obj = nfit_res->buf; + buf->length = sizeof(union acpi_object); + buf->pointer = *obj; + return AE_OK; +} +EXPORT_SYMBOL(__wrap_acpi_evaluate_object); + MODULE_LICENSE("GPL v2"); diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index f64c57bf1d4b..c9a6458cb63e 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -132,6 +132,8 @@ static u32 handle[NUM_DCR] = { [4] = NFIT_DIMM_HANDLE(0, 1, 0, 0, 0), }; +static unsigned long dimm_fail_cmd_flags[NUM_DCR]; + struct nfit_test { struct acpi_nfit_desc acpi_desc; struct platform_device pdev; @@ -154,11 +156,14 @@ struct nfit_test { int (*alloc)(struct nfit_test *t); void (*setup)(struct nfit_test *t); int setup_hotplug; + union acpi_object **_fit; + dma_addr_t _fit_dma; struct ars_state { struct nd_cmd_ars_status *ars_status; unsigned long deadline; spinlock_t lock; } ars_state; + struct device *dimm_dev[NUM_DCR]; }; static struct nfit_test *to_nfit_test(struct device *dev) @@ -411,6 +416,9 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc, if (i >= ARRAY_SIZE(handle)) return -ENXIO; + if ((1 << func) & dimm_fail_cmd_flags[i]) + return -EIO; + switch (func) { case ND_CMD_GET_CONFIG_SIZE: rc = nfit_test_cmd_get_config_size(buf, buf_len); @@ -428,6 +436,9 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc, break; case ND_CMD_SMART_THRESHOLD: rc = nfit_test_cmd_smart_threshold(buf, buf_len); + device_lock(&t->pdev.dev); + __acpi_nvdimm_notify(t->dimm_dev[i], 0x81); + device_unlock(&t->pdev.dev); break; default: return -ENOTTY; @@ -467,14 +478,12 @@ static struct nfit_test *instances[NUM_NFITS]; static void release_nfit_res(void *data) { struct nfit_test_resource *nfit_res = data; - struct resource *res = nfit_res->res; spin_lock(&nfit_test_lock); list_del(&nfit_res->list); spin_unlock(&nfit_test_lock); vfree(nfit_res->buf); - kfree(res); kfree(nfit_res); } @@ -482,12 +491,11 @@ static void *__test_alloc(struct nfit_test *t, size_t size, dma_addr_t *dma, void *buf) { struct device *dev = &t->pdev.dev; - struct resource *res = kzalloc(sizeof(*res) * 2, GFP_KERNEL); struct nfit_test_resource *nfit_res = kzalloc(sizeof(*nfit_res), GFP_KERNEL); int rc; - if (!res || !buf || !nfit_res) + if (!buf || !nfit_res) goto err; rc = devm_add_action(dev, release_nfit_res, nfit_res); if (rc) @@ -496,10 +504,11 @@ static void *__test_alloc(struct nfit_test *t, size_t size, dma_addr_t *dma, memset(buf, 0, size); nfit_res->dev = dev; nfit_res->buf = buf; - nfit_res->res = res; - res->start = *dma; - res->end = *dma + size - 1; - res->name = "NFIT"; + nfit_res->res.start = *dma; + nfit_res->res.end = *dma + size - 1; + nfit_res->res.name = "NFIT"; + spin_lock_init(&nfit_res->lock); + INIT_LIST_HEAD(&nfit_res->requests); spin_lock(&nfit_test_lock); list_add(&nfit_res->list, &t->resources); spin_unlock(&nfit_test_lock); @@ -508,7 +517,6 @@ static void *__test_alloc(struct nfit_test *t, size_t size, dma_addr_t *dma, err: if (buf) vfree(buf); - kfree(res); kfree(nfit_res); return NULL; } @@ -533,13 +541,13 @@ static struct nfit_test_resource *nfit_test_lookup(resource_size_t addr) continue; spin_lock(&nfit_test_lock); list_for_each_entry(n, &t->resources, list) { - if (addr >= n->res->start && (addr < n->res->start - + resource_size(n->res))) { + if (addr >= n->res.start && (addr < n->res.start + + resource_size(&n->res))) { nfit_res = n; break; } else if (addr >= (unsigned long) n->buf && (addr < (unsigned long) n->buf - + resource_size(n->res))) { + + resource_size(&n->res))) { nfit_res = n; break; } @@ -564,6 +572,86 @@ static int ars_state_init(struct device *dev, struct ars_state *ars_state) return 0; } +static void put_dimms(void *data) +{ + struct device **dimm_dev = data; + int i; + + for (i = 0; i < NUM_DCR; i++) + if (dimm_dev[i]) + device_unregister(dimm_dev[i]); +} + +static struct class *nfit_test_dimm; + +static int dimm_name_to_id(struct device *dev) +{ + int dimm; + + if (sscanf(dev_name(dev), "test_dimm%d", &dimm) != 1 + || dimm >= NUM_DCR || dimm < 0) + return -ENXIO; + return dimm; +} + + +static ssize_t handle_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + int dimm = dimm_name_to_id(dev); + + if (dimm < 0) + return dimm; + + return sprintf(buf, "%#x", handle[dimm]); +} +DEVICE_ATTR_RO(handle); + +static ssize_t fail_cmd_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + int dimm = dimm_name_to_id(dev); + + if (dimm < 0) + return dimm; + + return sprintf(buf, "%#lx\n", dimm_fail_cmd_flags[dimm]); +} + +static ssize_t fail_cmd_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t size) +{ + int dimm = dimm_name_to_id(dev); + unsigned long val; + ssize_t rc; + + if (dimm < 0) + return dimm; + + rc = kstrtol(buf, 0, &val); + if (rc) + return rc; + + dimm_fail_cmd_flags[dimm] = val; + return size; +} +static DEVICE_ATTR_RW(fail_cmd); + +static struct attribute *nfit_test_dimm_attributes[] = { + &dev_attr_fail_cmd.attr, + &dev_attr_handle.attr, + NULL, +}; + +static struct attribute_group nfit_test_dimm_attribute_group = { + .attrs = nfit_test_dimm_attributes, +}; + +static const struct attribute_group *nfit_test_dimm_attribute_groups[] = { + &nfit_test_dimm_attribute_group, + NULL, +}; + static int nfit_test0_alloc(struct nfit_test *t) { size_t nfit_size = sizeof(struct acpi_nfit_system_address) * NUM_SPA @@ -616,6 +704,21 @@ static int nfit_test0_alloc(struct nfit_test *t) return -ENOMEM; } + t->_fit = test_alloc(t, sizeof(union acpi_object **), &t->_fit_dma); + if (!t->_fit) + return -ENOMEM; + + if (devm_add_action_or_reset(&t->pdev.dev, put_dimms, t->dimm_dev)) + return -ENOMEM; + for (i = 0; i < NUM_DCR; i++) { + t->dimm_dev[i] = device_create_with_groups(nfit_test_dimm, + &t->pdev.dev, 0, NULL, + nfit_test_dimm_attribute_groups, + "test_dimm%d", i); + if (!t->dimm_dev[i]) + return -ENOMEM; + } + return ars_state_init(&t->pdev.dev, &t->ars_state); } @@ -1409,6 +1512,8 @@ static int nfit_test_probe(struct platform_device *pdev) struct acpi_nfit_desc *acpi_desc; struct device *dev = &pdev->dev; struct nfit_test *nfit_test; + struct nfit_mem *nfit_mem; + union acpi_object *obj; int rc; nfit_test = to_nfit_test(&pdev->dev); @@ -1476,14 +1581,30 @@ static int nfit_test_probe(struct platform_device *pdev) if (nfit_test->setup != nfit_test0_setup) return 0; - flush_work(&acpi_desc->work); nfit_test->setup_hotplug = 1; nfit_test->setup(nfit_test); - rc = acpi_nfit_init(acpi_desc, nfit_test->nfit_buf, - nfit_test->nfit_size); - if (rc) - return rc; + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + obj->type = ACPI_TYPE_BUFFER; + obj->buffer.length = nfit_test->nfit_size; + obj->buffer.pointer = nfit_test->nfit_buf; + *(nfit_test->_fit) = obj; + __acpi_nfit_notify(&pdev->dev, nfit_test, 0x80); + + /* associate dimm devices with nfit_mem data for notification testing */ + mutex_lock(&acpi_desc->init_mutex); + list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { + u32 nfit_handle = __to_nfit_memdev(nfit_mem)->device_handle; + int i; + + for (i = 0; i < NUM_DCR; i++) + if (nfit_handle == handle[i]) + dev_set_drvdata(nfit_test->dimm_dev[i], + nfit_mem); + } + mutex_unlock(&acpi_desc->init_mutex); return 0; } @@ -1518,6 +1639,10 @@ static __init int nfit_test_init(void) { int rc, i; + nfit_test_dimm = class_create(THIS_MODULE, "nfit_test_dimm"); + if (IS_ERR(nfit_test_dimm)) + return PTR_ERR(nfit_test_dimm); + nfit_test_setup(nfit_test_lookup); for (i = 0; i < NUM_NFITS; i++) { @@ -1584,6 +1709,7 @@ static __exit void nfit_test_exit(void) for (i = 0; i < NUM_NFITS; i++) platform_device_unregister(&instances[i]->pdev); nfit_test_teardown(); + class_destroy(nfit_test_dimm); } module_init(nfit_test_init); diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h index 9f18e2a4a862..c281dd2e5e2d 100644 --- a/tools/testing/nvdimm/test/nfit_test.h +++ b/tools/testing/nvdimm/test/nfit_test.h @@ -13,11 +13,21 @@ #ifndef __NFIT_TEST_H__ #define __NFIT_TEST_H__ #include +#include +#include + +struct nfit_test_request { + struct list_head list; + struct resource res; +}; struct nfit_test_resource { + struct list_head requests; struct list_head list; - struct resource *res; + struct resource res; struct device *dev; + spinlock_t lock; + int req_count; void *buf; };