From 02486c2905a7caa50b0f508a86e03d12d8d24ac4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 11 Aug 2016 17:36:20 -0700 Subject: [PATCH 01/39] libnvdimm: fix SMART Health DSM payload definition "NVDIMM DSM Interface Example" v1.2 made an incompatible change to the layout of function 1 "SMART and Health Info". While the kernel does not directly consume this payload, it does define it in ndctl.h, which userspace utilities consume. Reported-by: Brian Boylston Signed-off-by: Dan Williams --- include/uapi/linux/ndctl.h | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index ba5a8c79652a..ede5c6a62164 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -21,14 +21,16 @@ struct nd_cmd_smart { } __packed; #define ND_SMART_HEALTH_VALID (1 << 0) -#define ND_SMART_TEMP_VALID (1 << 1) -#define ND_SMART_SPARES_VALID (1 << 2) -#define ND_SMART_ALARM_VALID (1 << 3) -#define ND_SMART_USED_VALID (1 << 4) -#define ND_SMART_SHUTDOWN_VALID (1 << 5) -#define ND_SMART_VENDOR_VALID (1 << 6) -#define ND_SMART_TEMP_TRIP (1 << 0) -#define ND_SMART_SPARE_TRIP (1 << 1) +#define ND_SMART_SPARES_VALID (1 << 1) +#define ND_SMART_USED_VALID (1 << 2) +#define ND_SMART_TEMP_VALID (1 << 3) +#define ND_SMART_CTEMP_VALID (1 << 4) +#define ND_SMART_ALARM_VALID (1 << 9) +#define ND_SMART_SHUTDOWN_VALID (1 << 10) +#define ND_SMART_VENDOR_VALID (1 << 11) +#define ND_SMART_SPARE_TRIP (1 << 0) +#define ND_SMART_TEMP_TRIP (1 << 1) +#define ND_SMART_CTEMP_TRIP (1 << 2) #define ND_SMART_NON_CRITICAL_HEALTH (1 << 0) #define ND_SMART_CRITICAL_HEALTH (1 << 1) #define ND_SMART_FATAL_HEALTH (1 << 2) @@ -37,14 +39,15 @@ struct nd_smart_payload { __u32 flags; __u8 reserved0[4]; __u8 health; - __u16 temperature; __u8 spares; - __u8 alarm_flags; __u8 life_used; + __u8 alarm_flags; + __u16 temperature; + __u16 ctrl_temperature; + __u8 reserved1[15]; __u8 shutdown_state; - __u8 reserved1; __u32 vendor_size; - __u8 vendor_data[108]; + __u8 vendor_data[92]; } __packed; struct nd_cmd_smart_threshold { @@ -53,7 +56,8 @@ struct nd_smart_threshold_payload { - __u16 alarm_control; + __u8 alarm_control; + __u8 reserved0; __u16 temperature; __u8 spares; __u8 reserved[3]; From c09f12186d6b03b798832d95289af76495990192 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Fri, 19 Aug 2016 14:40:58 -0600 Subject: [PATCH 02/39] acpi, nfit: check for the correct event code in notifications Commit 209851649dc4 "acpi: nfit: Add support for hot-add" added support for _FIT notifications, but it neglected to verify that the notification event code matches the one in the ACPI spec for "NFIT Update". Currently there is only one code in the spec, but once additional codes are added, older kernels (without this fix) will misbehave by assuming all event notifications are for an NFIT Update.
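For illustration, the check amounts to an early return at the top of the root-device notify handler (a condensed sketch of the hunk below; NFIT_NOTIFY_UPDATE is the 0x80 "NFIT Update" code this patch adds to nfit.h):

        static void acpi_nfit_notify(struct acpi_device *adev, u32 event)
        {
                /* only the "NFIT Update" (0x80) notification is defined today */
                if (event != NFIT_NOTIFY_UPDATE)
                        return;
                /* ... continue with the existing _FIT re-evaluation path ... */
        }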
Fixes: 209851649dc4 ("acpi: nfit: Add support for hot-add") Cc: Dan Williams Reported-by: Linda Knippers Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/acpi/nfit/core.c | 3 +++ drivers/acpi/nfit/nfit.h | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 80cc7c089a15..4a363bed89b3 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -2681,6 +2681,9 @@ static void acpi_nfit_notify(struct acpi_device *adev, u32 event) dev_dbg(dev, "%s: event: %d\n", __func__, event); + if (event != NFIT_NOTIFY_UPDATE) + return; + device_lock(dev); if (!dev->driver) { /* dev->driver may be null if we're being removed */ diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h index e894ded24d99..51d23f130d86 100644 --- a/drivers/acpi/nfit/nfit.h +++ b/drivers/acpi/nfit/nfit.h @@ -78,6 +78,10 @@ enum { NFIT_ARS_TIMEOUT = 90, }; +enum nfit_root_notifiers { + NFIT_NOTIFY_UPDATE = 0x80, +}; + struct nfit_spa { struct list_head list; struct nd_region *nd_region; From c14a868a5a14f385059f012e54291de95a538a02 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Aug 2016 22:15:04 -0700 Subject: [PATCH 03/39] tools/testing/nvdimm: unit test for acpi_nfit_notify() We have had a couple of bugs in this implementation in the past, and before we add another ->notify() implementation for nvdimm devices, let's allow this routine to be exercised via nfit_test. Rewrite acpi_nfit_notify() in terms of a generic struct device and acpi_handle parameter, and then implement a mock acpi_evaluate_object() that returns a _FIT payload. Cc: Vishal Verma Reviewed-by: Vishal Verma Acked-by: Rafael J. Wysocki Signed-off-by: Dan Williams --- drivers/acpi/nfit/core.c | 25 ++++++++++++++----------- drivers/acpi/nfit/nfit.h | 1 + tools/testing/nvdimm/Kbuild | 1 + tools/testing/nvdimm/test/iomap.c | 17 +++++++++++++++++ tools/testing/nvdimm/test/nfit.c | 21 +++++++++++++++------ 5 files changed, 48 insertions(+), 17 deletions(-) diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 4a363bed89b3..8120e8218f93 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -2670,11 +2670,10 @@ static int acpi_nfit_remove(struct acpi_device *adev) return 0; } -static void acpi_nfit_notify(struct acpi_device *adev, u32 event) +void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event) { - struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(&adev->dev); + struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(dev); struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; - struct device *dev = &adev->dev; union acpi_object *obj; acpi_status status; int ret; @@ -2684,18 +2683,17 @@ static void acpi_nfit_notify(struct acpi_device *adev, u32 event) if (event != NFIT_NOTIFY_UPDATE) return; - device_lock(dev); if (!dev->driver) { /* dev->driver may be null if we're being removed */ dev_dbg(dev, "%s: no driver found for dev\n", __func__); - goto out_unlock; + return; } if (!acpi_desc) { acpi_desc = devm_kzalloc(dev, sizeof(*acpi_desc), GFP_KERNEL); if (!acpi_desc) - goto out_unlock; - acpi_nfit_desc_init(acpi_desc, &adev->dev); + return; + acpi_nfit_desc_init(acpi_desc, dev); } else { /* * Finish previous registration before considering new @@ -2705,10 +2703,10 @@ static void acpi_nfit_notify(struct acpi_device *adev, u32 event) */ flush_work(&acpi_desc->work); } /* Evaluate _FIT */ - status = acpi_evaluate_object(adev->handle, "_FIT", NULL, &buf); + status = acpi_evaluate_object(handle, "_FIT", NULL, &buf); if (ACPI_FAILURE(status)) {
dev_err(dev, "failed to evaluate _FIT\n"); - goto out_unlock; + return; } obj = buf.pointer; @@ -2720,9 +2718,14 @@ static void acpi_nfit_notify(struct acpi_device *adev, u32 event) } else dev_err(dev, "Invalid _FIT\n"); kfree(buf.pointer); +} +EXPORT_SYMBOL_GPL(__acpi_nfit_notify); - out_unlock: - device_unlock(dev); +static void acpi_nfit_notify(struct acpi_device *adev, u32 event) +{ + device_lock(&adev->dev); + __acpi_nfit_notify(&adev->dev, adev->handle, event); + device_unlock(&adev->dev); } static const struct acpi_device_id acpi_nfit_ids[] = { diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h index 51d23f130d86..52370347fb0e 100644 --- a/drivers/acpi/nfit/nfit.h +++ b/drivers/acpi/nfit/nfit.h @@ -227,5 +227,6 @@ static inline struct acpi_nfit_desc *to_acpi_desc( const u8 *to_nfit_uuid(enum nfit_uuids id); int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *nfit, acpi_size sz); +void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event); void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev); #endif /* __NFIT_H__ */ diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild index ad6dd0543019..582db95127ed 100644 --- a/tools/testing/nvdimm/Kbuild +++ b/tools/testing/nvdimm/Kbuild @@ -13,6 +13,7 @@ ldflags-y += --wrap=__release_region ldflags-y += --wrap=devm_memremap_pages ldflags-y += --wrap=insert_resource ldflags-y += --wrap=remove_resource +ldflags-y += --wrap=acpi_evaluate_object DRIVERS := ../../../drivers NVDIMM_SRC := $(DRIVERS)/nvdimm diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index c29f8dca9e67..dae5b9b6d186 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include "nfit_test.h" @@ -276,4 +277,20 @@ void __wrap___devm_release_region(struct device *dev, struct resource *parent, } EXPORT_SYMBOL(__wrap___devm_release_region); +acpi_status __wrap_acpi_evaluate_object(acpi_handle handle, acpi_string path, + struct acpi_object_list *p, struct acpi_buffer *buf) +{ + struct nfit_test_resource *nfit_res = get_nfit_res((long) handle); + union acpi_object **obj; + + if (!nfit_res || strcmp(path, "_FIT") || !buf) + return acpi_evaluate_object(handle, path, p, buf); + + obj = nfit_res->buf; + buf->length = sizeof(union acpi_object); + buf->pointer = *obj; + return AE_OK; +} +EXPORT_SYMBOL(__wrap_acpi_evaluate_object); + MODULE_LICENSE("GPL v2"); diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index dd48f421844c..8d79c75d3cae 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -154,6 +154,8 @@ struct nfit_test { int (*alloc)(struct nfit_test *t); void (*setup)(struct nfit_test *t); int setup_hotplug; + union acpi_object **_fit; + dma_addr_t _fit_dma; struct ars_state { struct nd_cmd_ars_status *ars_status; unsigned long deadline; @@ -615,6 +617,10 @@ static int nfit_test0_alloc(struct nfit_test *t) return -ENOMEM; } + t->_fit = test_alloc(t, sizeof(union acpi_object **), &t->_fit_dma); + if (!t->_fit) + return -ENOMEM; + return ars_state_init(&t->pdev.dev, &t->ars_state); } @@ -1408,6 +1414,7 @@ static int nfit_test_probe(struct platform_device *pdev) struct acpi_nfit_desc *acpi_desc; struct device *dev = &pdev->dev; struct nfit_test *nfit_test; + union acpi_object *obj; int rc; nfit_test = to_nfit_test(&pdev->dev); @@ -1475,15 +1482,17 @@ static int nfit_test_probe(struct platform_device *pdev) if 
(nfit_test->setup != nfit_test0_setup) return 0; - flush_work(&acpi_desc->work); nfit_test->setup_hotplug = 1; nfit_test->setup(nfit_test); - rc = acpi_nfit_init(acpi_desc, nfit_test->nfit_buf, - nfit_test->nfit_size); - if (rc) - return rc; - + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + obj->type = ACPI_TYPE_BUFFER; + obj->buffer.length = nfit_test->nfit_size; + obj->buffer.pointer = nfit_test->nfit_buf; + *(nfit_test->_fit) = obj; + __acpi_nfit_notify(&pdev->dev, nfit_test, 0x80); return 0; } From ccdb07f62986968ecd687a71550ed187c8cf875c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 6 Aug 2016 16:05:06 -0700 Subject: [PATCH 04/39] dax: cleanup needlessly global symbol warnings drivers/dax/dax.c:75:6: warning: symbol 'dax_region_put' was not declared. drivers/dax/dax.c:95:19: warning: symbol 'alloc_dax_region' was not declared. drivers/dax/dax.c:173:5: warning: symbol 'devm_create_dax_dev' was not declared. drivers/dax/pmem.c:27:17: warning: symbol 'to_dax_pmem' was not declared. Signed-off-by: Dan Williams --- drivers/dax/dax.c | 1 + drivers/dax/pmem.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 803f3953b341..736c03830fd0 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -18,6 +18,7 @@ #include #include #include +#include "dax.h" static int dax_major; static struct class *dax_class; diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c index dfb168568af1..59b75c5972bb 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem.c @@ -24,7 +24,7 @@ struct dax_pmem { struct completion cmp; }; -struct dax_pmem *to_dax_pmem(struct percpu_ref *ref) +static struct dax_pmem *to_dax_pmem(struct percpu_ref *ref) { return container_of(ref, struct dax_pmem, ref); } From 043a9255021bad498e31365d104d33915b6a6e33 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 7 Aug 2016 08:23:56 -0700 Subject: [PATCH 05/39] dax: reorder dax_fops function definitions In order to convert devm_create_dax_dev() to use cdev, it will need access to dax_fops. Move dax_fops and related function definitions before devm_create_dax_dev(). Signed-off-by: Dan Williams --- drivers/dax/dax.c | 337 +++++++++++++++++++++++----------------------- 1 file changed, 168 insertions(+), 169 deletions(-) diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 736c03830fd0..3774fc9709bb 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -145,175 +145,6 @@ static const struct attribute_group *dax_attribute_groups[] = { NULL, }; -static void unregister_dax_dev(void *_dev) -{ - struct device *dev = _dev; - struct dax_dev *dax_dev = dev_get_drvdata(dev); - struct dax_region *dax_region = dax_dev->region; - - dev_dbg(dev, "%s\n", __func__); - - /* - * Note, rcu is not protecting the liveness of dax_dev, rcu is - * ensuring that any fault handlers that might have seen - * dax_dev->alive == true, have completed. Any fault handlers - * that start after synchronize_rcu() has started will abort - * upon seeing dax_dev->alive == false. 
- */ - dax_dev->alive = false; - synchronize_rcu(); - - get_device(dev); - device_unregister(dev); - ida_simple_remove(&dax_region->ida, dax_dev->id); - ida_simple_remove(&dax_minor_ida, MINOR(dev->devt)); - put_device(dev); - dax_dev_put(dax_dev); -} - -int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, - int count) -{ - struct device *parent = dax_region->dev; - struct dax_dev *dax_dev; - struct device *dev; - int rc, minor; - dev_t dev_t; - - dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL); - if (!dax_dev) - return -ENOMEM; - memcpy(dax_dev->res, res, sizeof(*res) * count); - dax_dev->num_resources = count; - kref_init(&dax_dev->kref); - dax_dev->alive = true; - dax_dev->region = dax_region; - kref_get(&dax_region->kref); - - dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); - if (dax_dev->id < 0) { - rc = dax_dev->id; - goto err_id; - } - - minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL); - if (minor < 0) { - rc = minor; - goto err_minor; - } - - dev_t = MKDEV(dax_major, minor); - dev = device_create_with_groups(dax_class, parent, dev_t, dax_dev, - dax_attribute_groups, "dax%d.%d", dax_region->id, - dax_dev->id); - if (IS_ERR(dev)) { - rc = PTR_ERR(dev); - goto err_create; - } - dax_dev->dev = dev; - - rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); - if (rc) - return rc; - - return 0; - - err_create: - ida_simple_remove(&dax_minor_ida, minor); - err_minor: - ida_simple_remove(&dax_region->ida, dax_dev->id); - err_id: - dax_dev_put(dax_dev); - - return rc; -} -EXPORT_SYMBOL_GPL(devm_create_dax_dev); - -/* return an unmapped area aligned to the dax region specified alignment */ -static unsigned long dax_dev_get_unmapped_area(struct file *filp, - unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - unsigned long off, off_end, off_align, len_align, addr_align, align; - struct dax_dev *dax_dev = filp ? 
filp->private_data : NULL; - struct dax_region *dax_region; - - if (!dax_dev || addr) - goto out; - - dax_region = dax_dev->region; - align = dax_region->align; - off = pgoff << PAGE_SHIFT; - off_end = off + len; - off_align = round_up(off, align); - - if ((off_end <= off_align) || ((off_end - off_align) < align)) - goto out; - - len_align = len + align; - if ((off + len_align) < off) - goto out; - - addr_align = current->mm->get_unmapped_area(filp, addr, len_align, - pgoff, flags); - if (!IS_ERR_VALUE(addr_align)) { - addr_align += (off - addr_align) & (align - 1); - return addr_align; - } - out: - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); -} - -static int __match_devt(struct device *dev, const void *data) -{ - const dev_t *devt = data; - - return dev->devt == *devt; -} - -static struct device *dax_dev_find(dev_t dev_t) -{ - return class_find_device(dax_class, NULL, &dev_t, __match_devt); -} - -static int dax_dev_open(struct inode *inode, struct file *filp) -{ - struct dax_dev *dax_dev = NULL; - struct device *dev; - - dev = dax_dev_find(inode->i_rdev); - if (!dev) - return -ENXIO; - - device_lock(dev); - dax_dev = dev_get_drvdata(dev); - if (dax_dev) { - dev_dbg(dev, "%s\n", __func__); - filp->private_data = dax_dev; - kref_get(&dax_dev->kref); - inode->i_flags = S_DAX; - } - device_unlock(dev); - - if (!dax_dev) { - put_device(dev); - return -ENXIO; - } - return 0; -} - -static int dax_dev_release(struct inode *inode, struct file *filp) -{ - struct dax_dev *dax_dev = filp->private_data; - struct device *dev = dax_dev->dev; - - dev_dbg(dax_dev->dev, "%s\n", __func__); - dax_dev_put(dax_dev); - put_device(dev); - - return 0; -} - static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma, const char *func) { @@ -531,7 +362,91 @@ static int dax_dev_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_ops = &dax_dev_vm_ops; vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; return 0; +} +/* return an unmapped area aligned to the dax region specified alignment */ +static unsigned long dax_dev_get_unmapped_area(struct file *filp, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + unsigned long off, off_end, off_align, len_align, addr_align, align; + struct dax_dev *dax_dev = filp ? 
filp->private_data : NULL; + struct dax_region *dax_region; + + if (!dax_dev || addr) + goto out; + + dax_region = dax_dev->region; + align = dax_region->align; + off = pgoff << PAGE_SHIFT; + off_end = off + len; + off_align = round_up(off, align); + + if ((off_end <= off_align) || ((off_end - off_align) < align)) + goto out; + + len_align = len + align; + if ((off + len_align) < off) + goto out; + + addr_align = current->mm->get_unmapped_area(filp, addr, len_align, + pgoff, flags); + if (!IS_ERR_VALUE(addr_align)) { + addr_align += (off - addr_align) & (align - 1); + return addr_align; + } + out: + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); +} + +static int __match_devt(struct device *dev, const void *data) +{ + const dev_t *devt = data; + + return dev->devt == *devt; +} + +static struct device *dax_dev_find(dev_t dev_t) +{ + return class_find_device(dax_class, NULL, &dev_t, __match_devt); +} + +static int dax_dev_open(struct inode *inode, struct file *filp) +{ + struct dax_dev *dax_dev = NULL; + struct device *dev; + + dev = dax_dev_find(inode->i_rdev); + if (!dev) + return -ENXIO; + + device_lock(dev); + dax_dev = dev_get_drvdata(dev); + if (dax_dev) { + dev_dbg(dev, "%s\n", __func__); + filp->private_data = dax_dev; + kref_get(&dax_dev->kref); + inode->i_flags = S_DAX; + } + device_unlock(dev); + + if (!dax_dev) { + put_device(dev); + return -ENXIO; + } + return 0; +} + +static int dax_dev_release(struct inode *inode, struct file *filp) +{ + struct dax_dev *dax_dev = filp->private_data; + struct device *dev = dax_dev->dev; + + dev_dbg(dax_dev->dev, "%s\n", __func__); + dax_dev_put(dax_dev); + put_device(dev); + + return 0; } static const struct file_operations dax_fops = { @@ -543,6 +458,90 @@ static const struct file_operations dax_fops = { .mmap = dax_dev_mmap, }; +static void unregister_dax_dev(void *_dev) +{ + struct device *dev = _dev; + struct dax_dev *dax_dev = dev_get_drvdata(dev); + struct dax_region *dax_region = dax_dev->region; + + dev_dbg(dev, "%s\n", __func__); + + /* + * Note, rcu is not protecting the liveness of dax_dev, rcu is + * ensuring that any fault handlers that might have seen + * dax_dev->alive == true, have completed. Any fault handlers + * that start after synchronize_rcu() has started will abort + * upon seeing dax_dev->alive == false. 
+ */ + dax_dev->alive = false; + synchronize_rcu(); + + get_device(dev); + device_unregister(dev); + ida_simple_remove(&dax_region->ida, dax_dev->id); + ida_simple_remove(&dax_minor_ida, MINOR(dev->devt)); + put_device(dev); + dax_dev_put(dax_dev); +} + +int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, + int count) +{ + struct device *parent = dax_region->dev; + struct dax_dev *dax_dev; + struct device *dev; + int rc, minor; + dev_t dev_t; + + dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL); + if (!dax_dev) + return -ENOMEM; + memcpy(dax_dev->res, res, sizeof(*res) * count); + dax_dev->num_resources = count; + kref_init(&dax_dev->kref); + dax_dev->alive = true; + dax_dev->region = dax_region; + kref_get(&dax_region->kref); + + dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); + if (dax_dev->id < 0) { + rc = dax_dev->id; + goto err_id; + } + + minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL); + if (minor < 0) { + rc = minor; + goto err_minor; + } + + dev_t = MKDEV(dax_major, minor); + dev = device_create_with_groups(dax_class, parent, dev_t, dax_dev, + dax_attribute_groups, "dax%d.%d", dax_region->id, + dax_dev->id); + if (IS_ERR(dev)) { + rc = PTR_ERR(dev); + goto err_create; + } + dax_dev->dev = dev; + + rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); + if (rc) + return rc; + + return 0; + + err_create: + ida_simple_remove(&dax_minor_ida, minor); + err_minor: + ida_simple_remove(&dax_region->ida, dax_dev->id); + err_id: + dax_dev_put(dax_dev); + + return rc; +} +EXPORT_SYMBOL_GPL(devm_create_dax_dev); + static int __init dax_init(void) { int rc; From af69f51e506f5ad3625c817ba2449a439bbe68ef Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 11 Aug 2016 00:38:03 -0700 Subject: [PATCH 06/39] dax: rename fops from dax_dev_ to dax_ Shorten the prefix of the file operations to distinguish them from operations on the struct device associated with the dax_dev. 
Signed-off-by: Dan Williams --- drivers/dax/dax.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 3774fc9709bb..994dfa507dfb 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -347,7 +347,7 @@ static const struct vm_operations_struct dax_dev_vm_ops = { .close = dax_dev_vm_close, }; -static int dax_dev_mmap(struct file *filp, struct vm_area_struct *vma) +static int dax_mmap(struct file *filp, struct vm_area_struct *vma) { struct dax_dev *dax_dev = filp->private_data; int rc; @@ -365,7 +365,7 @@ static int dax_dev_mmap(struct file *filp, struct vm_area_struct *vma) } /* return an unmapped area aligned to the dax region specified alignment */ -static unsigned long dax_dev_get_unmapped_area(struct file *filp, +static unsigned long dax_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { @@ -411,7 +411,7 @@ static struct device *dax_dev_find(dev_t dev_t) return class_find_device(dax_class, NULL, &dev_t, __match_devt); } -static int dax_dev_open(struct inode *inode, struct file *filp) +static int dax_open(struct inode *inode, struct file *filp) { struct dax_dev *dax_dev = NULL; struct device *dev; @@ -437,7 +437,7 @@ static int dax_dev_open(struct inode *inode, struct file *filp) return 0; } -static int dax_dev_release(struct inode *inode, struct file *filp) +static int dax_release(struct inode *inode, struct file *filp) { struct dax_dev *dax_dev = filp->private_data; struct device *dev = dax_dev->dev; @@ -452,10 +452,10 @@ static int dax_dev_release(struct inode *inode, struct file *filp) static const struct file_operations dax_fops = { .llseek = noop_llseek, .owner = THIS_MODULE, - .open = dax_dev_open, - .release = dax_dev_release, - .get_unmapped_area = dax_dev_get_unmapped_area, - .mmap = dax_dev_mmap, + .open = dax_open, + .release = dax_release, + .get_unmapped_area = dax_get_unmapped_area, + .mmap = dax_mmap, }; static void unregister_dax_dev(void *_dev) From ebd84d724c85f22037a5c9cb04b9e6631309cb78 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 11 Aug 2016 00:41:51 -0700 Subject: [PATCH 07/39] dax: embed a struct device in dax_dev The kref in dax_dev can be made redundant if the final put_device() on the device associated with the dax_dev frees the dax_dev. This can be accomplished by embedding a struct device in struct dax_dev, open coding device_create() and specifying a custom release method. 
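For reference, the open-coded device_create() sequence with the custom release method (condensed from the hunk below) is:

        dev = &dax_dev->dev;
        device_initialize(dev);
        dev->devt = dev_t;
        dev->class = dax_class;
        dev->parent = parent;
        dev->groups = dax_attribute_groups;
        dev->release = dax_dev_release;   /* frees dax_dev on the final put */
        dev_set_name(dev, "dax%d.%d", dax_region->id, dax_dev->id);
        rc = device_add(dev);
        if (rc) {
                put_device(dev);          /* ->release() performs the teardown */
                return rc;
        }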
Signed-off-by: Dan Williams --- drivers/dax/dax.c | 130 ++++++++++++++++------------------------------ 1 file changed, 45 insertions(+), 85 deletions(-) diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 994dfa507dfb..181d2a5a21e4 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -49,7 +49,6 @@ struct dax_region { * struct dax_dev - subdivision of a dax region * @region - parent region * @dev - device backing the character device - * @kref - enable this data to be tracked in filp->private_data * @alive - !alive + rcu grace period == no new mappings can be established * @id - child id in the region * @num_resources - number of physical address extents in this device @@ -57,8 +56,7 @@ struct dax_region { */ struct dax_dev { struct dax_region *region; - struct device *dev; - struct kref kref; + struct device dev; bool alive; int id; int num_resources; @@ -79,20 +77,6 @@ void dax_region_put(struct dax_region *dax_region) } EXPORT_SYMBOL_GPL(dax_region_put); -static void dax_dev_free(struct kref *kref) -{ - struct dax_dev *dax_dev; - - dax_dev = container_of(kref, struct dax_dev, kref); - dax_region_put(dax_dev->region); - kfree(dax_dev); -} - -static void dax_dev_put(struct dax_dev *dax_dev) -{ - kref_put(&dax_dev->kref, dax_dev_free); -} - struct dax_region *alloc_dax_region(struct device *parent, int region_id, struct resource *res, unsigned int align, void *addr, unsigned long pfn_flags) @@ -117,10 +101,15 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, } EXPORT_SYMBOL_GPL(alloc_dax_region); +static struct dax_dev *to_dax_dev(struct device *dev) +{ + return container_of(dev, struct dax_dev, dev); +} + static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct dax_dev *dax_dev = dev_get_drvdata(dev); + struct dax_dev *dax_dev = to_dax_dev(dev); unsigned long long size = 0; int i; @@ -149,7 +138,7 @@ static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma, const char *func) { struct dax_region *dax_region = dax_dev->region; - struct device *dev = dax_dev->dev; + struct device *dev = &dax_dev->dev; unsigned long mask; if (!dax_dev->alive) @@ -214,7 +203,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long) vmf->virtual_address; - struct device *dev = dax_dev->dev; + struct device *dev = &dax_dev->dev; struct dax_region *dax_region; int rc = VM_FAULT_SIGBUS; phys_addr_t phys; @@ -254,7 +243,7 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct file *filp = vma->vm_file; struct dax_dev *dax_dev = filp->private_data; - dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, + dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, current->comm, (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read", vma->vm_start, vma->vm_end); rcu_read_lock(); @@ -269,7 +258,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, unsigned int flags) { unsigned long pmd_addr = addr & PMD_MASK; - struct device *dev = dax_dev->dev; + struct device *dev = &dax_dev->dev; struct dax_region *dax_region; phys_addr_t phys; pgoff_t pgoff; @@ -311,7 +300,7 @@ static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr, struct file *filp = vma->vm_file; struct dax_dev *dax_dev = filp->private_data; - dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, + dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, current->comm, (flags & FAULT_FLAG_WRITE) ? 
"write" : "read", vma->vm_start, vma->vm_end); @@ -322,29 +311,9 @@ static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr, return rc; } -static void dax_dev_vm_open(struct vm_area_struct *vma) -{ - struct file *filp = vma->vm_file; - struct dax_dev *dax_dev = filp->private_data; - - dev_dbg(dax_dev->dev, "%s\n", __func__); - kref_get(&dax_dev->kref); -} - -static void dax_dev_vm_close(struct vm_area_struct *vma) -{ - struct file *filp = vma->vm_file; - struct dax_dev *dax_dev = filp->private_data; - - dev_dbg(dax_dev->dev, "%s\n", __func__); - dax_dev_put(dax_dev); -} - static const struct vm_operations_struct dax_dev_vm_ops = { .fault = dax_dev_fault, .pmd_fault = dax_dev_pmd_fault, - .open = dax_dev_vm_open, - .close = dax_dev_vm_close, }; static int dax_mmap(struct file *filp, struct vm_area_struct *vma) @@ -352,13 +321,12 @@ static int dax_mmap(struct file *filp, struct vm_area_struct *vma) struct dax_dev *dax_dev = filp->private_data; int rc; - dev_dbg(dax_dev->dev, "%s\n", __func__); + dev_dbg(&dax_dev->dev, "%s\n", __func__); rc = check_vma(dax_dev, vma, __func__); if (rc) return rc; - kref_get(&dax_dev->kref); vma->vm_ops = &dax_dev_vm_ops; vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; return 0; @@ -420,30 +388,20 @@ static int dax_open(struct inode *inode, struct file *filp) if (!dev) return -ENXIO; - device_lock(dev); - dax_dev = dev_get_drvdata(dev); - if (dax_dev) { - dev_dbg(dev, "%s\n", __func__); - filp->private_data = dax_dev; - kref_get(&dax_dev->kref); - inode->i_flags = S_DAX; - } - device_unlock(dev); + dax_dev = to_dax_dev(dev); + dev_dbg(dev, "%s\n", __func__); + filp->private_data = dax_dev; + inode->i_flags = S_DAX; - if (!dax_dev) { - put_device(dev); - return -ENXIO; - } return 0; } static int dax_release(struct inode *inode, struct file *filp) { struct dax_dev *dax_dev = filp->private_data; - struct device *dev = dax_dev->dev; + struct device *dev = &dax_dev->dev; - dev_dbg(dax_dev->dev, "%s\n", __func__); - dax_dev_put(dax_dev); + dev_dbg(dev, "%s\n", __func__); put_device(dev); return 0; @@ -458,12 +416,21 @@ static const struct file_operations dax_fops = { .mmap = dax_mmap, }; -static void unregister_dax_dev(void *_dev) +static void dax_dev_release(struct device *dev) { - struct device *dev = _dev; - struct dax_dev *dax_dev = dev_get_drvdata(dev); + struct dax_dev *dax_dev = to_dax_dev(dev); struct dax_region *dax_region = dax_dev->region; + ida_simple_remove(&dax_region->ida, dax_dev->id); + ida_simple_remove(&dax_minor_ida, MINOR(dev->devt)); + dax_region_put(dax_region); + kfree(dax_dev); +} + +static void unregister_dax_dev(void *dev) +{ + struct dax_dev *dax_dev = to_dax_dev(dev); + dev_dbg(dev, "%s\n", __func__); /* @@ -475,13 +442,7 @@ static void unregister_dax_dev(void *_dev) */ dax_dev->alive = false; synchronize_rcu(); - - get_device(dev); device_unregister(dev); - ida_simple_remove(&dax_region->ida, dax_dev->id); - ida_simple_remove(&dax_minor_ida, MINOR(dev->devt)); - put_device(dev); - dax_dev_put(dax_dev); } int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, @@ -498,7 +459,6 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, return -ENOMEM; memcpy(dax_dev->res, res, sizeof(*res) * count); dax_dev->num_resources = count; - kref_init(&dax_dev->kref); dax_dev->alive = true; dax_dev->region = dax_region; kref_get(&dax_region->kref); @@ -516,27 +476,27 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, } dev_t = MKDEV(dax_major, minor); - dev = 
device_create_with_groups(dax_class, parent, dev_t, dax_dev, - dax_attribute_groups, "dax%d.%d", dax_region->id, - dax_dev->id); - if (IS_ERR(dev)) { - rc = PTR_ERR(dev); - goto err_create; - } - dax_dev->dev = dev; - rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); - if (rc) + dev = &dax_dev->dev; + device_initialize(dev); + dev->devt = dev_t; + dev->class = dax_class; + dev->parent = parent; + dev->groups = dax_attribute_groups; + dev->release = dax_dev_release; + dev_set_name(dev, "dax%d.%d", dax_region->id, dax_dev->id); + rc = device_add(dev); + if (rc) { + put_device(dev); return rc; + } - return 0; + return devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); - err_create: - ida_simple_remove(&dax_minor_ida, minor); err_minor: ida_simple_remove(&dax_region->ida, dax_dev->id); err_id: - dax_dev_put(dax_dev); + kfree(dax_dev); return rc; } From ba09c01d2fa866f22e42ac2af405fe386f491879 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 24 Jul 2016 15:55:42 -0700 Subject: [PATCH 08/39] dax: convert to the cdev api A goal of the device-DAX interface is to be able to support many exclusive allocations (partitions) of performance / feature differentiated memory. This count may exceed the default minors limit of 256. As a result of switching to an embedded cdev the inode-to-dax_dev conversion is simplified, as well as reference counting which can switch to the cdev kobject lifetime. Cc: Al Viro Signed-off-by: Dan Williams --- drivers/dax/Kconfig | 5 +++ drivers/dax/dax.c | 82 ++++++++++++++++++++++----------------------- 2 files changed, 46 insertions(+), 41 deletions(-) diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index cedab7572de3..daadd20aa936 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -23,4 +23,9 @@ config DEV_DAX_PMEM Say Y if unsure +config NR_DEV_DAX + int "Maximum number of Device-DAX instances" + default 32768 + range 256 2147483647 + endif diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 181d2a5a21e4..17715773c097 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -14,15 +14,19 @@ #include #include #include +#include #include #include #include #include #include "dax.h" -static int dax_major; +static dev_t dax_devt; static struct class *dax_class; static DEFINE_IDA(dax_minor_ida); +static int nr_dax = CONFIG_NR_DEV_DAX; +module_param(nr_dax, int, S_IRUGO); +MODULE_PARM_DESC(nr_dax, "max number of device-dax instances"); /** * struct dax_region - mapping infrastructure for dax devices @@ -49,6 +53,7 @@ struct dax_region { * struct dax_dev - subdivision of a dax region * @region - parent region * @dev - device backing the character device + * @cdev - core chardev data * @alive - !alive + rcu grace period == no new mappings can be established * @id - child id in the region * @num_resources - number of physical address extents in this device @@ -57,6 +62,7 @@ struct dax_region { struct dax_dev { struct dax_region *region; struct device dev; + struct cdev cdev; bool alive; int id; int num_resources; @@ -367,29 +373,12 @@ static unsigned long dax_get_unmapped_area(struct file *filp, return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); } -static int __match_devt(struct device *dev, const void *data) -{ - const dev_t *devt = data; - - return dev->devt == *devt; -} - -static struct device *dax_dev_find(dev_t dev_t) -{ - return class_find_device(dax_class, NULL, &dev_t, __match_devt); -} - static int dax_open(struct inode *inode, struct file *filp) { - struct dax_dev *dax_dev = NULL; - 
struct device *dev; + struct dax_dev *dax_dev; - dev = dax_dev_find(inode->i_rdev); - if (!dev) - return -ENXIO; - - dax_dev = to_dax_dev(dev); - dev_dbg(dev, "%s\n", __func__); + dax_dev = container_of(inode->i_cdev, struct dax_dev, cdev); + dev_dbg(&dax_dev->dev, "%s\n", __func__); filp->private_data = dax_dev; inode->i_flags = S_DAX; @@ -399,11 +388,8 @@ static int dax_open(struct inode *inode, struct file *filp) static int dax_release(struct inode *inode, struct file *filp) { struct dax_dev *dax_dev = filp->private_data; - struct device *dev = &dax_dev->dev; - - dev_dbg(dev, "%s\n", __func__); - put_device(dev); + dev_dbg(&dax_dev->dev, "%s\n", __func__); return 0; } @@ -430,6 +416,7 @@ static void dax_dev_release(struct device *dev) static void unregister_dax_dev(void *dev) { struct dax_dev *dax_dev = to_dax_dev(dev); + struct cdev *cdev = &dax_dev->cdev; dev_dbg(dev, "%s\n", __func__); @@ -442,6 +429,7 @@ static void unregister_dax_dev(void *dev) */ dax_dev->alive = false; synchronize_rcu(); + cdev_del(cdev); device_unregister(dev); } @@ -451,17 +439,13 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, struct device *parent = dax_region->dev; struct dax_dev *dax_dev; struct device *dev; + struct cdev *cdev; int rc, minor; dev_t dev_t; dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL); if (!dax_dev) return -ENOMEM; - memcpy(dax_dev->res, res, sizeof(*res) * count); - dax_dev->num_resources = count; - dax_dev->alive = true; - dax_dev->region = dax_region; - kref_get(&dax_region->kref); dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); if (dax_dev->id < 0) { @@ -475,10 +459,26 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, goto err_minor; } - dev_t = MKDEV(dax_major, minor); - + /* device_initialize() so cdev can reference kobj parent */ + dev_t = MKDEV(MAJOR(dax_devt), minor); dev = &dax_dev->dev; device_initialize(dev); + + cdev = &dax_dev->cdev; + cdev_init(cdev, &dax_fops); + cdev->owner = parent->driver->owner; + cdev->kobj.parent = &dev->kobj; + rc = cdev_add(&dax_dev->cdev, dev_t, 1); + if (rc) + goto err_cdev; + + /* from here on we're committed to teardown via dax_dev_release() */ + memcpy(dax_dev->res, res, sizeof(*res) * count); + dax_dev->num_resources = count; + dax_dev->alive = true; + dax_dev->region = dax_region; + kref_get(&dax_region->kref); + dev->devt = dev_t; dev->class = dax_class; dev->parent = parent; @@ -493,6 +493,8 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, return devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); + err_cdev: + ida_simple_remove(&dax_minor_ida, minor); err_minor: ida_simple_remove(&dax_region->ida, dax_dev->id); err_id: @@ -506,24 +508,22 @@ static int __init dax_init(void) { int rc; - rc = register_chrdev(0, "dax", &dax_fops); - if (rc < 0) + nr_dax = max(nr_dax, 256); + rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax"); + if (rc) return rc; - dax_major = rc; dax_class = class_create(THIS_MODULE, "dax"); - if (IS_ERR(dax_class)) { - unregister_chrdev(dax_major, "dax"); - return PTR_ERR(dax_class); - } + if (IS_ERR(dax_class)) + unregister_chrdev_region(dax_devt, nr_dax); - return 0; + return PTR_ERR_OR_ZERO(dax_class); } static void __exit dax_exit(void) { class_destroy(dax_class); - unregister_chrdev(dax_major, "dax"); + unregister_chrdev_region(dax_devt, nr_dax); ida_destroy(&dax_minor_ida); } From 3bc52c45bac26bf7ed1dc8d287ad1aeaed1250b6 Mon Sep 17 00:00:00 2001 From: Dan Williams 
Date: Sun, 24 Jul 2016 21:55:45 -0700 Subject: [PATCH 09/39] dax: define a unified inode/address_space for device-dax mappings In support of enabling resize / truncate of device-dax instances, define a pseudo-fs to provide a unified inode/address space for vm operations. Cc: Al Viro Signed-off-by: Dan Williams --- drivers/dax/dax.c | 154 +++++++++++++++++++++++++++++++++++-- fs/char_dev.c | 1 + include/uapi/linux/magic.h | 1 + 3 files changed, 150 insertions(+), 6 deletions(-) diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 17715773c097..e8b9319aeadb 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -13,7 +13,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -26,6 +28,9 @@ static struct class *dax_class; static DEFINE_IDA(dax_minor_ida); static int nr_dax = CONFIG_NR_DEV_DAX; module_param(nr_dax, int, S_IRUGO); +static struct vfsmount *dax_mnt; +static struct kmem_cache *dax_cache __read_mostly; +static struct super_block *dax_superblock __read_mostly; MODULE_PARM_DESC(nr_dax, "max number of device-dax instances"); /** @@ -61,6 +66,7 @@ struct dax_region { */ struct dax_dev { struct dax_region *region; + struct inode *inode; struct device dev; struct cdev cdev; bool alive; @@ -69,6 +75,117 @@ struct dax_dev { struct resource res[0]; }; +static struct inode *dax_alloc_inode(struct super_block *sb) +{ + return kmem_cache_alloc(dax_cache, GFP_KERNEL); +} + +static void dax_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + + kmem_cache_free(dax_cache, inode); +} + +static void dax_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, dax_i_callback); +} + +static const struct super_operations dax_sops = { + .statfs = simple_statfs, + .alloc_inode = dax_alloc_inode, + .destroy_inode = dax_destroy_inode, + .drop_inode = generic_delete_inode, +}; + +static struct dentry *dax_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC); +} + +static struct file_system_type dax_type = { + .name = "dax", + .mount = dax_mount, + .kill_sb = kill_anon_super, +}; + +static int dax_test(struct inode *inode, void *data) +{ + return inode->i_cdev == data; +} + +static int dax_set(struct inode *inode, void *data) +{ + inode->i_cdev = data; + return 0; +} + +static struct inode *dax_inode_get(struct cdev *cdev, dev_t devt) +{ + struct inode *inode; + + inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), + dax_test, dax_set, cdev); + + if (!inode) + return NULL; + + if (inode->i_state & I_NEW) { + inode->i_mode = S_IFCHR; + inode->i_flags = S_DAX; + inode->i_rdev = devt; + mapping_set_gfp_mask(&inode->i_data, GFP_USER); + unlock_new_inode(inode); + } + return inode; +} + +static void init_once(void *inode) +{ + inode_init_once(inode); +} + +static int dax_inode_init(void) +{ + int rc; + + dax_cache = kmem_cache_create("dax_cache", sizeof(struct inode), 0, + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_ACCOUNT), + init_once); + if (!dax_cache) + return -ENOMEM; + + rc = register_filesystem(&dax_type); + if (rc) + goto err_register_fs; + + dax_mnt = kern_mount(&dax_type); + if (IS_ERR(dax_mnt)) { + rc = PTR_ERR(dax_mnt); + goto err_mount; + } + dax_superblock = dax_mnt->mnt_sb; + + return 0; + + err_mount: + unregister_filesystem(&dax_type); + err_register_fs: + kmem_cache_destroy(dax_cache); + + return rc; +} + +static void dax_inode_exit(void) +{ + 
kern_unmount(dax_mnt); + unregister_filesystem(&dax_type); + kmem_cache_destroy(dax_cache); +} + static void dax_region_free(struct kref *kref) { struct dax_region *dax_region; @@ -379,6 +496,9 @@ static int dax_open(struct inode *inode, struct file *filp) dax_dev = container_of(inode->i_cdev, struct dax_dev, cdev); dev_dbg(&dax_dev->dev, "%s\n", __func__); + inode->i_mapping = dax_dev->inode->i_mapping; + inode->i_mapping->host = dax_dev->inode; + filp->f_mapping = inode->i_mapping; filp->private_data = dax_dev; inode->i_flags = S_DAX; @@ -410,6 +530,7 @@ static void dax_dev_release(struct device *dev) ida_simple_remove(&dax_region->ida, dax_dev->id); ida_simple_remove(&dax_minor_ida, MINOR(dev->devt)); dax_region_put(dax_region); + iput(dax_dev->inode); kfree(dax_dev); } @@ -459,6 +580,12 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, goto err_minor; } + dax_dev->inode = dax_inode_get(&dax_dev->cdev, dev_t); + if (!dax_dev->inode) { + rc = -ENOMEM; + goto err_inode; + } + /* device_initialize() so cdev can reference kobj parent */ dev_t = MKDEV(MAJOR(dax_devt), minor); dev = &dax_dev->dev; @@ -494,6 +621,8 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, return devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); err_cdev: + iput(dax_dev->inode); + err_inode: ida_simple_remove(&dax_minor_ida, minor); err_minor: ida_simple_remove(&dax_region->ida, dax_dev->id); @@ -508,16 +637,28 @@ static int __init dax_init(void) { int rc; - nr_dax = max(nr_dax, 256); - rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax"); + rc = dax_inode_init(); if (rc) return rc; - dax_class = class_create(THIS_MODULE, "dax"); - if (IS_ERR(dax_class)) - unregister_chrdev_region(dax_devt, nr_dax); + nr_dax = max(nr_dax, 256); + rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax"); + if (rc) + goto err_chrdev; - return PTR_ERR_OR_ZERO(dax_class); + dax_class = class_create(THIS_MODULE, "dax"); + if (IS_ERR(dax_class)) { + rc = PTR_ERR(dax_class); + goto err_class; + } + + return 0; + + err_class: + unregister_chrdev_region(dax_devt, nr_dax); + err_chrdev: + dax_inode_exit(); + return rc; } static void __exit dax_exit(void) @@ -525,6 +666,7 @@ static void __exit dax_exit(void) class_destroy(dax_class); unregister_chrdev_region(dax_devt, nr_dax); ida_destroy(&dax_minor_ida); + dax_inode_exit(); } MODULE_AUTHOR("Intel Corporation"); diff --git a/fs/char_dev.c b/fs/char_dev.c index 6edd825231c5..44a240c4bb65 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -406,6 +406,7 @@ void cd_forget(struct inode *inode) spin_lock(&cdev_lock); list_del_init(&inode->i_devices); inode->i_cdev = NULL; + inode->i_mapping = &inode->i_data; spin_unlock(&cdev_lock); } diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index e398beac67b8..9bd559472c92 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -65,6 +65,7 @@ #define V9FS_MAGIC 0x01021997 #define BDEVFS_MAGIC 0x62646576 +#define DAXFS_MAGIC 0x64646178 #define BINFMTFS_MAGIC 0x42494e4d #define DEVPTS_SUPER_MAGIC 0x1cd1 #define FUTEXFS_SUPER_MAGIC 0xBAD1DEA From 9dc1e4927bfabaf654738c9ecca3a4926a0aaeb5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 4 Aug 2016 16:53:50 -0700 Subject: [PATCH 10/39] dax: unmap/truncate on device shutdown Invalidate all mappings of a device-dax instance when the device is unregistered. 
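With the unified address_space introduced by the previous patch, the invalidation reduces to a single call in the unregister path (sketch of the one-line hunk below):

        /* holelen == 0 means "to the end"; even_cows == 1 also zaps private COW copies */
        unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1);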
Signed-off-by: Dan Williams --- drivers/dax/dax.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index e8b9319aeadb..0a7899d5c65c 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -550,6 +550,7 @@ static void unregister_dax_dev(void *dev) */ dax_dev->alive = false; synchronize_rcu(); + unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1); cdev_del(cdev); device_unregister(dev); } From 9d2d01a031a945075d4609b1c4d3c73f10ba61e7 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 19 Jul 2016 16:17:58 -0700 Subject: [PATCH 11/39] dax: check resource alignment at dax region/device create All the extents of a dax-device must match the alignment of the region. Otherwise, we are unable to guarantee fault semantics of a given page size. The region must be self-consistent itself as well. Signed-off-by: Dan Williams --- drivers/dax/dax.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 0a7899d5c65c..03bb54f7f58f 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -206,8 +206,11 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, { struct dax_region *dax_region; - dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); + if (!IS_ALIGNED(res->start, align) + || !IS_ALIGNED(resource_size(res), align)) + return NULL; + dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); if (!dax_region) return NULL; @@ -560,15 +563,29 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, { struct device *parent = dax_region->dev; struct dax_dev *dax_dev; + int rc = 0, minor, i; struct device *dev; struct cdev *cdev; - int rc, minor; dev_t dev_t; dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL); if (!dax_dev) return -ENOMEM; + for (i = 0; i < count; i++) { + if (!IS_ALIGNED(res[i].start, dax_region->align) + || !IS_ALIGNED(resource_size(&res[i]), + dax_region->align)) { + rc = -EINVAL; + break; + } + dax_dev->res[i].start = res[i].start; + dax_dev->res[i].end = res[i].end; + } + + if (i < count) + goto err_id; + dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); if (dax_dev->id < 0) { rc = dax_dev->id; @@ -601,7 +618,6 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, goto err_cdev; /* from here on we're committed to teardown via dax_dev_release() */ - memcpy(dax_dev->res, res, sizeof(*res) * count); dax_dev->num_resources = count; dax_dev->alive = true; dax_dev->region = dax_region; From ba9c8dd3c22275e46feef429f343b85e9cf3924c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 22 Aug 2016 19:28:37 -0700 Subject: [PATCH 12/39] acpi, nfit: add dimm device notification support Per "ACPI 6.1 Section 9.20.3" NVDIMM devices, children of the ACPI0012 NVDIMM Root device, can receive health event notifications. Given that these devices are precluded from registering a notification handler via acpi_driver.acpi_device_ops (due to no _HID), we use acpi_install_notify_handler() directly. The registered handler, acpi_nvdimm_notify(), triggers a poll(2) event on the nmemX/nfit/flags sysfs attribute when a health event notification is received. Cc: Rafael J. Wysocki Tested-by: Toshi Kani Reviewed-by: Vishal Verma Acked-by: Rafael J. 
Wysocki Reviewed-by: Toshi Kani Signed-off-by: Dan Williams --- drivers/acpi/nfit/core.c | 95 ++++++++++++++++++++++++++++++++++++-- drivers/acpi/nfit/nfit.h | 5 ++ drivers/nvdimm/dimm_devs.c | 6 +++ include/linux/libnvdimm.h | 1 + 4 files changed, 103 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 8120e8218f93..4af1db1cb599 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -1248,6 +1248,43 @@ static struct nvdimm *acpi_nfit_dimm_by_handle(struct acpi_nfit_desc *acpi_desc, return NULL; } +static void __acpi_nvdimm_notify(struct device *dev, u32 event) +{ + struct nfit_mem *nfit_mem; + struct acpi_nfit_desc *acpi_desc; + + dev_dbg(dev->parent, "%s: %s: event: %d\n", dev_name(dev), __func__, + event); + + if (event != NFIT_NOTIFY_DIMM_HEALTH) { + dev_dbg(dev->parent, "%s: unknown event: %d\n", dev_name(dev), + event); + return; + } + + acpi_desc = dev_get_drvdata(dev->parent); + if (!acpi_desc) + return; + + /* + * If we successfully retrieved acpi_desc, then we know nfit_mem data + * is still valid. + */ + nfit_mem = dev_get_drvdata(dev); + if (nfit_mem && nfit_mem->flags_attr) + sysfs_notify_dirent(nfit_mem->flags_attr); +} + +static void acpi_nvdimm_notify(acpi_handle handle, u32 event, void *data) +{ + struct acpi_device *adev = data; + struct device *dev = &adev->dev; + + device_lock(dev->parent); + __acpi_nvdimm_notify(dev, event); + device_unlock(dev->parent); +} + static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, struct nfit_mem *nfit_mem, u32 device_handle) { @@ -1272,6 +1309,13 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, return force_enable_dimms ? 0 : -ENODEV; } + if (ACPI_FAILURE(acpi_install_notify_handler(adev_dimm->handle, + ACPI_DEVICE_NOTIFY, acpi_nvdimm_notify, adev_dimm))) { + dev_err(dev, "%s: notification registration failed\n", + dev_name(&adev_dimm->dev)); + return -ENXIO; + } + /* * Until standardization materializes we need to consider 4 * different command sets. Note, that checking for function0 (bit0) @@ -1310,18 +1354,38 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, return 0; } +static void shutdown_dimm_notify(void *data) +{ + struct acpi_nfit_desc *acpi_desc = data; + struct nfit_mem *nfit_mem; + + mutex_lock(&acpi_desc->init_mutex); + /* + * Clear out the nfit_mem->flags_attr and shut down dimm event + * notifications. 
+ */ + list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { + if (nfit_mem->flags_attr) { + sysfs_put(nfit_mem->flags_attr); + nfit_mem->flags_attr = NULL; + } + acpi_remove_notify_handler(nfit_mem->adev->handle, + ACPI_DEVICE_NOTIFY, acpi_nvdimm_notify); + } + mutex_unlock(&acpi_desc->init_mutex); +} + static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) { struct nfit_mem *nfit_mem; - int dimm_count = 0; + int dimm_count = 0, rc; + struct nvdimm *nvdimm; list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { struct acpi_nfit_flush_address *flush; unsigned long flags = 0, cmd_mask; - struct nvdimm *nvdimm; u32 device_handle; u16 mem_flags; - int rc; device_handle = __to_nfit_memdev(nfit_mem)->device_handle; nvdimm = acpi_nfit_dimm_by_handle(acpi_desc, device_handle); @@ -1374,7 +1438,30 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) } - return nvdimm_bus_check_dimm_count(acpi_desc->nvdimm_bus, dimm_count); + rc = nvdimm_bus_check_dimm_count(acpi_desc->nvdimm_bus, dimm_count); + if (rc) + return rc; + + /* + * Now that dimms are successfully registered, and async registration + * is flushed, attempt to enable event notification. + */ + list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { + struct kernfs_node *nfit_kernfs; + + nvdimm = nfit_mem->nvdimm; + nfit_kernfs = sysfs_get_dirent(nvdimm_kobj(nvdimm)->sd, "nfit"); + if (nfit_kernfs) + nfit_mem->flags_attr = sysfs_get_dirent(nfit_kernfs, + "flags"); + sysfs_put(nfit_kernfs); + if (!nfit_mem->flags_attr) + dev_warn(acpi_desc->dev, "%s: notifications disabled\n", + nvdimm_name(nvdimm)); + } + + return devm_add_action_or_reset(acpi_desc->dev, shutdown_dimm_notify, + acpi_desc); } static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h index 52370347fb0e..13195824778c 100644 --- a/drivers/acpi/nfit/nfit.h +++ b/drivers/acpi/nfit/nfit.h @@ -82,6 +82,10 @@ enum nfit_root_notifiers { NFIT_NOTIFY_UPDATE = 0x80, }; +enum nfit_dimm_notifiers { + NFIT_NOTIFY_DIMM_HEALTH = 0x81, +}; + struct nfit_spa { struct list_head list; struct nd_region *nd_region; @@ -128,6 +132,7 @@ struct nfit_mem { struct acpi_nfit_system_address *spa_bdw; struct acpi_nfit_interleave *idt_dcr; struct acpi_nfit_interleave *idt_bdw; + struct kernfs_node *flags_attr; struct nfit_flush *nfit_flush; struct list_head list; struct acpi_device *adev; diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index d9bba5edd8dc..ce75cc3f41fb 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -263,6 +263,12 @@ const char *nvdimm_name(struct nvdimm *nvdimm) } EXPORT_SYMBOL_GPL(nvdimm_name); +struct kobject *nvdimm_kobj(struct nvdimm *nvdimm) +{ + return &nvdimm->dev.kobj; +} +EXPORT_SYMBOL_GPL(nvdimm_kobj); + unsigned long nvdimm_cmd_mask(struct nvdimm *nvdimm) { return nvdimm->cmd_mask; diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index b519e137b9b7..ad18d0531b6e 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -139,6 +139,7 @@ struct nd_blk_region *to_nd_blk_region(struct device *dev); struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); struct device *to_nvdimm_bus_dev(struct nvdimm_bus *nvdimm_bus); const char *nvdimm_name(struct nvdimm *nvdimm); +struct kobject *nvdimm_kobj(struct nvdimm *nvdimm); unsigned long nvdimm_cmd_mask(struct nvdimm *nvdimm); void *nvdimm_provider_data(struct nvdimm *nvdimm); struct nvdimm *nvdimm_create(struct nvdimm_bus 
*nvdimm_bus, void *provider_data, From 231bf117aada289023fd6f3377461ce80792e273 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 22 Aug 2016 19:23:25 -0700 Subject: [PATCH 13/39] tools/testing/nvdimm: unit test for acpi_nvdimm_notify() Trigger an nmemX/nfit/flags attribute to fire an event whenever a smart-threshold DSM is received. Reviewed-by: Vishal Verma Acked-by: Rafael J. Wysocki Signed-off-by: Dan Williams --- drivers/acpi/nfit/core.c | 10 ++++--- drivers/acpi/nfit/nfit.h | 1 + tools/testing/nvdimm/test/nfit.c | 45 ++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 4af1db1cb599..ceb6671ab355 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -1248,7 +1248,7 @@ static struct nvdimm *acpi_nfit_dimm_by_handle(struct acpi_nfit_desc *acpi_desc, return NULL; } -static void __acpi_nvdimm_notify(struct device *dev, u32 event) +void __acpi_nvdimm_notify(struct device *dev, u32 event) { struct nfit_mem *nfit_mem; struct acpi_nfit_desc *acpi_desc; @@ -1274,6 +1274,7 @@ static void __acpi_nvdimm_notify(struct device *dev, u32 event) if (nfit_mem && nfit_mem->flags_attr) sysfs_notify_dirent(nfit_mem->flags_attr); } +EXPORT_SYMBOL_GPL(__acpi_nvdimm_notify); static void acpi_nvdimm_notify(acpi_handle handle, u32 event, void *data) { @@ -1365,12 +1366,15 @@ static void shutdown_dimm_notify(void *data) * notifications. */ list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { + struct acpi_device *adev_dimm = nfit_mem->adev; + if (nfit_mem->flags_attr) { sysfs_put(nfit_mem->flags_attr); nfit_mem->flags_attr = NULL; } - acpi_remove_notify_handler(nfit_mem->adev->handle, - ACPI_DEVICE_NOTIFY, acpi_nvdimm_notify); + if (adev_dimm) + acpi_remove_notify_handler(adev_dimm->handle, + ACPI_DEVICE_NOTIFY, acpi_nvdimm_notify); } mutex_unlock(&acpi_desc->init_mutex); } diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h index 13195824778c..bb101170cd0b 100644 --- a/drivers/acpi/nfit/nfit.h +++ b/drivers/acpi/nfit/nfit.h @@ -233,5 +233,6 @@ static inline struct acpi_nfit_desc *to_acpi_desc( const u8 *to_nfit_uuid(enum nfit_uuids id); int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, void *nfit, acpi_size sz); void __acpi_nfit_notify(struct device *dev, acpi_handle handle, u32 event); +void __acpi_nvdimm_notify(struct device *dev, u32 event); void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev); #endif /* __NFIT_H__ */ diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index 8d79c75d3cae..99ea68674f0a 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -161,6 +161,7 @@ struct nfit_test { unsigned long deadline; spinlock_t lock; } ars_state; + struct device *dimm_dev[NUM_DCR]; }; static struct nfit_test *to_nfit_test(struct device *dev) @@ -430,6 +431,9 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc, break; case ND_CMD_SMART_THRESHOLD: rc = nfit_test_cmd_smart_threshold(buf, buf_len); + device_lock(&t->pdev.dev); + __acpi_nvdimm_notify(t->dimm_dev[i], 0x81); + device_unlock(&t->pdev.dev); break; default: return -ENOTTY; @@ -566,6 +570,18 @@ static int ars_state_init(struct device *dev, struct ars_state *ars_state) return 0; } +static void put_dimms(void *data) +{ + struct device **dimm_dev = data; + int i; + + for (i = 0; i < NUM_DCR; i++) + if (dimm_dev[i]) + device_unregister(dimm_dev[i]); +} + +static struct class *nfit_test_dimm; + static int 
nfit_test0_alloc(struct nfit_test *t) { size_t nfit_size = sizeof(struct acpi_nfit_system_address) * NUM_SPA @@ -621,6 +637,15 @@ static int nfit_test0_alloc(struct nfit_test *t) if (!t->_fit) return -ENOMEM; + if (devm_add_action_or_reset(&t->pdev.dev, put_dimms, t->dimm_dev)) + return -ENOMEM; + for (i = 0; i < NUM_DCR; i++) { + t->dimm_dev[i] = device_create(nfit_test_dimm, &t->pdev.dev, 0, + NULL, "test_dimm%d", i); + if (!t->dimm_dev[i]) + return -ENOMEM; + } + return ars_state_init(&t->pdev.dev, &t->ars_state); } @@ -1414,6 +1439,7 @@ static int nfit_test_probe(struct platform_device *pdev) struct acpi_nfit_desc *acpi_desc; struct device *dev = &pdev->dev; struct nfit_test *nfit_test; + struct nfit_mem *nfit_mem; union acpi_object *obj; int rc; @@ -1493,6 +1519,20 @@ static int nfit_test_probe(struct platform_device *pdev) obj->buffer.pointer = nfit_test->nfit_buf; *(nfit_test->_fit) = obj; __acpi_nfit_notify(&pdev->dev, nfit_test, 0x80); + + /* associate dimm devices with nfit_mem data for notification testing */ + mutex_lock(&acpi_desc->init_mutex); + list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { + u32 nfit_handle = __to_nfit_memdev(nfit_mem)->device_handle; + int i; + + for (i = 0; i < NUM_DCR; i++) + if (nfit_handle == handle[i]) + dev_set_drvdata(nfit_test->dimm_dev[i], + nfit_mem); + } + mutex_unlock(&acpi_desc->init_mutex); + return 0; } @@ -1526,6 +1566,10 @@ static __init int nfit_test_init(void) { int rc, i; + nfit_test_dimm = class_create(THIS_MODULE, "nfit_test_dimm"); + if (IS_ERR(nfit_test_dimm)) + return PTR_ERR(nfit_test_dimm); + nfit_test_setup(nfit_test_lookup); for (i = 0; i < NUM_NFITS; i++) { @@ -1592,6 +1636,7 @@ static __exit void nfit_test_exit(void) for (i = 0; i < NUM_NFITS; i++) platform_device_unregister(&instances[i]->pdev); nfit_test_teardown(); + class_destroy(nfit_test_dimm); } module_init(nfit_test_init); From ae551e9ca289762c63a6a139872a63dd66183209 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 31 Aug 2016 11:45:25 +0200 Subject: [PATCH 14/39] nvdimm: Spelling s/unacknoweldged/unacknowledged/ Signed-off-by: Geert Uytterhoeven Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index c5e3196c45b0..6b0449bd7720 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -294,7 +294,7 @@ static bool __nd_namespace_blk_validate(struct nd_namespace_blk *nsblk) if (strcmp(res->name, label_id.id) != 0) continue; /* - * Resources with unacknoweldged adjustments indicate a + * Resources with unacknowledged adjustments indicate a * failure to update labels */ if (res->flags & DPA_RESOURCE_ADJUSTED) From aee6598748335794dc25d7c4f16f0d4801f6b584 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 16 Aug 2016 13:08:40 -0600 Subject: [PATCH 15/39] libnvdimm: Fix nvdimm_probe error on NVDIMM-N 'ndctl list --buses --dimms' does not list any NVDIMM-Ns since they are considered idle. ndctl checks if any driver is attached to an nmem device. nvdimm_probe() always fails in nvdimm_init_nsarea() since NVDIMM-Ns do not implement the optional ND_CMD_GET_CONFIG_DATA command. Change nvdimm_probe() to accept the case that the CONFIG_DATA command is not implemented for NVDIMM-Ns. The driver attaches without ndd, which keeps it a no-op for the device.
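In outline, the probe flow after this change behaves as sketched here (a rough restatement of the dimm.c hunk that follows, not new code):

        rc = nvdimm_check_config_data(dev);
        if (rc == -ENOTTY)      /* no label area, e.g. NVDIMM-N: bind anyway, without ndd */
                return 0;
        if (rc)                 /* aliased DIMM that should expose a label area: fail probe */
                return rc;
        /* otherwise allocate ndd and read the label area as before */

With no ndd attached, nvdimm_remove() simply returns early, so the bound driver stays a no-op for such DIMMs.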
Reported-by: Brian Boylston Signed-off-by: Toshi Kani Cc: Dan Williams Tested-by: Johannes Thumshirn Acked-by: Johannes Thumshirn Signed-off-by: Dan Williams --- drivers/nvdimm/dimm.c | 11 +++++++++++ drivers/nvdimm/dimm_devs.c | 28 +++++++++++++++------------- drivers/nvdimm/nd.h | 1 + 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c index 71d12bb67339..619834e144d1 100644 --- a/drivers/nvdimm/dimm.c +++ b/drivers/nvdimm/dimm.c @@ -26,6 +26,14 @@ static int nvdimm_probe(struct device *dev) struct nvdimm_drvdata *ndd; int rc; + rc = nvdimm_check_config_data(dev); + if (rc) { + /* not required for non-aliased nvdimm, ex. NVDIMM-N */ + if (rc == -ENOTTY) + rc = 0; + return rc; + } + ndd = kzalloc(sizeof(*ndd), GFP_KERNEL); if (!ndd) return -ENOMEM; @@ -72,6 +80,9 @@ static int nvdimm_remove(struct device *dev) { struct nvdimm_drvdata *ndd = dev_get_drvdata(dev); + if (!ndd) + return 0; + nvdimm_bus_lock(dev); dev_set_drvdata(dev, NULL); nvdimm_bus_unlock(dev); diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index ce75cc3f41fb..cf36470e94c0 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -28,28 +28,30 @@ static DEFINE_IDA(dimm_ida); * Retrieve bus and dimm handle and return if this bus supports * get_config_data commands */ -static int __validate_dimm(struct nvdimm_drvdata *ndd) +int nvdimm_check_config_data(struct device *dev) { - struct nvdimm *nvdimm; + struct nvdimm *nvdimm = to_nvdimm(dev); - if (!ndd) - return -EINVAL; - - nvdimm = to_nvdimm(ndd->dev); - - if (!nvdimm->cmd_mask) - return -ENXIO; - if (!test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask)) - return -ENXIO; + if (!nvdimm->cmd_mask || + !test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask)) { + if (nvdimm->flags & NDD_ALIASING) + return -ENXIO; + else + return -ENOTTY; + } return 0; } static int validate_dimm(struct nvdimm_drvdata *ndd) { - int rc = __validate_dimm(ndd); + int rc; - if (rc && ndd) + if (!ndd) + return -EINVAL; + + rc = nvdimm_check_config_data(ndd->dev); + if (rc) dev_dbg(ndd->dev, "%pf: %s error: %d\n", __builtin_return_address(0), __func__, rc); return rc; diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 8024a0ef86d3..38d6f039234e 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -191,6 +191,7 @@ void nvdimm_exit(void); void nd_region_exit(void); struct nvdimm; struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping); +int nvdimm_check_config_data(struct device *dev); int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd); int nvdimm_init_config_data(struct nvdimm_drvdata *ndd); int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset, From 73606afd4603a2f6296cd44c4d2b385916565a58 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 12 Sep 2016 15:11:39 -0700 Subject: [PATCH 16/39] tools/testing/nvdimm: test get_config_size DSM failures Add an nfit_test specific attribute for gating whether a get_config_size DSM, or any DSM for that matter, succeeds or fails. The get_config_size DSM is initial motivation since that is the first command libnvdimm core issues to determine the state of the namespace label area. 
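For example, writing 0x10 to /sys/class/nfit_test_dimm/test_dimm0/fail_cmd (bit ND_CMD_GET_CONFIG_SIZE, assuming the standard function number 4 from ndctl.h) makes nfit_test_ctl() fail get_config_size with -EIO for that DIMM until 0 is written back; the companion 'handle' attribute reports which NFIT device handle the test device represents.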
Signed-off-by: Dan Williams --- tools/testing/nvdimm/test/nfit.c | 79 +++++++++++++++++++++++++++++++- 1 file changed, 77 insertions(+), 2 deletions(-) diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index 99ea68674f0a..175fc24f8f3a 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -132,6 +132,8 @@ static u32 handle[NUM_DCR] = { [4] = NFIT_DIMM_HANDLE(0, 1, 0, 0, 0), }; +static unsigned long dimm_fail_cmd_flags[NUM_DCR]; + struct nfit_test { struct acpi_nfit_desc acpi_desc; struct platform_device pdev; @@ -414,6 +416,9 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc, if (i >= ARRAY_SIZE(handle)) return -ENXIO; + if ((1 << func) & dimm_fail_cmd_flags[i]) + return -EIO; + switch (func) { case ND_CMD_GET_CONFIG_SIZE: rc = nfit_test_cmd_get_config_size(buf, buf_len); @@ -582,6 +587,74 @@ static void put_dimms(void *data) static struct class *nfit_test_dimm; +static int dimm_name_to_id(struct device *dev) +{ + int dimm; + + if (sscanf(dev_name(dev), "test_dimm%d", &dimm) != 1 + || dimm >= NUM_DCR || dimm < 0) + return -ENXIO; + return dimm; +} + + +static ssize_t handle_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + int dimm = dimm_name_to_id(dev); + + if (dimm < 0) + return dimm; + + return sprintf(buf, "%#x", handle[dimm]); +} +DEVICE_ATTR_RO(handle); + +static ssize_t fail_cmd_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + int dimm = dimm_name_to_id(dev); + + if (dimm < 0) + return dimm; + + return sprintf(buf, "%#lx\n", dimm_fail_cmd_flags[dimm]); +} + +static ssize_t fail_cmd_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t size) +{ + int dimm = dimm_name_to_id(dev); + unsigned long val; + ssize_t rc; + + if (dimm < 0) + return dimm; + + rc = kstrtol(buf, 0, &val); + if (rc) + return rc; + + dimm_fail_cmd_flags[dimm] = val; + return size; +} +static DEVICE_ATTR_RW(fail_cmd); + +static struct attribute *nfit_test_dimm_attributes[] = { + &dev_attr_fail_cmd.attr, + &dev_attr_handle.attr, + NULL, +}; + +static struct attribute_group nfit_test_dimm_attribute_group = { + .attrs = nfit_test_dimm_attributes, +}; + +static const struct attribute_group *nfit_test_dimm_attribute_groups[] = { + &nfit_test_dimm_attribute_group, + NULL, +}; + static int nfit_test0_alloc(struct nfit_test *t) { size_t nfit_size = sizeof(struct acpi_nfit_system_address) * NUM_SPA @@ -640,8 +713,10 @@ static int nfit_test0_alloc(struct nfit_test *t) if (devm_add_action_or_reset(&t->pdev.dev, put_dimms, t->dimm_dev)) return -ENOMEM; for (i = 0; i < NUM_DCR; i++) { - t->dimm_dev[i] = device_create(nfit_test_dimm, &t->pdev.dev, 0, - NULL, "test_dimm%d", i); + t->dimm_dev[i] = device_create_with_groups(nfit_test_dimm, + &t->pdev.dev, 0, NULL, + nfit_test_dimm_attribute_groups, + "test_dimm%d", i); if (!t->dimm_dev[i]) return -ENOMEM; } From 4765218db79561ce13922806716eadb138c65439 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 15 Sep 2016 18:08:05 -0700 Subject: [PATCH 17/39] libnvdimm, namespace: debug invalid interleave-set-cookie values If platform firmware fails to populate unique / non-zero serial number data for each nvdimm in an interleave-set it may cause pmem region initialization to fail. Add a debug message for this case. 
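Note that dev_dbg() output is compiled out unless DEBUG or CONFIG_DYNAMIC_DEBUG is set; with dynamic debug the new message can be enabled at runtime by writing 'file namespace_devs.c +p' to /sys/kernel/debug/dynamic_debug/control (assuming debugfs is mounted in its usual location).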
Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 6b0449bd7720..4f0a21308417 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -1584,8 +1584,10 @@ static int find_pmem_label_set(struct nd_region *nd_region, int rc = -ENODEV, l; u16 i; - if (cookie == 0) + if (cookie == 0) { + dev_dbg(&nd_region->dev, "invalid interleave-set-cookie\n"); return -ENXIO; + } /* * Find a complete set of labels by uuid. By definition we can start From a0056afe21fdf79d1fad2b8fb14868cd710d400f Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 21 Sep 2016 15:28:29 -0700 Subject: [PATCH 18/39] nvdimm: remove duplicate nd_mapping declaration Signed-off-by: Dave Jiang Signed-off-by: Dan Williams --- drivers/nvdimm/nd-core.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 38ce6bbbc170..1414784c6c2b 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -79,7 +79,6 @@ resource_size_t nd_blk_available_dpa(struct nd_mapping *nd_mapping); resource_size_t nd_region_available_dpa(struct nd_region *nd_region); resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd, struct nd_label_id *label_id); -struct nd_mapping; struct resource *nsblk_add_resource(struct nd_region *nd_region, struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk, resource_size_t start); From 9ffd6350a103cb9e73e3abb4573c900cfead2f9b Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Fri, 30 Sep 2016 17:19:29 -0600 Subject: [PATCH 19/39] nfit: don't start a full scrub by default for an MCE Starting a full Address Range Scrub (ARS) on hitting a memory error machine check exception may not always be desirable. Provide a way through sysfs to toggle the behavior between just adding the address (cache line) where the MCE happened to the poison list and doing a full scrub. The former (selective insertion of the address) is done unconditionally. Cc: linux-acpi@vger.kernel.org Cc: Linda Knippers Cc: Rafael J. Wysocki Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/acpi/nfit/core.c | 53 ++++++++++++++++++++++++++++++++++++++++ drivers/acpi/nfit/mce.c | 24 ++++++++++++++---- drivers/acpi/nfit/nfit.h | 6 +++++ 3 files changed, 78 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index ceb6671ab355..02838f928d7e 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -878,6 +878,58 @@ static ssize_t revision_show(struct device *dev, } static DEVICE_ATTR_RO(revision); +static ssize_t hw_error_scrub_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); + struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); + + return sprintf(buf, "%d\n", acpi_desc->scrub_mode); +} + +/* + * The 'hw_error_scrub' attribute can have the following values written to it: + * '0': Switch to the default mode where an exception will only insert + * the address of the memory error into the poison and badblocks lists. + * '1': Enable a full scrub to happen if an exception for a memory error is + * received. 
+ */ +static ssize_t hw_error_scrub_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t size) +{ + struct nvdimm_bus_descriptor *nd_desc; + ssize_t rc; + long val; + + rc = kstrtol(buf, 0, &val); + if (rc) + return rc; + + device_lock(dev); + nd_desc = dev_get_drvdata(dev); + if (nd_desc) { + struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); + + switch (val) { + case HW_ERROR_SCRUB_ON: + acpi_desc->scrub_mode = HW_ERROR_SCRUB_ON; + break; + case HW_ERROR_SCRUB_OFF: + acpi_desc->scrub_mode = HW_ERROR_SCRUB_OFF; + break; + default: + rc = -EINVAL; + break; + } + } + device_unlock(dev); + if (rc) + return rc; + return size; +} +static DEVICE_ATTR_RW(hw_error_scrub); + /* * This shows the number of full Address Range Scrubs that have been * completed since driver load time. Userspace can wait on this using @@ -950,6 +1002,7 @@ static umode_t nfit_visible(struct kobject *kobj, struct attribute *a, int n) static struct attribute *acpi_nfit_attributes[] = { &dev_attr_revision.attr, &dev_attr_scrub.attr, + &dev_attr_hw_error_scrub.attr, NULL, }; diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c index 4c745bf389fe..2e25e252945f 100644 --- a/drivers/acpi/nfit/mce.c +++ b/drivers/acpi/nfit/mce.c @@ -14,6 +14,7 @@ */ #include #include +#include #include #include "nfit.h" @@ -62,12 +63,25 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val, } mutex_unlock(&acpi_desc->init_mutex); - /* - * We can ignore an -EBUSY here because if an ARS is already - * in progress, just let that be the last authoritative one - */ - if (found_match) + if (!found_match) + continue; + + /* If this fails due to an -ENOMEM, there is little we can do */ + nvdimm_bus_add_poison(acpi_desc->nvdimm_bus, + ALIGN(mce->addr, L1_CACHE_BYTES), + L1_CACHE_BYTES); + nvdimm_region_notify(nfit_spa->nd_region, + NVDIMM_REVALIDATE_POISON); + + if (acpi_desc->scrub_mode == HW_ERROR_SCRUB_ON) { + /* + * We can ignore an -EBUSY here because if an ARS is + * already in progress, just let that be the last + * authoritative one + */ acpi_nfit_ars_rescan(acpi_desc); + } + break; } mutex_unlock(&acpi_desc_lock); diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h index bb101170cd0b..14296f5267c8 100644 --- a/drivers/acpi/nfit/nfit.h +++ b/drivers/acpi/nfit/nfit.h @@ -161,6 +161,7 @@ struct acpi_nfit_desc { struct list_head list; struct kernfs_node *scrub_count_state; unsigned int scrub_count; + unsigned int scrub_mode; unsigned int cancel:1; unsigned long dimm_cmd_force_en; unsigned long bus_cmd_force_en; @@ -168,6 +169,11 @@ struct acpi_nfit_desc { void *iobuf, u64 len, int rw); }; +enum scrub_mode { + HW_ERROR_SCRUB_OFF, + HW_ERROR_SCRUB_ON, +}; + enum nd_blk_mmio_selector { BDW, DCR, From bd697a80c329072b991475fa6608bb0e665b3d90 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Fri, 30 Sep 2016 17:19:30 -0600 Subject: [PATCH 20/39] pmem: reduce kmap_atomic sections to the memcpys only pmem_do_bvec used to kmap_atomic at the begin, and only unmap at the end. Things like nvdimm_clear_poison may want to do nvdimm subsystem bookkeeping operations that may involve taking locks or doing memory allocations, and we can't do that from the atomic context. Reduce the atomic context to just what needs it - the memcpy to/from pmem. 
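The write path of pmem_do_bvec() after the change looks roughly like this (a sketch of the hunk below, with the atomic mapping confined to the new helpers):

        flush_dcache_page(page);
        write_pmem(pmem_addr, page, off, len);          /* kmap_atomic only inside */
        if (unlikely(bad_pmem)) {
                pmem_clear_poison(pmem, pmem_off, len); /* free to take locks / allocate */
                write_pmem(pmem_addr, page, off, len);
        }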
Cc: Ross Zwisler Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/nvdimm/pmem.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 571a6c7ee2fc..42b3a8217073 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -66,13 +66,32 @@ static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset, invalidate_pmem(pmem->virt_addr + offset, len); } +static void write_pmem(void *pmem_addr, struct page *page, + unsigned int off, unsigned int len) +{ + void *mem = kmap_atomic(page); + + memcpy_to_pmem(pmem_addr, mem + off, len); + kunmap_atomic(mem); +} + +static int read_pmem(struct page *page, unsigned int off, + void *pmem_addr, unsigned int len) +{ + int rc; + void *mem = kmap_atomic(page); + + rc = memcpy_from_pmem(mem + off, pmem_addr, len); + kunmap_atomic(mem); + return rc; +} + static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, unsigned int len, unsigned int off, bool is_write, sector_t sector) { int rc = 0; bool bad_pmem = false; - void *mem = kmap_atomic(page); phys_addr_t pmem_off = sector * 512 + pmem->data_offset; void *pmem_addr = pmem->virt_addr + pmem_off; @@ -83,7 +102,7 @@ static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, if (unlikely(bad_pmem)) rc = -EIO; else { - rc = memcpy_from_pmem(mem + off, pmem_addr, len); + rc = read_pmem(page, off, pmem_addr, len); flush_dcache_page(page); } } else { @@ -102,14 +121,13 @@ static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, * after clear poison. */ flush_dcache_page(page); - memcpy_to_pmem(pmem_addr, mem + off, len); + write_pmem(pmem_addr, page, off, len); if (unlikely(bad_pmem)) { pmem_clear_poison(pmem, pmem_off, len); - memcpy_to_pmem(pmem_addr, mem + off, len); + write_pmem(pmem_addr, page, off, len); } } - kunmap_atomic(mem); return rc; } From e046114af5fcafe8d6d3f0b6ccb99804bad34bfb Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Fri, 30 Sep 2016 17:19:31 -0600 Subject: [PATCH 21/39] libnvdimm: clear the internal poison_list when clearing badblocks nvdimm_clear_poison cleared the user-visible badblocks, and sent commands to the NVDIMM to clear the areas marked as 'poison', but it neglected to clear the same areas from the internal poison_list which is used to marshal ARS results before sorting them by namespace. As a result, once on-demand ARS functionality was added: 37b137f nfit, libnvdimm: allow an ARS scrub to be triggered on demand A scrub triggered from either sysfs or an MCE was found to be adding stale entries that had been cleared from gendisk->badblocks, but were still present in nvdimm_bus->poison_list. Additionally, the stale entries could be triggered into producing stale disk->badblocks by simply disabling and re-enabling the namespace or region. This adds the missing step of clearing poison_list entries when clearing poison, so that it is always in sync with badblocks. 
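As a worked example of the interval handling implemented below: given an existing poison_list entry covering [0x1000, 0x1fff], clearing [0x0800, 0x27ff] deletes the entry outright, clearing [0x1000, 0x13ff] trims it to [0x1400, 0x1fff], clearing [0x1c00, 0x1fff] trims it to [0x1000, 0x1bff], and clearing [0x1400, 0x17ff] splits it into [0x1000, 0x13ff] and [0x1800, 0x1fff].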
Fixes: 37b137f ("nfit, libnvdimm: allow an ARS scrub to be triggered on demand") Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/nvdimm/bus.c | 2 ++ drivers/nvdimm/core.c | 73 ++++++++++++++++++++++++++++++++++++--- include/linux/libnvdimm.h | 2 ++ 3 files changed, 73 insertions(+), 4 deletions(-) diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 458daf927336..8493a2559daa 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -213,6 +213,8 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, return rc; if (cmd_rc < 0) return cmd_rc; + + nvdimm_clear_from_poison_list(nvdimm_bus, phys, len); return clear_err.cleared; } EXPORT_SYMBOL_GPL(nvdimm_clear_poison); diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 715583f69d28..42e40db4651b 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -541,11 +541,12 @@ void nvdimm_badblocks_populate(struct nd_region *nd_region, } EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate); -static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) +static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length, + gfp_t flags) { struct nd_poison *pl; - pl = kzalloc(sizeof(*pl), GFP_KERNEL); + pl = kzalloc(sizeof(*pl), flags); if (!pl) return -ENOMEM; @@ -561,7 +562,7 @@ static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) struct nd_poison *pl; if (list_empty(&nvdimm_bus->poison_list)) - return add_poison(nvdimm_bus, addr, length); + return add_poison(nvdimm_bus, addr, length, GFP_KERNEL); /* * There is a chance this is a duplicate, check for those first. @@ -581,7 +582,7 @@ static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) * as any overlapping ranges will get resolved when the list is consumed * and converted to badblocks */ - return add_poison(nvdimm_bus, addr, length); + return add_poison(nvdimm_bus, addr, length, GFP_KERNEL); } int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) @@ -596,6 +597,70 @@ int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) } EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison); +void nvdimm_clear_from_poison_list(struct nvdimm_bus *nvdimm_bus, + phys_addr_t start, unsigned int len) +{ + struct list_head *poison_list = &nvdimm_bus->poison_list; + u64 clr_end = start + len - 1; + struct nd_poison *pl, *next; + + nvdimm_bus_lock(&nvdimm_bus->dev); + WARN_ON_ONCE(list_empty(poison_list)); + + /* + * [start, clr_end] is the poison interval being cleared. + * [pl->start, pl_end] is the poison_list entry we're comparing + * the above interval against. 
The poison list entry may need + * to be modified (update either start or length), deleted, or + * split into two based on the overlap characteristics + */ + + list_for_each_entry_safe(pl, next, poison_list, list) { + u64 pl_end = pl->start + pl->length - 1; + + /* Skip intervals with no intersection */ + if (pl_end < start) + continue; + if (pl->start > clr_end) + continue; + /* Delete completely overlapped poison entries */ + if ((pl->start >= start) && (pl_end <= clr_end)) { + list_del(&pl->list); + kfree(pl); + continue; + } + /* Adjust start point of partially cleared entries */ + if ((start <= pl->start) && (clr_end > pl->start)) { + pl->length -= clr_end - pl->start + 1; + pl->start = clr_end + 1; + continue; + } + /* Adjust pl->length for partial clearing at the tail end */ + if ((pl->start < start) && (pl_end <= clr_end)) { + /* pl->start remains the same */ + pl->length = start - pl->start; + continue; + } + /* + * If clearing in the middle of an entry, we split it into + * two by modifying the current entry to represent one half of + * the split, and adding a new entry for the second half. + */ + if ((pl->start < start) && (pl_end > clr_end)) { + u64 new_start = clr_end + 1; + u64 new_len = pl_end - new_start + 1; + + /* Add new entry covering the right half */ + add_poison(nvdimm_bus, new_start, new_len, GFP_NOIO); + /* Adjust this entry to cover the left half */ + pl->length = start - pl->start; + continue; + } + } + nvdimm_bus_unlock(&nvdimm_bus->dev); +} +EXPORT_SYMBOL_GPL(nvdimm_clear_from_poison_list); + #ifdef CONFIG_BLK_DEV_INTEGRITY int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) { diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index ad18d0531b6e..4a5f8c51f2a5 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -129,6 +129,8 @@ static inline struct nd_blk_region_desc *to_blk_region_desc( } int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length); +void nvdimm_clear_from_poison_list(struct nvdimm_bus *nvdimm_bus, + phys_addr_t start, unsigned int len); struct nvdimm_bus *nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc); void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); From db58028ee4e360430de8e3b48f657dc798ee6591 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 26 Sep 2016 11:06:50 -0700 Subject: [PATCH 22/39] nvdimm: reduce duplicated wpq flushes The existing implementation writes to all the flush hint addresses for a given ND region. This is not necessary as the flushes are per iMC and not per DIMM. Search the mappings and clear out the duplicates at init to avoid multiple flushes to the same iMC. Signed-off-by: Dave Jiang Signed-off-by: Dan Williams --- drivers/nvdimm/region_devs.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 4c0ac4abb629..f9d58c2b5341 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -70,7 +70,7 @@ static int nvdimm_map_flush(struct device *dev, struct nvdimm *nvdimm, int dimm, int nd_region_activate(struct nd_region *nd_region) { - int i, num_flush = 0; + int i, j, num_flush = 0; struct nd_region_data *ndrd; struct device *dev = &nd_region->dev; size_t flush_data_size = sizeof(void *); @@ -107,6 +107,21 @@ int nd_region_activate(struct nd_region *nd_region) return rc; } + /* + * Clear out entries that are duplicates. This should prevent the + * extra flushings.
+ */ + for (i = 0; i < nd_region->ndr_mappings - 1; i++) { + /* ignore if NULL already */ + if (!ndrd_get_flush_wpq(ndrd, i, 0)) + continue; + + for (j = i + 1; j < nd_region->ndr_mappings; j++) + if (ndrd_get_flush_wpq(ndrd, i, 0) == + ndrd_get_flush_wpq(ndrd, j, 0)) + ndrd_set_flush_wpq(ndrd, j, 0, NULL); + } + return 0; } From 44c462eb9e19dfa089b454271dd2dff5eaf1ad6d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 19 Sep 2016 16:38:50 -0700 Subject: [PATCH 23/39] libnvdimm, region: move region-mapping input-parameters to nd_mapping_desc Before we add more libnvdimm-private fields to nd_mapping, make it clear which parameters are input vs libnvdimm internals. Use struct nd_mapping_desc instead of struct nd_mapping in nd_region_desc and make struct nd_mapping private to libnvdimm. Signed-off-by: Dan Williams --- drivers/acpi/nfit/core.c | 30 +++++++++++++++--------------- drivers/nvdimm/nd.h | 14 ++++++++++++++ drivers/nvdimm/region_devs.c | 16 +++++++++------- include/linux/libnvdimm.h | 25 +++++++------------------ 4 files changed, 45 insertions(+), 40 deletions(-) diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 02838f928d7e..6490a15abdd3 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -1627,9 +1627,9 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc, if (!info) return -ENOMEM; for (i = 0; i < nr; i++) { - struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i]; + struct nd_mapping_desc *mapping = &ndr_desc->mapping[i]; struct nfit_set_info_map *map = &info->mapping[i]; - struct nvdimm *nvdimm = nd_mapping->nvdimm; + struct nvdimm *nvdimm = mapping->nvdimm; struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); struct acpi_nfit_memory_map *memdev = memdev_from_spa(acpi_desc, spa->range_index, i); @@ -2053,7 +2053,7 @@ static int acpi_nfit_insert_resource(struct acpi_nfit_desc *acpi_desc, } static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, - struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc, + struct nd_mapping_desc *mapping, struct nd_region_desc *ndr_desc, struct acpi_nfit_memory_map *memdev, struct nfit_spa *nfit_spa) { @@ -2070,12 +2070,12 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, return -ENODEV; } - nd_mapping->nvdimm = nvdimm; + mapping->nvdimm = nvdimm; switch (nfit_spa_type(spa)) { case NFIT_SPA_PM: case NFIT_SPA_VOLATILE: - nd_mapping->start = memdev->address; - nd_mapping->size = memdev->region_size; + mapping->start = memdev->address; + mapping->size = memdev->region_size; break; case NFIT_SPA_DCR: nfit_mem = nvdimm_provider_data(nvdimm); @@ -2083,13 +2083,13 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, dev_dbg(acpi_desc->dev, "spa%d %s missing bdw\n", spa->range_index, nvdimm_name(nvdimm)); } else { - nd_mapping->size = nfit_mem->bdw->capacity; - nd_mapping->start = nfit_mem->bdw->start_address; + mapping->size = nfit_mem->bdw->capacity; + mapping->start = nfit_mem->bdw->start_address; ndr_desc->num_lanes = nfit_mem->bdw->windows; blk_valid = 1; } - ndr_desc->nd_mapping = nd_mapping; + ndr_desc->mapping = mapping; ndr_desc->num_mappings = blk_valid; ndbr_desc = to_blk_region_desc(ndr_desc); ndbr_desc->enable = acpi_nfit_blk_region_enable; @@ -2115,7 +2115,7 @@ static bool nfit_spa_is_virtual(struct acpi_nfit_system_address *spa) static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_spa) { - static struct nd_mapping nd_mappings[ND_MAX_MAPPINGS]; + static struct
nd_mapping_desc mappings[ND_MAX_MAPPINGS]; struct acpi_nfit_system_address *spa = nfit_spa->spa; struct nd_blk_region_desc ndbr_desc; struct nd_region_desc *ndr_desc; @@ -2134,7 +2134,7 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, } memset(&res, 0, sizeof(res)); - memset(&nd_mappings, 0, sizeof(nd_mappings)); + memset(&mappings, 0, sizeof(mappings)); memset(&ndbr_desc, 0, sizeof(ndbr_desc)); res.start = spa->address; res.end = res.start + spa->length - 1; @@ -2150,7 +2150,7 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev; - struct nd_mapping *nd_mapping; + struct nd_mapping_desc *mapping; if (memdev->range_index != spa->range_index) continue; @@ -2159,14 +2159,14 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, spa->range_index, ND_MAX_MAPPINGS); return -ENXIO; } - nd_mapping = &nd_mappings[count++]; - rc = acpi_nfit_init_mapping(acpi_desc, nd_mapping, ndr_desc, + mapping = &mappings[count++]; + rc = acpi_nfit_init_mapping(acpi_desc, mapping, ndr_desc, memdev, nfit_spa); if (rc) goto out; } - ndr_desc->nd_mapping = nd_mappings; + ndr_desc->mapping = mappings; ndr_desc->num_mappings = count; rc = acpi_nfit_init_interleave_set(acpi_desc, ndr_desc, spa); if (rc) diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 38d6f039234e..e58c40824e1f 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -98,6 +98,20 @@ struct nd_percpu_lane { spinlock_t lock; }; +struct nd_mapping { + struct nvdimm *nvdimm; + struct nd_namespace_label **labels; + u64 start; + u64 size; + /* + * @ndd is for private use at region enable / disable time for + * get_ndd() + put_ndd(), all other nd_mapping to ndd + * conversions use to_ndd() which respects enabled state of the + * nvdimm. 
+ */ + struct nvdimm_drvdata *ndd; +}; + struct nd_region { struct device dev; struct ida ns_ida; diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index e8d5ba7b29af..0ff43cbb15e3 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -755,10 +755,10 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, int ro = 0; for (i = 0; i < ndr_desc->num_mappings; i++) { - struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i]; - struct nvdimm *nvdimm = nd_mapping->nvdimm; + struct nd_mapping_desc *mapping = &ndr_desc->mapping[i]; + struct nvdimm *nvdimm = mapping->nvdimm; - if ((nd_mapping->start | nd_mapping->size) % SZ_4K) { + if ((mapping->start | mapping->size) % SZ_4K) { dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not 4K aligned\n", caller, dev_name(&nvdimm->dev), i); @@ -809,11 +809,13 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, ndl->count = 0; } - memcpy(nd_region->mapping, ndr_desc->nd_mapping, - sizeof(struct nd_mapping) * ndr_desc->num_mappings); for (i = 0; i < ndr_desc->num_mappings; i++) { - struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i]; - struct nvdimm *nvdimm = nd_mapping->nvdimm; + struct nd_mapping_desc *mapping = &ndr_desc->mapping[i]; + struct nvdimm *nvdimm = mapping->nvdimm; + + nd_region->mapping[i].nvdimm = nvdimm; + nd_region->mapping[i].start = mapping->start; + nd_region->mapping[i].size = mapping->size; get_device(&nvdimm->dev); } diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 4a5f8c51f2a5..f4947fda11e7 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -50,23 +50,6 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf, unsigned int buf_len, int *cmd_rc); -struct nd_namespace_label; -struct nvdimm_drvdata; - -struct nd_mapping { - struct nvdimm *nvdimm; - struct nd_namespace_label **labels; - u64 start; - u64 size; - /* - * @ndd is for private use at region enable / disable time for - * get_ndd() + put_ndd(), all other nd_mapping to ndd - * conversions use to_ndd() which respects enabled state of the - * nvdimm. - */ - struct nvdimm_drvdata *ndd; -}; - struct nvdimm_bus_descriptor { const struct attribute_group **attr_groups; unsigned long cmd_mask; @@ -89,9 +72,15 @@ struct nd_interleave_set { u64 cookie; }; +struct nd_mapping_desc { + struct nvdimm *nvdimm; + u64 start; + u64 size; +}; + struct nd_region_desc { struct resource *res; - struct nd_mapping *nd_mapping; + struct nd_mapping_desc *mapping; u16 num_mappings; const struct attribute_group **attr_groups; struct nd_interleave_set *nd_set; From ae8219f186d8e98a3239afc6ea49bb46f2871d2f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 19 Sep 2016 16:04:21 -0700 Subject: [PATCH 24/39] libnvdimm, label: convert label tracking to a linked list In preparation for enabling multiple namespaces per pmem region, convert the label tracking to use a linked list. In particular this will allow select_pmem_id() to move labels from the unvalidated state to the validated state. Currently we only track one validated set per-region. 
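The new tracking structure (added to nd.h later in this patch) is a small list node, and the label list itself is protected by a new per-mapping mutex:

        struct nd_label_ent {
                struct list_head list;
                struct nd_namespace_label *label;
        };

so nd_mapping->labels becomes a list_head and every walk of it takes nd_mapping->lock.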
Signed-off-by: Dan Williams --- drivers/nvdimm/label.c | 136 ++++++++++++++----------- drivers/nvdimm/namespace_devs.c | 173 ++++++++++++++++++++++---------- drivers/nvdimm/nd-core.h | 1 + drivers/nvdimm/nd.h | 16 ++- drivers/nvdimm/region_devs.c | 19 +++- 5 files changed, 225 insertions(+), 120 deletions(-) diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c index 96526dcfdd37..c37357210428 100644 --- a/drivers/nvdimm/label.c +++ b/drivers/nvdimm/label.c @@ -499,6 +499,7 @@ static int __pmem_label_update(struct nd_region *nd_region, struct nd_namespace_label *victim_label; struct nd_namespace_label *nd_label; struct nd_namespace_index *nsindex; + struct nd_label_ent *label_ent; unsigned long *free; u32 nslot, slot; size_t offset; @@ -536,8 +537,13 @@ static int __pmem_label_update(struct nd_region *nd_region, return rc; /* Garbage collect the previous label */ - victim_label = nd_mapping->labels[0]; + mutex_lock(&nd_mapping->lock); + label_ent = list_first_entry_or_null(&nd_mapping->labels, + typeof(*label_ent), list); + WARN_ON(!label_ent); + victim_label = label_ent ? label_ent->label : NULL; if (victim_label) { + label_ent->label = NULL; slot = to_slot(ndd, victim_label); nd_label_free_slot(ndd, slot); dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot); @@ -546,28 +552,11 @@ static int __pmem_label_update(struct nd_region *nd_region, /* update index */ rc = nd_label_write_index(ndd, ndd->ns_next, nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0); - if (rc < 0) - return rc; + if (rc == 0 && label_ent) + label_ent->label = nd_label; + mutex_unlock(&nd_mapping->lock); - nd_mapping->labels[0] = nd_label; - - return 0; -} - -static void del_label(struct nd_mapping *nd_mapping, int l) -{ - struct nd_namespace_label *next_label, *nd_label; - struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); - unsigned int slot; - int j; - - nd_label = nd_mapping->labels[l]; - slot = to_slot(ndd, nd_label); - dev_vdbg(ndd->dev, "%s: clear: %d\n", __func__, slot); - - for (j = l; (next_label = nd_mapping->labels[j + 1]); j++) - nd_mapping->labels[j] = next_label; - nd_mapping->labels[j] = NULL; + return rc; } static bool is_old_resource(struct resource *res, struct resource **list, int n) @@ -607,14 +596,16 @@ static int __blk_label_update(struct nd_region *nd_region, struct nd_mapping *nd_mapping, struct nd_namespace_blk *nsblk, int num_labels) { - int i, l, alloc, victims, nfree, old_num_resources, nlabel, rc = -ENXIO; + int i, alloc, victims, nfree, old_num_resources, nlabel, rc = -ENXIO; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); struct nd_namespace_label *nd_label; + struct nd_label_ent *label_ent, *e; struct nd_namespace_index *nsindex; unsigned long *free, *victim_map = NULL; struct resource *res, **old_res_list; struct nd_label_id label_id; u8 uuid[NSLABEL_UUID_LEN]; + LIST_HEAD(list); u32 nslot, slot; if (!preamble_next(ndd, &nsindex, &free, &nslot)) @@ -736,15 +727,22 @@ static int __blk_label_update(struct nd_region *nd_region, * entries in nd_mapping->labels */ nlabel = 0; - for_each_label(l, nd_label, nd_mapping->labels) { + mutex_lock(&nd_mapping->lock); + list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) { + nd_label = label_ent->label; + if (!nd_label) + continue; nlabel++; memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN); if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0) continue; nlabel--; - del_label(nd_mapping, l); - l--; /* retry with the new label at this index */ + list_move(&label_ent->list, &list); + label_ent->label = NULL; } + list_splice_tail_init(&list, 
&nd_mapping->labels); + mutex_unlock(&nd_mapping->lock); + if (nlabel + nsblk->num_resources > num_labels) { /* * Bug, we can't end up with more resources than @@ -755,6 +753,15 @@ static int __blk_label_update(struct nd_region *nd_region, goto out; } + mutex_lock(&nd_mapping->lock); + label_ent = list_first_entry_or_null(&nd_mapping->labels, + typeof(*label_ent), list); + if (!label_ent) { + WARN_ON(1); + mutex_unlock(&nd_mapping->lock); + rc = -ENXIO; + goto out; + } for_each_clear_bit_le(slot, free, nslot) { nd_label = nd_label_base(ndd) + slot; memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN); @@ -762,11 +769,19 @@ static int __blk_label_update(struct nd_region *nd_region, continue; res = to_resource(ndd, nd_label); res->flags &= ~DPA_RESOURCE_ADJUSTED; - dev_vdbg(&nsblk->common.dev, "assign label[%d] slot: %d\n", - l, slot); - nd_mapping->labels[l++] = nd_label; + dev_vdbg(&nsblk->common.dev, "assign label slot: %d\n", slot); + list_for_each_entry_from(label_ent, &nd_mapping->labels, list) { + if (label_ent->label) + continue; + label_ent->label = nd_label; + nd_label = NULL; + break; + } + if (nd_label) + dev_WARN(&nsblk->common.dev, + "failed to track label slot%d\n", slot); } - nd_mapping->labels[l] = NULL; + mutex_unlock(&nd_mapping->lock); out: kfree(old_res_list); @@ -788,32 +803,28 @@ static int __blk_label_update(struct nd_region *nd_region, static int init_labels(struct nd_mapping *nd_mapping, int num_labels) { - int i, l, old_num_labels = 0; + int i, old_num_labels = 0; + struct nd_label_ent *label_ent; struct nd_namespace_index *nsindex; - struct nd_namespace_label *nd_label; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); - size_t size = (num_labels + 1) * sizeof(struct nd_namespace_label *); - for_each_label(l, nd_label, nd_mapping->labels) + mutex_lock(&nd_mapping->lock); + list_for_each_entry(label_ent, &nd_mapping->labels, list) old_num_labels++; + mutex_unlock(&nd_mapping->lock); /* * We need to preserve all the old labels for the mapping so * they can be garbage collected after writing the new labels. 
*/ - if (num_labels > old_num_labels) { - struct nd_namespace_label **labels; - - labels = krealloc(nd_mapping->labels, size, GFP_KERNEL); - if (!labels) + for (i = old_num_labels; i < num_labels; i++) { + label_ent = kzalloc(sizeof(*label_ent), GFP_KERNEL); + if (!label_ent) return -ENOMEM; - nd_mapping->labels = labels; + mutex_lock(&nd_mapping->lock); + list_add_tail(&label_ent->list, &nd_mapping->labels); + mutex_unlock(&nd_mapping->lock); } - if (!nd_mapping->labels) - return -ENOMEM; - - for (i = old_num_labels; i <= num_labels; i++) - nd_mapping->labels[i] = NULL; if (ndd->ns_current == -1 || ndd->ns_next == -1) /* pass */; @@ -837,42 +848,45 @@ static int init_labels(struct nd_mapping *nd_mapping, int num_labels) static int del_labels(struct nd_mapping *nd_mapping, u8 *uuid) { struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); - struct nd_namespace_label *nd_label; + struct nd_label_ent *label_ent, *e; struct nd_namespace_index *nsindex; u8 label_uuid[NSLABEL_UUID_LEN]; - int l, num_freed = 0; unsigned long *free; + LIST_HEAD(list); u32 nslot, slot; + int active = 0; if (!uuid) return 0; /* no index || no labels == nothing to delete */ - if (!preamble_next(ndd, &nsindex, &free, &nslot) - || !nd_mapping->labels) + if (!preamble_next(ndd, &nsindex, &free, &nslot)) return 0; - for_each_label(l, nd_label, nd_mapping->labels) { + mutex_lock(&nd_mapping->lock); + list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) { + struct nd_namespace_label *nd_label = label_ent->label; + + if (!nd_label) + continue; + active++; memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN); if (memcmp(label_uuid, uuid, NSLABEL_UUID_LEN) != 0) continue; + active--; slot = to_slot(ndd, nd_label); nd_label_free_slot(ndd, slot); dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot); - del_label(nd_mapping, l); - num_freed++; - l--; /* retry with new label at this index */ + list_move_tail(&label_ent->list, &list); + label_ent->label = NULL; } + list_splice_tail_init(&list, &nd_mapping->labels); - if (num_freed > l) { - /* - * num_freed will only ever be > l when we delete the last - * label - */ - kfree(nd_mapping->labels); - nd_mapping->labels = NULL; - dev_dbg(ndd->dev, "%s: no more labels\n", __func__); + if (active == 0) { + nd_mapping_free_labels(nd_mapping); + dev_dbg(ndd->dev, "%s: no more active labels\n", __func__); } + mutex_unlock(&nd_mapping->lock); return nd_label_write_index(ndd, ndd->ns_next, nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0); diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 4f0a21308417..9f4188c78120 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "nd-core.h" #include "nd.h" @@ -1089,7 +1090,7 @@ static int namespace_update_uuid(struct nd_region *nd_region, * * FIXME: can we delete uuid with zero dpa allocated? 
*/ - if (nd_mapping->labels) + if (list_empty(&nd_mapping->labels)) return -EBUSY; } @@ -1491,14 +1492,19 @@ static bool has_uuid_at_pos(struct nd_region *nd_region, u8 *uuid, for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - struct nd_namespace_label *nd_label; + struct nd_label_ent *label_ent; bool found_uuid = false; - int l; - for_each_label(l, nd_label, nd_mapping->labels) { - u64 isetcookie = __le64_to_cpu(nd_label->isetcookie); - u16 position = __le16_to_cpu(nd_label->position); - u16 nlabel = __le16_to_cpu(nd_label->nlabel); + list_for_each_entry(label_ent, &nd_mapping->labels, list) { + struct nd_namespace_label *nd_label = label_ent->label; + u16 position, nlabel; + u64 isetcookie; + + if (!nd_label) + continue; + isetcookie = __le64_to_cpu(nd_label->isetcookie); + position = __le16_to_cpu(nd_label->position); + nlabel = __le16_to_cpu(nd_label->nlabel); if (isetcookie != cookie) continue; @@ -1528,7 +1534,6 @@ static bool has_uuid_at_pos(struct nd_region *nd_region, u8 *uuid, static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id) { - struct nd_namespace_label *select = NULL; int i; if (!pmem_id) @@ -1536,35 +1541,47 @@ static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id) for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - struct nd_namespace_label *nd_label; + struct nd_namespace_label *nd_label = NULL; u64 hw_start, hw_end, pmem_start, pmem_end; - int l; + struct nd_label_ent *label_ent; - for_each_label(l, nd_label, nd_mapping->labels) + mutex_lock(&nd_mapping->lock); + list_for_each_entry(label_ent, &nd_mapping->labels, list) { + nd_label = label_ent->label; + if (!nd_label) + continue; if (memcmp(nd_label->uuid, pmem_id, NSLABEL_UUID_LEN) == 0) break; + nd_label = NULL; + } + mutex_unlock(&nd_mapping->lock); if (!nd_label) { WARN_ON(1); return -EINVAL; } - select = nd_label; /* * Check that this label is compliant with the dpa * range published in NFIT */ hw_start = nd_mapping->start; hw_end = hw_start + nd_mapping->size; - pmem_start = __le64_to_cpu(select->dpa); - pmem_end = pmem_start + __le64_to_cpu(select->rawsize); + pmem_start = __le64_to_cpu(nd_label->dpa); + pmem_end = pmem_start + __le64_to_cpu(nd_label->rawsize); if (pmem_start == hw_start && pmem_end <= hw_end) /* pass */; else return -EINVAL; - nd_mapping->labels[0] = select; - nd_mapping->labels[1] = NULL; + mutex_lock(&nd_mapping->lock); + label_ent = list_first_entry(&nd_mapping->labels, + typeof(*label_ent), list); + label_ent->label = nd_label; + list_del(&label_ent->list); + nd_mapping_free_labels(nd_mapping); + list_add(&label_ent->list, &nd_mapping->labels); + mutex_unlock(&nd_mapping->lock); } return 0; } @@ -1577,11 +1594,12 @@ static int find_pmem_label_set(struct nd_region *nd_region, struct nd_namespace_pmem *nspm) { u64 cookie = nd_region_interleave_set_cookie(nd_region); - struct nd_namespace_label *nd_label; u8 select_id[NSLABEL_UUID_LEN]; + struct nd_label_ent *label_ent; + struct nd_mapping *nd_mapping; resource_size_t size = 0; u8 *pmem_id = NULL; - int rc = -ENODEV, l; + int rc = 0; u16 i; if (cookie == 0) { @@ -1593,13 +1611,19 @@ static int find_pmem_label_set(struct nd_region *nd_region, * Find a complete set of labels by uuid. 
By definition we can start * with any mapping as the reference label */ - for_each_label(l, nd_label, nd_region->mapping[0].labels) { - u64 isetcookie = __le64_to_cpu(nd_label->isetcookie); + for (i = 0; i < nd_region->ndr_mappings; i++) { + nd_mapping = &nd_region->mapping[i]; + mutex_lock_nested(&nd_mapping->lock, i); + } + list_for_each_entry(label_ent, &nd_region->mapping[0].labels, list) { + struct nd_namespace_label *nd_label = label_ent->label; - if (isetcookie != cookie) + if (!nd_label) + continue; + if (__le64_to_cpu(nd_label->isetcookie) != cookie) continue; - for (i = 0; nd_region->ndr_mappings; i++) + for (i = 0; i < nd_region->ndr_mappings; i++) if (!has_uuid_at_pos(nd_region, nd_label->uuid, cookie, i)) break; @@ -1611,18 +1635,27 @@ static int find_pmem_label_set(struct nd_region *nd_region, * dimm with two instances of the same uuid. */ rc = -EINVAL; - goto err; + break; } else if (pmem_id) { /* * If there is more than one valid uuid set, we * need userspace to clean this up. */ rc = -EBUSY; - goto err; + break; } memcpy(select_id, nd_label->uuid, NSLABEL_UUID_LEN); pmem_id = select_id; } + for (i = 0; i < nd_region->ndr_mappings; i++) { + int reverse = nd_region->ndr_mappings - 1 - i; + + nd_mapping = &nd_region->mapping[reverse]; + mutex_unlock(&nd_mapping->lock); + } + + if (rc) + goto err; /* * Fix up each mapping's 'labels' to have the validated pmem label for @@ -1638,8 +1671,19 @@ static int find_pmem_label_set(struct nd_region *nd_region, /* Calculate total size and populate namespace properties from label0 */ for (i = 0; i < nd_region->ndr_mappings; i++) { - struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - struct nd_namespace_label *label0 = nd_mapping->labels[0]; + struct nd_namespace_label *label0; + + nd_mapping = &nd_region->mapping[i]; + mutex_lock(&nd_mapping->lock); + label_ent = list_first_entry_or_null(&nd_mapping->labels, + typeof(*label_ent), list); + label0 = label_ent ? label_ent->label : 0; + mutex_unlock(&nd_mapping->lock); + + if (!label0) { + WARN_ON(1); + continue; + } size += __le64_to_cpu(label0->rawsize); if (__le16_to_cpu(label0->position) != 0) @@ -1700,8 +1744,9 @@ static struct device **create_namespace_pmem(struct nd_region *nd_region) for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - kfree(nd_mapping->labels); - nd_mapping->labels = NULL; + mutex_lock(&nd_mapping->lock); + nd_mapping_free_labels(nd_mapping); + mutex_unlock(&nd_mapping->lock); } /* Publish a zero-sized namespace for userspace to configure. 
*/ @@ -1822,25 +1867,25 @@ void nd_region_create_btt_seed(struct nd_region *nd_region) dev_err(&nd_region->dev, "failed to create btt namespace\n"); } -static struct device **create_namespace_blk(struct nd_region *nd_region) +static struct device **scan_labels(struct nd_region *nd_region, + struct nd_mapping *nd_mapping) { - struct nd_mapping *nd_mapping = &nd_region->mapping[0]; - struct nd_namespace_label *nd_label; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); struct device *dev, **devs = NULL; struct nd_namespace_blk *nsblk; - struct nvdimm_drvdata *ndd; - int i, l, count = 0; - struct resource *res; + struct nd_label_ent *label_ent; + int i, count = 0; - if (nd_region->ndr_mappings == 0) - return NULL; - - ndd = to_ndd(nd_mapping); - for_each_label(l, nd_label, nd_mapping->labels) { - u32 flags = __le32_to_cpu(nd_label->flags); + list_for_each_entry(label_ent, &nd_mapping->labels, list) { + struct nd_namespace_label *nd_label = label_ent->label; char *name[NSLABEL_NAME_LEN]; struct device **__devs; + struct resource *res; + u32 flags; + if (!nd_label) + continue; + flags = __le32_to_cpu(nd_label->flags); if (flags & NSLABEL_FLAG_LOCAL) /* pass */; else @@ -1899,12 +1944,7 @@ static struct device **create_namespace_blk(struct nd_region *nd_region) if (count == 0) { /* Publish a zero-sized namespace for userspace to configure. */ - for (i = 0; i < nd_region->ndr_mappings; i++) { - struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - - kfree(nd_mapping->labels); - nd_mapping->labels = NULL; - } + nd_mapping_free_labels(nd_mapping); devs = kcalloc(2, sizeof(dev), GFP_KERNEL); if (!devs) @@ -1920,8 +1960,8 @@ static struct device **create_namespace_blk(struct nd_region *nd_region) return devs; -err: - for (i = 0; i < count; i++) { + err: + for (i = 0; devs[i]; i++) { nsblk = to_nd_namespace_blk(devs[i]); namespace_blk_release(&nsblk->common.dev); } @@ -1929,6 +1969,21 @@ err: return NULL; } +static struct device **create_namespace_blk(struct nd_region *nd_region) +{ + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct device **devs; + + if (nd_region->ndr_mappings == 0) + return NULL; + + mutex_lock(&nd_mapping->lock); + devs = scan_labels(nd_region, nd_mapping); + mutex_unlock(&nd_mapping->lock); + + return devs; +} + static int init_active_labels(struct nd_region *nd_region) { int i; @@ -1937,6 +1992,7 @@ static int init_active_labels(struct nd_region *nd_region) struct nd_mapping *nd_mapping = &nd_region->mapping[i]; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); struct nvdimm *nvdimm = nd_mapping->nvdimm; + struct nd_label_ent *label_ent; int count, j; /* @@ -1958,16 +2014,27 @@ static int init_active_labels(struct nd_region *nd_region) dev_dbg(ndd->dev, "%s: %d\n", __func__, count); if (!count) continue; - nd_mapping->labels = kcalloc(count + 1, sizeof(void *), - GFP_KERNEL); - if (!nd_mapping->labels) - return -ENOMEM; for (j = 0; j < count; j++) { struct nd_namespace_label *label; + label_ent = kzalloc(sizeof(*label_ent), GFP_KERNEL); + if (!label_ent) + break; label = nd_label_active(ndd, j); - nd_mapping->labels[j] = label; + label_ent->label = label; + + mutex_lock(&nd_mapping->lock); + list_add_tail(&label_ent->list, &nd_mapping->labels); + mutex_unlock(&nd_mapping->lock); } + + if (j >= count) + continue; + + mutex_lock(&nd_mapping->lock); + nd_mapping_free_labels(nd_mapping); + mutex_unlock(&nd_mapping->lock); + return -ENOMEM; } return 0; diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 1414784c6c2b..fb3ade0d4a83 100644 --- 
a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -73,6 +73,7 @@ bool nd_is_uuid_unique(struct device *dev, u8 *uuid); struct nd_region; struct nvdimm_drvdata; struct nd_mapping; +void nd_mapping_free_labels(struct nd_mapping *nd_mapping); resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, struct nd_mapping *nd_mapping, resource_size_t *overlap); resource_size_t nd_blk_available_dpa(struct nd_mapping *nd_mapping); diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index e58c40824e1f..f67c61f1a8a4 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -83,9 +83,6 @@ static inline struct nd_namespace_index *to_next_namespace_index( (unsigned long long) (res ? resource_size(res) : 0), \ (unsigned long long) (res ? res->start : 0), ##arg) -#define for_each_label(l, label, labels) \ - for (l = 0; (label = labels ? labels[l] : NULL); l++) - #define for_each_dpa_resource(ndd, res) \ for (res = (ndd)->dpa.child; res; res = res->sibling) @@ -98,11 +95,22 @@ struct nd_percpu_lane { spinlock_t lock; }; +struct nd_label_ent { + struct list_head list; + struct nd_namespace_label *label; +}; + +enum nd_mapping_lock_class { + ND_MAPPING_CLASS0, + ND_MAPPING_UUID_SCAN, +}; + struct nd_mapping { struct nvdimm *nvdimm; - struct nd_namespace_label **labels; u64 start; u64 size; + struct list_head labels; + struct mutex lock; /* * @ndd is for private use at region enable / disable time for * get_ndd() + put_ndd(), all other nd_mapping to ndd diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 0ff43cbb15e3..19bcd68c4141 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -487,6 +487,17 @@ u64 nd_region_interleave_set_cookie(struct nd_region *nd_region) return 0; } +void nd_mapping_free_labels(struct nd_mapping *nd_mapping) +{ + struct nd_label_ent *label_ent, *e; + + WARN_ON(!mutex_is_locked(&nd_mapping->lock)); + list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) { + list_del(&label_ent->list); + kfree(label_ent); + } +} + /* * Upon successful probe/remove, take/release a reference on the * associated interleave set (if present), and plant new btt + namespace @@ -507,8 +518,10 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, struct nvdimm_drvdata *ndd = nd_mapping->ndd; struct nvdimm *nvdimm = nd_mapping->nvdimm; - kfree(nd_mapping->labels); - nd_mapping->labels = NULL; + mutex_lock(&nd_mapping->lock); + nd_mapping_free_labels(nd_mapping); + mutex_unlock(&nd_mapping->lock); + put_ndd(ndd); nd_mapping->ndd = NULL; if (ndd) @@ -816,6 +829,8 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, nd_region->mapping[i].nvdimm = nvdimm; nd_region->mapping[i].start = mapping->start; nd_region->mapping[i].size = mapping->size; + INIT_LIST_HEAD(&nd_region->mapping[i].labels); + mutex_init(&nd_region->mapping[i].lock); get_device(&nvdimm->dev); } From f95b4bca9e7d29db284f9b175edf8deca1489def Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 21 Sep 2016 18:16:21 -0700 Subject: [PATCH 25/39] libnvdimm, namespace: refactor uuid_show() into a namespace_to_uuid() helper The ability to translate a generic struct device pointer into a namespace uuid is a useful utility as we go to unify the blk and pmem label scanning paths. 
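The expected calling convention for the new helper, as uuid_show() uses it in the hunk below:

        u8 *uuid = namespace_to_uuid(dev);

        if (IS_ERR(uuid))       /* neither a pmem nor a blk namespace device */
                return PTR_ERR(uuid);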
Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 9f4188c78120..0e62f46755e7 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -1032,22 +1032,27 @@ static ssize_t size_show(struct device *dev, } static DEVICE_ATTR(size, S_IRUGO, size_show, size_store); -static ssize_t uuid_show(struct device *dev, - struct device_attribute *attr, char *buf) +static u8 *namespace_to_uuid(struct device *dev) { - u8 *uuid; - if (is_namespace_pmem(dev)) { struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); - uuid = nspm->uuid; + return nspm->uuid; } else if (is_namespace_blk(dev)) { struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); - uuid = nsblk->uuid; + return nsblk->uuid; } else - return -ENXIO; + return ERR_PTR(-ENXIO); +} +static ssize_t uuid_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u8 *uuid = namespace_to_uuid(dev); + + if (IS_ERR(uuid)) + return PTR_ERR(uuid); if (uuid) return sprintf(buf, "%pUb\n", uuid); return sprintf(buf, "\n"); From 8a5f50d3b7f2f601c200f84827c2c9220cd69f71 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 22 Sep 2016 15:42:59 -0700 Subject: [PATCH 26/39] libnvdimm, namespace: unify blk and pmem label scanning In preparation for allowing multiple namespaces per pmem region, unify blk and pmem label scanning. Given that blk regions already support multiple namespaces, teaching that path how to do pmem namespace scanning is an incremental step towards multiple pmem namespace support. This should be functionally equivalent to the previous state in that it stops after finding the first valid pmem label set.
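One detail of the refactor below worth calling out: create_namespace_pmem() now returns a struct device * or an ERR_PTR(), and a label whose isetcookie does not match the region yields -EAGAIN, presumably so a unified label-scanning caller can skip that label and keep going rather than fail the whole region, e.g. (hypothetical caller sketch):

        dev = create_namespace_pmem(nd_region, nd_label);
        if (IS_ERR(dev) && PTR_ERR(dev) == -EAGAIN)
                continue;       /* try the next label */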
Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 385 +++++++++++++++++--------------- 1 file changed, 207 insertions(+), 178 deletions(-) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 0e62f46755e7..fbcadc7cb8fd 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -1550,7 +1550,7 @@ static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id) u64 hw_start, hw_end, pmem_start, pmem_end; struct nd_label_ent *label_ent; - mutex_lock(&nd_mapping->lock); + WARN_ON(!mutex_is_locked(&nd_mapping->lock)); list_for_each_entry(label_ent, &nd_mapping->labels, list) { nd_label = label_ent->label; if (!nd_label) @@ -1559,7 +1559,6 @@ static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id) break; nd_label = NULL; } - mutex_unlock(&nd_mapping->lock); if (!nd_label) { WARN_ON(1); @@ -1579,88 +1578,65 @@ static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id) else return -EINVAL; - mutex_lock(&nd_mapping->lock); - label_ent = list_first_entry(&nd_mapping->labels, - typeof(*label_ent), list); - label_ent->label = nd_label; - list_del(&label_ent->list); - nd_mapping_free_labels(nd_mapping); - list_add(&label_ent->list, &nd_mapping->labels); - mutex_unlock(&nd_mapping->lock); + /* move recently validated label to the front of the list */ + list_move(&label_ent->list, &nd_mapping->labels); } return 0; } /** - * find_pmem_label_set - validate interleave set labelling, retrieve label0 + * create_namespace_pmem - validate interleave set labelling, retrieve label0 * @nd_region: region with mappings to validate + * @nspm: target namespace to create + * @nd_label: target pmem namespace label to evaluate */ -static int find_pmem_label_set(struct nd_region *nd_region, - struct nd_namespace_pmem *nspm) +struct device *create_namespace_pmem(struct nd_region *nd_region, + struct nd_namespace_label *nd_label) { u64 cookie = nd_region_interleave_set_cookie(nd_region); - u8 select_id[NSLABEL_UUID_LEN]; struct nd_label_ent *label_ent; + struct nd_namespace_pmem *nspm; struct nd_mapping *nd_mapping; resource_size_t size = 0; - u8 *pmem_id = NULL; + struct resource *res; + struct device *dev; int rc = 0; u16 i; if (cookie == 0) { dev_dbg(&nd_region->dev, "invalid interleave-set-cookie\n"); - return -ENXIO; + return ERR_PTR(-ENXIO); } - /* - * Find a complete set of labels by uuid. By definition we can start - * with any mapping as the reference label - */ - for (i = 0; i < nd_region->ndr_mappings; i++) { - nd_mapping = &nd_region->mapping[i]; - mutex_lock_nested(&nd_mapping->lock, i); + if (__le64_to_cpu(nd_label->isetcookie) != cookie) { + dev_dbg(&nd_region->dev, "invalid cookie in label: %pUb\n", + nd_label->uuid); + return ERR_PTR(-EAGAIN); } - list_for_each_entry(label_ent, &nd_region->mapping[0].labels, list) { - struct nd_namespace_label *nd_label = label_ent->label; - if (!nd_label) - continue; - if (__le64_to_cpu(nd_label->isetcookie) != cookie) - continue; + nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); + if (!nspm) + return ERR_PTR(-ENOMEM); - for (i = 0; i < nd_region->ndr_mappings; i++) - if (!has_uuid_at_pos(nd_region, nd_label->uuid, - cookie, i)) - break; - if (i < nd_region->ndr_mappings) { - /* - * Give up if we don't find an instance of a - * uuid at each position (from 0 to - * nd_region->ndr_mappings - 1), or if we find a - * dimm with two instances of the same uuid. 
- */ - rc = -EINVAL; + dev = &nspm->nsio.common.dev; + dev->type = &namespace_pmem_device_type; + dev->parent = &nd_region->dev; + res = &nspm->nsio.res; + res->name = dev_name(&nd_region->dev); + res->flags = IORESOURCE_MEM; + + for (i = 0; i < nd_region->ndr_mappings; i++) + if (!has_uuid_at_pos(nd_region, nd_label->uuid, cookie, i)) break; - } else if (pmem_id) { - /* - * If there is more than one valid uuid set, we - * need userspace to clean this up. - */ - rc = -EBUSY; - break; - } - memcpy(select_id, nd_label->uuid, NSLABEL_UUID_LEN); - pmem_id = select_id; - } - for (i = 0; i < nd_region->ndr_mappings; i++) { - int reverse = nd_region->ndr_mappings - 1 - i; - - nd_mapping = &nd_region->mapping[reverse]; - mutex_unlock(&nd_mapping->lock); - } - - if (rc) + if (i < nd_region->ndr_mappings) { + /* + * Give up if we don't find an instance of a uuid at each + * position (from 0 to nd_region->ndr_mappings - 1), or if we + * find a dimm with two instances of the same uuid. + */ + rc = -EINVAL; goto err; + } /* * Fix up each mapping's 'labels' to have the validated pmem label for @@ -1670,7 +1646,7 @@ static int find_pmem_label_set(struct nd_region *nd_region, * the dimm being enabled (i.e. nd_label_reserve_dpa() * succeeded). */ - rc = select_pmem_id(nd_region, pmem_id); + rc = select_pmem_id(nd_region, nd_label->uuid); if (rc) goto err; @@ -1679,11 +1655,9 @@ static int find_pmem_label_set(struct nd_region *nd_region, struct nd_namespace_label *label0; nd_mapping = &nd_region->mapping[i]; - mutex_lock(&nd_mapping->lock); label_ent = list_first_entry_or_null(&nd_mapping->labels, typeof(*label_ent), list); label0 = label_ent ? label_ent->label : 0; - mutex_unlock(&nd_mapping->lock); if (!label0) { WARN_ON(1); @@ -1707,8 +1681,9 @@ static int find_pmem_label_set(struct nd_region *nd_region, nd_namespace_pmem_set_size(nd_region, nspm, size); - return 0; + return dev; err: + namespace_pmem_release(dev); switch (rc) { case -EINVAL: dev_dbg(&nd_region->dev, "%s: invalid label(s)\n", __func__); @@ -1721,56 +1696,7 @@ static int find_pmem_label_set(struct nd_region *nd_region, __func__, rc); break; } - return rc; -} - -static struct device **create_namespace_pmem(struct nd_region *nd_region) -{ - struct nd_namespace_pmem *nspm; - struct device *dev, **devs; - struct resource *res; - int rc; - - nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); - if (!nspm) - return NULL; - - dev = &nspm->nsio.common.dev; - dev->type = &namespace_pmem_device_type; - dev->parent = &nd_region->dev; - res = &nspm->nsio.res; - res->name = dev_name(&nd_region->dev); - res->flags = IORESOURCE_MEM; - rc = find_pmem_label_set(nd_region, nspm); - if (rc == -ENODEV) { - int i; - - /* Pass, try to permit namespace creation... */ - for (i = 0; i < nd_region->ndr_mappings; i++) { - struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - - mutex_lock(&nd_mapping->lock); - nd_mapping_free_labels(nd_mapping); - mutex_unlock(&nd_mapping->lock); - } - - /* Publish a zero-sized namespace for userspace to configure. 
*/ - nd_namespace_pmem_set_size(nd_region, nspm, 0); - - rc = 0; - } else if (rc) - goto err; - - devs = kcalloc(2, sizeof(struct device *), GFP_KERNEL); - if (!devs) - goto err; - - devs[0] = dev; - return devs; - - err: - namespace_pmem_release(&nspm->nsio.common.dev); - return NULL; + return ERR_PTR(rc); } struct resource *nsblk_add_resource(struct nd_region *nd_region, @@ -1872,43 +1798,107 @@ void nd_region_create_btt_seed(struct nd_region *nd_region) dev_err(&nd_region->dev, "failed to create btt namespace\n"); } -static struct device **scan_labels(struct nd_region *nd_region, - struct nd_mapping *nd_mapping) +static int add_namespace_resource(struct nd_region *nd_region, + struct nd_namespace_label *nd_label, struct device **devs, + int count) { + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + int i; + + for (i = 0; i < count; i++) { + u8 *uuid = namespace_to_uuid(devs[i]); + struct resource *res; + + if (IS_ERR_OR_NULL(uuid)) { + WARN_ON(1); + continue; + } + + if (memcmp(uuid, nd_label->uuid, NSLABEL_UUID_LEN) != 0) + continue; + if (is_namespace_blk(devs[i])) { + res = nsblk_add_resource(nd_region, ndd, + to_nd_namespace_blk(devs[i]), + __le64_to_cpu(nd_label->dpa)); + if (!res) + return -ENXIO; + nd_dbg_dpa(nd_region, ndd, res, "%d assign\n", count); + } else { + dev_err(&nd_region->dev, + "error: conflicting extents for uuid: %pUb\n", + nd_label->uuid); + return -ENXIO; + } + break; + } + + return i; +} + +struct device *create_namespace_blk(struct nd_region *nd_region, + struct nd_namespace_label *nd_label, int count) +{ + + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); - struct device *dev, **devs = NULL; struct nd_namespace_blk *nsblk; - struct nd_label_ent *label_ent; + char *name[NSLABEL_NAME_LEN]; + struct device *dev = NULL; + struct resource *res; + + nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); + if (!nsblk) + return ERR_PTR(-ENOMEM); + dev = &nsblk->common.dev; + dev->type = &namespace_blk_device_type; + dev->parent = &nd_region->dev; + nsblk->id = -1; + nsblk->lbasize = __le64_to_cpu(nd_label->lbasize); + nsblk->uuid = kmemdup(nd_label->uuid, NSLABEL_UUID_LEN, + GFP_KERNEL); + if (!nsblk->uuid) + goto blk_err; + memcpy(name, nd_label->name, NSLABEL_NAME_LEN); + if (name[0]) + nsblk->alt_name = kmemdup(name, NSLABEL_NAME_LEN, + GFP_KERNEL); + res = nsblk_add_resource(nd_region, ndd, nsblk, + __le64_to_cpu(nd_label->dpa)); + if (!res) + goto blk_err; + nd_dbg_dpa(nd_region, ndd, res, "%d: assign\n", count); + return dev; + blk_err: + namespace_blk_release(dev); + return ERR_PTR(-ENXIO); +} + +static struct device **scan_labels(struct nd_region *nd_region) +{ + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct device *dev, **devs = NULL; + struct nd_label_ent *label_ent, *e; int i, count = 0; - list_for_each_entry(label_ent, &nd_mapping->labels, list) { + /* "safe" because create_namespace_pmem() might list_move() label_ent */ + list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) { struct nd_namespace_label *nd_label = label_ent->label; - char *name[NSLABEL_NAME_LEN]; struct device **__devs; - struct resource *res; u32 flags; if (!nd_label) continue; flags = __le32_to_cpu(nd_label->flags); - if (flags & NSLABEL_FLAG_LOCAL) - /* pass */; + if (is_nd_blk(&nd_region->dev) + == !!(flags & NSLABEL_FLAG_LOCAL)) + /* pass, region matches label type */; else continue; - for (i = 0; i < count; i++) { - nsblk = 
to_nd_namespace_blk(devs[i]); - if (memcmp(nsblk->uuid, nd_label->uuid, - NSLABEL_UUID_LEN) == 0) { - res = nsblk_add_resource(nd_region, ndd, nsblk, - __le64_to_cpu(nd_label->dpa)); - if (!res) - goto err; - nd_dbg_dpa(nd_region, ndd, res, "%s assign\n", - dev_name(&nsblk->common.dev)); - break; - } - } + i = add_namespace_resource(nd_region, nd_label, devs, count); + if (i < 0) + goto err; if (i < count) continue; __devs = kcalloc(count + 2, sizeof(dev), GFP_KERNEL); @@ -1918,34 +1908,35 @@ static struct device **scan_labels(struct nd_region *nd_region, kfree(devs); devs = __devs; - nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); - if (!nsblk) - goto err; - dev = &nsblk->common.dev; - dev->type = &namespace_blk_device_type; - dev->parent = &nd_region->dev; - dev_set_name(dev, "namespace%d.%d", nd_region->id, count); - devs[count++] = dev; - nsblk->id = -1; - nsblk->lbasize = __le64_to_cpu(nd_label->lbasize); - nsblk->uuid = kmemdup(nd_label->uuid, NSLABEL_UUID_LEN, - GFP_KERNEL); - if (!nsblk->uuid) - goto err; - memcpy(name, nd_label->name, NSLABEL_NAME_LEN); - if (name[0]) - nsblk->alt_name = kmemdup(name, NSLABEL_NAME_LEN, - GFP_KERNEL); - res = nsblk_add_resource(nd_region, ndd, nsblk, - __le64_to_cpu(nd_label->dpa)); - if (!res) - goto err; - nd_dbg_dpa(nd_region, ndd, res, "%s assign\n", - dev_name(&nsblk->common.dev)); + if (is_nd_blk(&nd_region->dev)) { + dev = create_namespace_blk(nd_region, nd_label, count); + if (IS_ERR(dev)) + goto err; + devs[count++] = dev; + } else { + dev = create_namespace_pmem(nd_region, nd_label); + if (IS_ERR(dev)) { + switch (PTR_ERR(dev)) { + case -EAGAIN: + /* skip invalid labels */ + continue; + case -ENODEV: + /* fallthrough to seed creation */ + break; + default: + goto err; + } + } else + devs[count++] = dev; + + /* we only expect one valid pmem label set per region */ + break; + } } - dev_dbg(&nd_region->dev, "%s: discovered %d blk namespace%s\n", - __func__, count, count == 1 ? "" : "s"); + dev_dbg(&nd_region->dev, "%s: discovered %d %s namespace%s\n", + __func__, count, is_nd_blk(&nd_region->dev) + ? "blk" : "pmem", count == 1 ? "" : "s"); if (count == 0) { /* Publish a zero-sized namespace for userspace to configure. 
*/ @@ -1954,37 +1945,77 @@ static struct device **scan_labels(struct nd_region *nd_region, devs = kcalloc(2, sizeof(dev), GFP_KERNEL); if (!devs) goto err; - nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); - if (!nsblk) - goto err; - dev = &nsblk->common.dev; - dev->type = &namespace_blk_device_type; + if (is_nd_blk(&nd_region->dev)) { + struct nd_namespace_blk *nsblk; + + nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); + if (!nsblk) + goto err; + dev = &nsblk->common.dev; + dev->type = &namespace_blk_device_type; + } else { + struct nd_namespace_pmem *nspm; + + nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); + if (!nspm) + goto err; + dev = &nspm->nsio.common.dev; + dev->type = &namespace_pmem_device_type; + nd_namespace_pmem_set_size(nd_region, nspm, 0); + } dev->parent = &nd_region->dev; devs[count++] = dev; + } else if (is_nd_pmem(&nd_region->dev)) { + /* clean unselected labels */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + nd_mapping = &nd_region->mapping[i]; + if (list_empty(&nd_mapping->labels)) { + WARN_ON(1); + continue; + } + label_ent = list_first_entry(&nd_mapping->labels, + typeof(*label_ent), list); + list_del(&label_ent->list); + nd_mapping_free_labels(nd_mapping); + list_add(&label_ent->list, &nd_mapping->labels); + } } return devs; err: - for (i = 0; devs[i]; i++) { - nsblk = to_nd_namespace_blk(devs[i]); - namespace_blk_release(&nsblk->common.dev); - } + for (i = 0; devs[i]; i++) + if (is_nd_blk(&nd_region->dev)) + namespace_blk_release(devs[i]); + else + namespace_pmem_release(devs[i]); kfree(devs); return NULL; } -static struct device **create_namespace_blk(struct nd_region *nd_region) +static struct device **create_namespaces(struct nd_region *nd_region) { struct nd_mapping *nd_mapping = &nd_region->mapping[0]; struct device **devs; + int i; if (nd_region->ndr_mappings == 0) return NULL; - mutex_lock(&nd_mapping->lock); - devs = scan_labels(nd_region, nd_mapping); - mutex_unlock(&nd_mapping->lock); + /* lock down all mappings while we scan labels */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + nd_mapping = &nd_region->mapping[i]; + mutex_lock_nested(&nd_mapping->lock, i); + } + + devs = scan_labels(nd_region); + + for (i = 0; i < nd_region->ndr_mappings; i++) { + int reverse = nd_region->ndr_mappings - 1 - i; + + nd_mapping = &nd_region->mapping[reverse]; + mutex_unlock(&nd_mapping->lock); + } return devs; } @@ -2064,10 +2095,8 @@ int nd_region_register_namespaces(struct nd_region *nd_region, int *err) devs = create_namespace_io(nd_region); break; case ND_DEVICE_NAMESPACE_PMEM: - devs = create_namespace_pmem(nd_region); - break; case ND_DEVICE_NAMESPACE_BLK: - devs = create_namespace_blk(nd_region); + devs = create_namespaces(nd_region); break; default: break; From bd4cd745b3b412ac93227640e3b337962f41d932 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 6 Oct 2016 11:22:37 -0700 Subject: [PATCH 27/39] tools/testing/nvdimm: support for sub-dividing a pmem region Update nfit_test to handle multiple sub-allocations within a given pmem region. The mock resource now tracks and un-tracks sub-ranges as they are requested and released (either explicitly or via devm callback). 
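The bookkeeping pattern is: every request gets its own struct resource recorded on a spinlock-protected per-resource list, and release looks up and drops exactly that range. A minimal sketch of the tracking side, reusing the nfit_test_request / nfit_test_resource types added by this patch (the helper name is hypothetical):

    static struct resource *example_track_range(struct nfit_test_resource *nfit_res,
                    resource_size_t start, resource_size_t n, const char *name)
    {
            struct nfit_test_request *req = kzalloc(sizeof(*req), GFP_KERNEL);

            if (!req)
                    return NULL;
            req->res.start = start;
            req->res.end = start + n - 1;
            req->res.name = name;

            /* record the sub-range so a later release can find it */
            spin_lock(&nfit_res->lock);
            list_add(&req->list, &nfit_res->requests);
            spin_unlock(&nfit_res->lock);

            return &req->res;
    }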
Signed-off-by: Dan Williams --- tools/testing/nvdimm/test/iomap.c | 134 ++++++++++++++++++++------ tools/testing/nvdimm/test/nfit.c | 21 ++-- tools/testing/nvdimm/test/nfit_test.h | 12 ++- 3 files changed, 124 insertions(+), 43 deletions(-) diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index dae5b9b6d186..3ccef732fce9 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -74,7 +74,7 @@ void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size, if (nfit_res) return (void __iomem *) nfit_res->buf + offset - - nfit_res->res->start; + - nfit_res->res.start; return fallback_fn(offset, size); } @@ -85,7 +85,7 @@ void __iomem *__wrap_devm_ioremap_nocache(struct device *dev, if (nfit_res) return (void __iomem *) nfit_res->buf + offset - - nfit_res->res->start; + - nfit_res->res.start; return devm_ioremap_nocache(dev, offset, size); } EXPORT_SYMBOL(__wrap_devm_ioremap_nocache); @@ -96,7 +96,7 @@ void *__wrap_devm_memremap(struct device *dev, resource_size_t offset, struct nfit_test_resource *nfit_res = get_nfit_res(offset); if (nfit_res) - return nfit_res->buf + offset - nfit_res->res->start; + return nfit_res->buf + offset - nfit_res->res.start; return devm_memremap(dev, offset, size, flags); } EXPORT_SYMBOL(__wrap_devm_memremap); @@ -108,7 +108,7 @@ void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res, struct nfit_test_resource *nfit_res = get_nfit_res(offset); if (nfit_res) - return nfit_res->buf + offset - nfit_res->res->start; + return nfit_res->buf + offset - nfit_res->res.start; return devm_memremap_pages(dev, res, ref, altmap); } EXPORT_SYMBOL(__wrap_devm_memremap_pages); @@ -129,7 +129,7 @@ void *__wrap_memremap(resource_size_t offset, size_t size, struct nfit_test_resource *nfit_res = get_nfit_res(offset); if (nfit_res) - return nfit_res->buf + offset - nfit_res->res->start; + return nfit_res->buf + offset - nfit_res->res.start; return memremap(offset, size, flags); } EXPORT_SYMBOL(__wrap_memremap); @@ -175,6 +175,63 @@ void __wrap_memunmap(void *addr) } EXPORT_SYMBOL(__wrap_memunmap); +static bool nfit_test_release_region(struct device *dev, + struct resource *parent, resource_size_t start, + resource_size_t n); + +static void nfit_devres_release(struct device *dev, void *data) +{ + struct resource *res = *((struct resource **) data); + + WARN_ON(!nfit_test_release_region(NULL, &iomem_resource, res->start, + resource_size(res))); +} + +static int match(struct device *dev, void *__res, void *match_data) +{ + struct resource *res = *((struct resource **) __res); + resource_size_t start = *((resource_size_t *) match_data); + + return res->start == start; +} + +static bool nfit_test_release_region(struct device *dev, + struct resource *parent, resource_size_t start, + resource_size_t n) +{ + if (parent == &iomem_resource) { + struct nfit_test_resource *nfit_res = get_nfit_res(start); + + if (nfit_res) { + struct nfit_test_request *req; + struct resource *res = NULL; + + if (dev) { + devres_release(dev, nfit_devres_release, match, + &start); + return true; + } + + spin_lock(&nfit_res->lock); + list_for_each_entry(req, &nfit_res->requests, list) + if (req->res.start == start) { + res = &req->res; + list_del(&req->list); + break; + } + spin_unlock(&nfit_res->lock); + + WARN(!res || resource_size(res) != n, + "%s: start: %llx n: %llx mismatch: %pr\n", + __func__, start, n, res); + if (res) + kfree(req); + return true; + } + } + return false; +} + static struct resource 
*nfit_test_request_region(struct device *dev, struct resource *parent, resource_size_t start, resource_size_t n, const char *name, int flags) @@ -184,21 +241,57 @@ static struct resource *nfit_test_request_region(struct device *dev, if (parent == &iomem_resource) { nfit_res = get_nfit_res(start); if (nfit_res) { - struct resource *res = nfit_res->res + 1; + struct nfit_test_request *req; + struct resource *res = NULL; - if (start + n > nfit_res->res->start - + resource_size(nfit_res->res)) { + if (start + n > nfit_res->res.start + + resource_size(&nfit_res->res)) { pr_debug("%s: start: %llx n: %llx overflow: %pr\n", __func__, start, n, - nfit_res->res); + &nfit_res->res); return NULL; } + spin_lock(&nfit_res->lock); + list_for_each_entry(req, &nfit_res->requests, list) + if (start == req->res.start) { + res = &req->res; + break; + } + spin_unlock(&nfit_res->lock); + + if (res) { + WARN(1, "%pr already busy\n", res); + return NULL; + } + + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return NULL; + INIT_LIST_HEAD(&req->list); + res = &req->res; + res->start = start; res->end = start + n - 1; res->name = name; res->flags = resource_type(parent); res->flags |= IORESOURCE_BUSY | flags; + spin_lock(&nfit_res->lock); + list_add(&req->list, &nfit_res->requests); + spin_unlock(&nfit_res->lock); + + if (dev) { + struct resource **d; + + d = devres_alloc(nfit_devres_release, + sizeof(struct resource *), + GFP_KERNEL); + if (!d) + return NULL; + *d = res; + devres_add(dev, d); + } + pr_debug("%s: %pr\n", __func__, res); return res; } @@ -242,29 +335,10 @@ struct resource *__wrap___devm_request_region(struct device *dev, } EXPORT_SYMBOL(__wrap___devm_request_region); -static bool nfit_test_release_region(struct resource *parent, - resource_size_t start, resource_size_t n) -{ - if (parent == &iomem_resource) { - struct nfit_test_resource *nfit_res = get_nfit_res(start); - if (nfit_res) { - struct resource *res = nfit_res->res + 1; - - if (start != res->start || resource_size(res) != n) - pr_info("%s: start: %llx n: %llx mismatch: %pr\n", - __func__, start, n, res); - else - memset(res, 0, sizeof(*res)); - return true; - } - } - return false; -} - void __wrap___release_region(struct resource *parent, resource_size_t start, resource_size_t n) { - if (!nfit_test_release_region(parent, start, n)) + if (!nfit_test_release_region(NULL, parent, start, n)) __release_region(parent, start, n); } EXPORT_SYMBOL(__wrap___release_region); @@ -272,7 +346,7 @@ EXPORT_SYMBOL(__wrap___release_region); void __wrap___devm_release_region(struct device *dev, struct resource *parent, resource_size_t start, resource_size_t n) { - if (!nfit_test_release_region(parent, start, n)) + if (!nfit_test_release_region(dev, parent, start, n)) __devm_release_region(dev, parent, start, n); } EXPORT_SYMBOL(__wrap___devm_release_region); diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index 175fc24f8f3a..0e721c6fb1cf 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -478,14 +478,12 @@ static struct nfit_test *instances[NUM_NFITS]; static void release_nfit_res(void *data) { struct nfit_test_resource *nfit_res = data; - struct resource *res = nfit_res->res; spin_lock(&nfit_test_lock); list_del(&nfit_res->list); spin_unlock(&nfit_test_lock); vfree(nfit_res->buf); - kfree(res); kfree(nfit_res); } @@ -493,12 +491,11 @@ static void *__test_alloc(struct nfit_test *t, size_t size, dma_addr_t *dma, void *buf) { struct device *dev = &t->pdev.dev; - struct resource *res 
= kzalloc(sizeof(*res) * 2, GFP_KERNEL); struct nfit_test_resource *nfit_res = kzalloc(sizeof(*nfit_res), GFP_KERNEL); int rc; - if (!res || !buf || !nfit_res) + if (!buf || !nfit_res) goto err; rc = devm_add_action(dev, release_nfit_res, nfit_res); if (rc) @@ -507,10 +504,11 @@ static void *__test_alloc(struct nfit_test *t, size_t size, dma_addr_t *dma, memset(buf, 0, size); nfit_res->dev = dev; nfit_res->buf = buf; - nfit_res->res = res; - res->start = *dma; - res->end = *dma + size - 1; - res->name = "NFIT"; + nfit_res->res.start = *dma; + nfit_res->res.end = *dma + size - 1; + nfit_res->res.name = "NFIT"; + spin_lock_init(&nfit_res->lock); + INIT_LIST_HEAD(&nfit_res->requests); spin_lock(&nfit_test_lock); list_add(&nfit_res->list, &t->resources); spin_unlock(&nfit_test_lock); @@ -519,7 +517,6 @@ static void *__test_alloc(struct nfit_test *t, size_t size, dma_addr_t *dma, err: if (buf) vfree(buf); - kfree(res); kfree(nfit_res); return NULL; } @@ -544,13 +541,13 @@ static struct nfit_test_resource *nfit_test_lookup(resource_size_t addr) continue; spin_lock(&nfit_test_lock); list_for_each_entry(n, &t->resources, list) { - if (addr >= n->res->start && (addr < n->res->start - + resource_size(n->res))) { + if (addr >= n->res.start && (addr < n->res.start + + resource_size(&n->res))) { nfit_res = n; break; } else if (addr >= (unsigned long) n->buf && (addr < (unsigned long) n->buf - + resource_size(n->res))) { + + resource_size(&n->res))) { nfit_res = n; break; } diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h index 9f18e2a4a862..c281dd2e5e2d 100644 --- a/tools/testing/nvdimm/test/nfit_test.h +++ b/tools/testing/nvdimm/test/nfit_test.h @@ -13,11 +13,21 @@ #ifndef __NFIT_TEST_H__ #define __NFIT_TEST_H__ #include +#include +#include + +struct nfit_test_request { + struct list_head list; + struct resource res; +}; struct nfit_test_resource { + struct list_head requests; struct list_head list; - struct resource *res; + struct resource res; struct device *dev; + spinlock_t lock; + int req_count; void *buf; }; From 0e3b0d123c8fd5c42f364aea3ab663b1f18dad39 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 6 Oct 2016 23:13:15 -0700 Subject: [PATCH 28/39] libnvdimm, namespace: allow multiple pmem-namespaces per region at scan time If label scanning finds multiple valid pmem namespaces allow them to be surfaced rather than fail namespace scanning. Support for creating multiple namespaces per region is saved for a later patch. Note that this adds some new error messages to clarify which of the pmem namespaces in the set are potentially impacted by invalid labels. 
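A note on the resource math that makes multiple namespaces per region possible: the namespace's system-physical-address offset is derived from its dimm-physical-address offset scaled by the interleave width. A minimal sketch of that calculation, under the same assumption the patch makes that every mapping in the interleave set contributes equally (the helper name is illustrative only):

    /*
     * Sketch of the spa-offset math used by nd_namespace_pmem_set_resource():
     * with N interleaved mappings, an allocation that starts 'dpa_start'
     * within its mapping starts N * (dpa_start - mapping_start) bytes into
     * the region.
     */
    static resource_size_t example_spa_offset(resource_size_t dpa_start,
                    resource_size_t mapping_start, u16 ndr_mappings)
    {
            return (dpa_start - mapping_start) * ndr_mappings;
    }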
Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 84 ++++++++++++++++++++++++++++----- include/linux/nd.h | 2 + 2 files changed, 74 insertions(+), 12 deletions(-) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index fbcadc7cb8fd..47d29632b937 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -29,7 +29,10 @@ static void namespace_io_release(struct device *dev) static void namespace_pmem_release(struct device *dev) { struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + struct nd_region *nd_region = to_nd_region(dev->parent); + if (nspm->id >= 0) + ida_simple_remove(&nd_region->ns_ida, nspm->id); kfree(nspm->alt_name); kfree(nspm->uuid); kfree(nspm); @@ -833,13 +836,45 @@ static int grow_dpa_allocation(struct nd_region *nd_region, return 0; } -static void nd_namespace_pmem_set_size(struct nd_region *nd_region, +static void nd_namespace_pmem_set_resource(struct nd_region *nd_region, struct nd_namespace_pmem *nspm, resource_size_t size) { struct resource *res = &nspm->nsio.res; + resource_size_t offset = 0; - res->start = nd_region->ndr_start; - res->end = nd_region->ndr_start + size - 1; + if (size && !nspm->uuid) { + WARN_ON_ONCE(1); + size = 0; + } + + if (size && nspm->uuid) { + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_label_id label_id; + struct resource *res; + + if (!ndd) { + size = 0; + goto out; + } + + nd_label_gen_id(&label_id, nspm->uuid, 0); + + /* calculate a spa offset from the dpa allocation offset */ + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id.id) == 0) { + offset = (res->start - nd_mapping->start) + * nd_region->ndr_mappings; + goto out; + } + + WARN_ON_ONCE(1); + size = 0; + } + + out: + res->start = nd_region->ndr_start + offset; + res->end = res->start + size - 1; } static bool uuid_not_set(const u8 *uuid, struct device *dev, const char *where) @@ -930,7 +965,7 @@ static ssize_t __size_store(struct device *dev, unsigned long long val) if (is_namespace_pmem(dev)) { struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); - nd_namespace_pmem_set_size(nd_region, nspm, + nd_namespace_pmem_set_resource(nd_region, nspm, val * nd_region->ndr_mappings); } else if (is_namespace_blk(dev)) { struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); @@ -1546,6 +1581,7 @@ static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id) for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); struct nd_namespace_label *nd_label = NULL; u64 hw_start, hw_end, pmem_start, pmem_end; struct nd_label_ent *label_ent; @@ -1573,10 +1609,14 @@ static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id) hw_end = hw_start + nd_mapping->size; pmem_start = __le64_to_cpu(nd_label->dpa); pmem_end = pmem_start + __le64_to_cpu(nd_label->rawsize); - if (pmem_start == hw_start && pmem_end <= hw_end) + if (pmem_start >= hw_start && pmem_start < hw_end + && pmem_end <= hw_end && pmem_end > hw_start) /* pass */; - else + else { + dev_dbg(&nd_region->dev, "%s invalid label for %pUb\n", + dev_name(ndd->dev), nd_label->uuid); return -EINVAL; + } /* move recently validated label to the front of the list */ list_move(&label_ent->list, &nd_mapping->labels); @@ -1618,6 +1658,7 @@ struct device *create_namespace_pmem(struct nd_region *nd_region, if (!nspm) return ERR_PTR(-ENOMEM); + nspm->id = -1; dev = 
&nspm->nsio.common.dev; dev->type = &namespace_pmem_device_type; dev->parent = &nd_region->dev; @@ -1629,11 +1670,15 @@ struct device *create_namespace_pmem(struct nd_region *nd_region, if (!has_uuid_at_pos(nd_region, nd_label->uuid, cookie, i)) break; if (i < nd_region->ndr_mappings) { + struct nvdimm_drvdata *ndd = to_ndd(&nd_region->mapping[i]); + /* * Give up if we don't find an instance of a uuid at each * position (from 0 to nd_region->ndr_mappings - 1), or if we * find a dimm with two instances of the same uuid. */ + dev_err(&nd_region->dev, "%s missing label for %pUb\n", + dev_name(ndd->dev), nd_label->uuid); rc = -EINVAL; goto err; } @@ -1679,7 +1724,7 @@ struct device *create_namespace_pmem(struct nd_region *nd_region, goto err; } - nd_namespace_pmem_set_size(nd_region, nspm, size); + nd_namespace_pmem_set_resource(nd_region, nspm, size); return dev; err: @@ -1961,23 +2006,31 @@ static struct device **scan_labels(struct nd_region *nd_region) goto err; dev = &nspm->nsio.common.dev; dev->type = &namespace_pmem_device_type; - nd_namespace_pmem_set_size(nd_region, nspm, 0); + nd_namespace_pmem_set_resource(nd_region, nspm, 0); } dev->parent = &nd_region->dev; devs[count++] = dev; } else if (is_nd_pmem(&nd_region->dev)) { /* clean unselected labels */ for (i = 0; i < nd_region->ndr_mappings; i++) { + struct list_head *l, *e; + LIST_HEAD(list); + int j; + nd_mapping = &nd_region->mapping[i]; if (list_empty(&nd_mapping->labels)) { WARN_ON(1); continue; } - label_ent = list_first_entry(&nd_mapping->labels, - typeof(*label_ent), list); - list_del(&label_ent->list); + + j = count; + list_for_each_safe(l, e, &nd_mapping->labels) { + if (!j--) + break; + list_move_tail(l, &list); + } nd_mapping_free_labels(nd_mapping); - list_add(&label_ent->list, &nd_mapping->labels); + list_splice_init(&list, &nd_mapping->labels); } } @@ -2117,6 +2170,13 @@ int nd_region_register_namespaces(struct nd_region *nd_region, int *err) id = ida_simple_get(&nd_region->ns_ida, 0, 0, GFP_KERNEL); nsblk->id = id; + } else if (type == ND_DEVICE_NAMESPACE_PMEM) { + struct nd_namespace_pmem *nspm; + + nspm = to_nd_namespace_pmem(dev); + id = ida_simple_get(&nd_region->ns_ida, 0, 0, + GFP_KERNEL); + nspm->id = id; } else id = i; diff --git a/include/linux/nd.h b/include/linux/nd.h index f1ea426d6a5e..ddcc7788305c 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -77,11 +77,13 @@ struct nd_namespace_io { * @nsio: device and system physical address range to drive * @alt_name: namespace name supplied in the dimm label * @uuid: namespace name supplied in the dimm label + * @id: ida allocated id */ struct nd_namespace_pmem { struct nd_namespace_io nsio; char *alt_name; u8 *uuid; + int id; }; /** From 6ff3e912d32ece4e9cf8708da796e9e2e7979ffe Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 5 Oct 2016 14:04:15 -0700 Subject: [PATCH 29/39] libnvdimm, namespace: sort namespaces by dpa at init Add more determinism to initial namespace device-name assignments by sorting the namespaces by starting dpa. 
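The patch's cmp_dpa() compares the start addresses of the namespaces' backing resources and feeds the result to the kernel's sort(). A simplified comparator expressing the same ordering is sketched below (names are illustrative, not from the patch):

    /* order an array of resource pointers by ascending start address */
    static int example_cmp_start(const void *a, const void *b)
    {
            const struct resource *res_a = *(const struct resource **) a;
            const struct resource *res_b = *(const struct resource **) b;

            if (res_a->start < res_b->start)
                    return -1;
            if (res_a->start > res_b->start)
                    return 1;
            return 0;
    }

    /* usage: sort(resources, count, sizeof(struct resource *), example_cmp_start, NULL); */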
Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 35 ++++++++++++++++++++++++++++++--- include/linux/nd.h | 6 +++--- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 47d29632b937..f0536c2789e9 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -12,6 +12,7 @@ */ #include #include +#include #include #include #include @@ -66,17 +67,17 @@ static struct device_type namespace_blk_device_type = { .release = namespace_blk_release, }; -static bool is_namespace_pmem(struct device *dev) +static bool is_namespace_pmem(const struct device *dev) { return dev ? dev->type == &namespace_pmem_device_type : false; } -static bool is_namespace_blk(struct device *dev) +static bool is_namespace_blk(const struct device *dev) { return dev ? dev->type == &namespace_blk_device_type : false; } -static bool is_namespace_io(struct device *dev) +static bool is_namespace_io(const struct device *dev) { return dev ? dev->type == &namespace_io_device_type : false; } @@ -1919,6 +1920,31 @@ struct device *create_namespace_blk(struct nd_region *nd_region, return ERR_PTR(-ENXIO); } +static int cmp_dpa(const void *a, const void *b) +{ + const struct device *dev_a = *(const struct device **) a; + const struct device *dev_b = *(const struct device **) b; + struct nd_namespace_blk *nsblk_a, *nsblk_b; + struct nd_namespace_pmem *nspm_a, *nspm_b; + + if (is_namespace_io(dev_a)) + return 0; + + if (is_namespace_blk(dev_a)) { + nsblk_a = to_nd_namespace_blk(dev_a); + nsblk_b = to_nd_namespace_blk(dev_b); + + return memcmp(&nsblk_a->res[0]->start, &nsblk_b->res[0]->start, + sizeof(resource_size_t)); + } + + nspm_a = to_nd_namespace_pmem(dev_a); + nspm_b = to_nd_namespace_pmem(dev_b); + + return memcmp(&nspm_a->nsio.res.start, &nspm_b->nsio.res.start, + sizeof(resource_size_t)); +} + static struct device **scan_labels(struct nd_region *nd_region) { struct nd_mapping *nd_mapping = &nd_region->mapping[0]; @@ -2034,6 +2060,9 @@ static struct device **scan_labels(struct nd_region *nd_region) } } + if (count > 1) + sort(devs, count, sizeof(struct device *), cmp_dpa, NULL); + return devs; err: diff --git a/include/linux/nd.h b/include/linux/nd.h index ddcc7788305c..fa66aeed441a 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -107,19 +107,19 @@ struct nd_namespace_blk { struct resource **res; }; -static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev) +static inline struct nd_namespace_io *to_nd_namespace_io(const struct device *dev) { return container_of(dev, struct nd_namespace_io, common.dev); } -static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev) +static inline struct nd_namespace_pmem *to_nd_namespace_pmem(const struct device *dev) { struct nd_namespace_io *nsio = to_nd_namespace_io(dev); return container_of(nsio, struct nd_namespace_pmem, nsio); } -static inline struct nd_namespace_blk *to_nd_namespace_blk(struct device *dev) +static inline struct nd_namespace_blk *to_nd_namespace_blk(const struct device *dev) { return container_of(dev, struct nd_namespace_blk, common.dev); } From a1f3e4d6a0c322eb3e7fdfcc9facdcdf19130434 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 30 Sep 2016 17:28:58 -0700 Subject: [PATCH 30/39] libnvdimm, region: update nd_region_available_dpa() for multi-pmem support The free dpa (dimm-physical-address) space calculation reports how much free space is available with consideration for aliased BLK + PMEM regions. 
Recall that BLK capacity is allocated from high addresses and PMEM is allocated from low addresses in their respective regions. nd_region_available_dpa() accounts for the fact that the largest encroachment (lowest starting address) into PMEM capacity by a BLK allocation limits the available capacity to that point, regardless if there is BLK allocation hole at a higher address. Similarly, for the multi-pmem case we need to track the largest encroachment (highest ending address) of a PMEM allocation in BLK capacity regardless of whether there is an allocation hole that a BLK allocation could fill at a lower address. Signed-off-by: Dan Williams --- drivers/nvdimm/dimm_devs.c | 174 +++++++++++++++++++++++++++-------- drivers/nvdimm/nd-core.h | 2 +- drivers/nvdimm/region_devs.c | 5 +- 3 files changed, 139 insertions(+), 42 deletions(-) diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index cf36470e94c0..4b0296ccb375 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -386,40 +386,148 @@ struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, } EXPORT_SYMBOL_GPL(nvdimm_create); +struct blk_alloc_info { + struct nd_mapping *nd_mapping; + resource_size_t available, busy; + struct resource *res; +}; + +static int alias_dpa_busy(struct device *dev, void *data) +{ + resource_size_t map_end, blk_start, new, busy; + struct blk_alloc_info *info = data; + struct nd_mapping *nd_mapping; + struct nd_region *nd_region; + struct nvdimm_drvdata *ndd; + struct resource *res; + int i; + + if (!is_nd_pmem(dev)) + return 0; + + nd_region = to_nd_region(dev); + for (i = 0; i < nd_region->ndr_mappings; i++) { + nd_mapping = &nd_region->mapping[i]; + if (nd_mapping->nvdimm == info->nd_mapping->nvdimm) + break; + } + + if (i >= nd_region->ndr_mappings) + return 0; + + ndd = to_ndd(nd_mapping); + map_end = nd_mapping->start + nd_mapping->size - 1; + blk_start = nd_mapping->start; + retry: + /* + * Find the free dpa from the end of the last pmem allocation to + * the end of the interleave-set mapping that is not already + * covered by a blk allocation. 
+ */ + busy = 0; + for_each_dpa_resource(ndd, res) { + if ((res->start >= blk_start && res->start < map_end) + || (res->end >= blk_start + && res->end <= map_end)) { + if (strncmp(res->name, "pmem", 4) == 0) { + new = max(blk_start, min(map_end + 1, + res->end + 1)); + if (new != blk_start) { + blk_start = new; + goto retry; + } + } else + busy += min(map_end, res->end) + - max(nd_mapping->start, res->start) + 1; + } else if (nd_mapping->start > res->start + && map_end < res->end) { + /* total eclipse of the PMEM region mapping */ + busy += nd_mapping->size; + break; + } + } + + info->available -= blk_start - nd_mapping->start + busy; + return 0; +} + +static int blk_dpa_busy(struct device *dev, void *data) +{ + struct blk_alloc_info *info = data; + struct nd_mapping *nd_mapping; + struct nd_region *nd_region; + resource_size_t map_end; + int i; + + if (!is_nd_pmem(dev)) + return 0; + + nd_region = to_nd_region(dev); + for (i = 0; i < nd_region->ndr_mappings; i++) { + nd_mapping = &nd_region->mapping[i]; + if (nd_mapping->nvdimm == info->nd_mapping->nvdimm) + break; + } + + if (i >= nd_region->ndr_mappings) + return 0; + + map_end = nd_mapping->start + nd_mapping->size - 1; + if (info->res->start >= nd_mapping->start + && info->res->start < map_end) { + if (info->res->end <= map_end) { + info->busy = 0; + return 1; + } else { + info->busy -= info->res->end - map_end; + return 0; + } + } else if (info->res->end >= nd_mapping->start + && info->res->end <= map_end) { + info->busy -= nd_mapping->start - info->res->start; + return 0; + } else { + info->busy -= nd_mapping->size; + return 0; + } +} + /** * nd_blk_available_dpa - account the unused dpa of BLK region * @nd_mapping: container of dpa-resource-root + labels * - * Unlike PMEM, BLK namespaces can occupy discontiguous DPA ranges. + * Unlike PMEM, BLK namespaces can occupy discontiguous DPA ranges, but + * we arrange for them to never start at an lower dpa than the last + * PMEM allocation in an aliased region. 
*/ -resource_size_t nd_blk_available_dpa(struct nd_mapping *nd_mapping) +resource_size_t nd_blk_available_dpa(struct nd_region *nd_region) { + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); - resource_size_t map_end, busy = 0, available; + struct blk_alloc_info info = { + .nd_mapping = nd_mapping, + .available = nd_mapping->size, + }; struct resource *res; if (!ndd) return 0; - map_end = nd_mapping->start + nd_mapping->size - 1; - for_each_dpa_resource(ndd, res) - if (res->start >= nd_mapping->start && res->start < map_end) { - resource_size_t end = min(map_end, res->end); + device_for_each_child(&nvdimm_bus->dev, &info, alias_dpa_busy); - busy += end - res->start + 1; - } else if (res->end >= nd_mapping->start - && res->end <= map_end) { - busy += res->end - nd_mapping->start; - } else if (nd_mapping->start > res->start - && nd_mapping->start < res->end) { - /* total eclipse of the BLK region mapping */ - busy += nd_mapping->size; - } + /* now account for busy blk allocations in unaliased dpa */ + for_each_dpa_resource(ndd, res) { + if (strncmp(res->name, "blk", 3) != 0) + continue; - available = map_end - nd_mapping->start + 1; - if (busy < available) - return available - busy; - return 0; + info.res = res; + info.busy = resource_size(res); + device_for_each_child(&nvdimm_bus->dev, &info, blk_dpa_busy); + info.available -= info.busy; + } + + return info.available; } /** @@ -451,21 +559,16 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, map_start = nd_mapping->start; map_end = map_start + nd_mapping->size - 1; blk_start = max(map_start, map_end + 1 - *overlap); - for_each_dpa_resource(ndd, res) + for_each_dpa_resource(ndd, res) { if (res->start >= map_start && res->start < map_end) { if (strncmp(res->name, "blk", 3) == 0) - blk_start = min(blk_start, res->start); - else if (res->start != map_start) { + blk_start = min(blk_start, + max(map_start, res->start)); + else if (res->end > map_end) { reason = "misaligned to iset"; goto err; - } else { - if (busy) { - reason = "duplicate overlapping PMEM reservations?"; - goto err; - } + } else busy += resource_size(res); - continue; - } } else if (res->end >= map_start && res->end <= map_end) { if (strncmp(res->name, "blk", 3) == 0) { /* @@ -474,15 +577,14 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, * be used for BLK. */ blk_start = map_start; - } else { - reason = "misaligned to iset"; - goto err; - } + } else + busy += resource_size(res); } else if (map_start > res->start && map_start < res->end) { /* total eclipse of the mapping */ busy += nd_mapping->size; blk_start = map_start; } + } *overlap = map_end + 1 - blk_start; available = blk_start - map_start; @@ -491,10 +593,6 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, return 0; err: - /* - * Something is wrong, PMEM must align with the start of the - * interleave set, and there can only be one allocation per set. 
- */ nd_dbg_dpa(nd_region, ndd, res, "%s\n", reason); return 0; } diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index fb3ade0d4a83..7c2196a1d56f 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -76,7 +76,7 @@ struct nd_mapping; void nd_mapping_free_labels(struct nd_mapping *nd_mapping); resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, struct nd_mapping *nd_mapping, resource_size_t *overlap); -resource_size_t nd_blk_available_dpa(struct nd_mapping *nd_mapping); +resource_size_t nd_blk_available_dpa(struct nd_region *nd_region); resource_size_t nd_region_available_dpa(struct nd_region *nd_region); resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd, struct nd_label_id *label_id); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 19bcd68c4141..3ac534aec60c 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -294,9 +294,8 @@ resource_size_t nd_region_available_dpa(struct nd_region *nd_region) blk_max_overlap = overlap; goto retry; } - } else if (is_nd_blk(&nd_region->dev)) { - available += nd_blk_available_dpa(nd_mapping); - } + } else if (is_nd_blk(&nd_region->dev)) + available += nd_blk_available_dpa(nd_region); } return available; From 012207334a26727369b2668716d84e55af1f1d22 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 5 Oct 2016 09:09:44 -0700 Subject: [PATCH 31/39] libnvdimm, namespace: expand pmem device naming scheme for multi-pmem pmem devices are currently named /dev/pmem. Preserve the naming of the 0th device, but add a "." for other devices. Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index f0536c2789e9..132c5b8b5366 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -173,7 +173,21 @@ const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, suffix = "s"; if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev)) { - sprintf(name, "pmem%d%s", nd_region->id, suffix ? suffix : ""); + int nsidx = 0; + + if (is_namespace_pmem(&ndns->dev)) { + struct nd_namespace_pmem *nspm; + + nspm = to_nd_namespace_pmem(&ndns->dev); + nsidx = nspm->id; + } + + if (nsidx) + sprintf(name, "pmem%d.%d%s", nd_region->id, nsidx, + suffix ? suffix : ""); + else + sprintf(name, "pmem%d%s", nd_region->id, + suffix ? suffix : ""); } else if (is_namespace_blk(&ndns->dev)) { struct nd_namespace_blk *nsblk; From 16660eaea0ccc6d0692f173922cd365876eb288e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 5 Oct 2016 21:13:23 -0700 Subject: [PATCH 32/39] libnvdimm, namespace: update label implementation for multi-pmem Instead of assuming that there will only ever be one allocated range at the start of the region, account for additional namespaces that might start at an offset from the region base. After this change pmem namespaces now have a reason to carry an array of resources similar to blk. Unifying the resource tracking infrastructure in nd_namespace_common is a future cleanup candidate. 
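The key accounting change is that the number of labels to initialize per dimm now equals the number of allocated pmem ranges for that namespace's dimm, rather than a fixed count of one. A condensed sketch of that counting step (hypothetical helper; the patch performs the equivalent walk inside nd_pmem_namespace_label_update()):

    static int example_count_pmem_allocations(struct nvdimm_drvdata *ndd)
    {
            struct resource *res;
            int count = 0;

            /* one label slot per allocated pmem dpa range on this dimm */
            for_each_dpa_resource(ndd, res)
                    if (strncmp(res->name, "pmem", 4) == 0)
                            count++;
            return count;
    }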
Signed-off-by: Dan Williams --- drivers/nvdimm/label.c | 72 +++++++++++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 19 deletions(-) diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c index c37357210428..fac7cabe8f56 100644 --- a/drivers/nvdimm/label.c +++ b/drivers/nvdimm/label.c @@ -494,12 +494,13 @@ static int __pmem_label_update(struct nd_region *nd_region, struct nd_mapping *nd_mapping, struct nd_namespace_pmem *nspm, int pos) { - u64 cookie = nd_region_interleave_set_cookie(nd_region), rawsize; + u64 cookie = nd_region_interleave_set_cookie(nd_region); struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); - struct nd_namespace_label *victim_label; + struct nd_label_ent *label_ent, *victim = NULL; struct nd_namespace_label *nd_label; struct nd_namespace_index *nsindex; - struct nd_label_ent *label_ent; + struct nd_label_id label_id; + struct resource *res; unsigned long *free; u32 nslot, slot; size_t offset; @@ -508,6 +509,16 @@ static int __pmem_label_update(struct nd_region *nd_region, if (!preamble_next(ndd, &nsindex, &free, &nslot)) return -ENXIO; + nd_label_gen_id(&label_id, nspm->uuid, 0); + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id.id) == 0) + break; + + if (!res) { + WARN_ON_ONCE(1); + return -ENXIO; + } + /* allocate and write the label to the staging (next) index */ slot = nd_label_alloc_slot(ndd); if (slot == UINT_MAX) @@ -523,11 +534,10 @@ static int __pmem_label_update(struct nd_region *nd_region, nd_label->nlabel = __cpu_to_le16(nd_region->ndr_mappings); nd_label->position = __cpu_to_le16(pos); nd_label->isetcookie = __cpu_to_le64(cookie); - rawsize = div_u64(resource_size(&nspm->nsio.res), - nd_region->ndr_mappings); - nd_label->rawsize = __cpu_to_le64(rawsize); - nd_label->dpa = __cpu_to_le64(nd_mapping->start); + nd_label->rawsize = __cpu_to_le64(resource_size(res)); + nd_label->dpa = __cpu_to_le64(res->start); nd_label->slot = __cpu_to_le32(slot); + nd_dbg_dpa(nd_region, ndd, res, "%s\n", __func__); /* update label */ offset = nd_label_offset(ndd, nd_label); @@ -538,22 +548,39 @@ static int __pmem_label_update(struct nd_region *nd_region, /* Garbage collect the previous label */ mutex_lock(&nd_mapping->lock); - label_ent = list_first_entry_or_null(&nd_mapping->labels, - typeof(*label_ent), list); - WARN_ON(!label_ent); - victim_label = label_ent ? 
label_ent->label : NULL; - if (victim_label) { - label_ent->label = NULL; - slot = to_slot(ndd, victim_label); - nd_label_free_slot(ndd, slot); + list_for_each_entry(label_ent, &nd_mapping->labels, list) { + if (!label_ent->label) + continue; + if (memcmp(nspm->uuid, label_ent->label->uuid, + NSLABEL_UUID_LEN) != 0) + continue; + victim = label_ent; + list_move_tail(&victim->list, &nd_mapping->labels); + break; + } + if (victim) { dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot); + slot = to_slot(ndd, victim->label); + nd_label_free_slot(ndd, slot); + victim->label = NULL; } /* update index */ rc = nd_label_write_index(ndd, ndd->ns_next, nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0); - if (rc == 0 && label_ent) - label_ent->label = nd_label; + if (rc == 0) { + list_for_each_entry(label_ent, &nd_mapping->labels, list) + if (!label_ent->label) { + label_ent->label = nd_label; + nd_label = NULL; + break; + } + dev_WARN_ONCE(&nspm->nsio.common.dev, nd_label, + "failed to track label: %d\n", + to_slot(ndd, nd_label)); + if (nd_label) + rc = -ENXIO; + } mutex_unlock(&nd_mapping->lock); return rc; @@ -899,7 +926,9 @@ int nd_pmem_namespace_label_update(struct nd_region *nd_region, for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; - int rc; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + int rc, count = 0; if (size == 0) { rc = del_labels(nd_mapping, nspm->uuid); @@ -908,7 +937,12 @@ int nd_pmem_namespace_label_update(struct nd_region *nd_region, continue; } - rc = init_labels(nd_mapping, 1); + for_each_dpa_resource(ndd, res) + if (strncmp(res->name, "pmem", 3) == 0) + count++; + WARN_ON_ONCE(!count); + + rc = init_labels(nd_mapping, count); if (rc < 0) return rc; From 762d067dbad5f32560cb1657b7ca20034332dc56 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 4 Oct 2016 16:09:59 -0700 Subject: [PATCH 33/39] libnvdimm, namespace: enable allocation of multiple pmem namespaces Now that we have nd_region_available_dpa() able to handle the presence of multiple PMEM allocations in aliased PMEM regions, reuse that same infrastructure to track allocations from free space. In particular handle allocating from an aliased PMEM region in the case where there are dis-contiguous holes. The allocation for BLK and PMEM are documented in the space_valid() helper: BLK-space is valid as long as it does not precede a PMEM allocation in a given region. PMEM-space must be contiguous and adjacent to an existing existing allocation (if one exists). 
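The PMEM rule can be condensed as follows (a sketch under the stated constraints, not the patch's space_valid() itself): a candidate free range is only usable if it can hold the entire request and, when an allocation with the same label id already exists, it is immediately adjacent to that allocation:

    static bool example_pmem_range_ok(struct resource *valid,
                    struct resource *exist, resource_size_t n)
    {
            /* the allocation must be satisfied contiguously */
            if (resource_size(valid) < n)
                    return false;
            /* no prior allocation with this label id: any range works */
            if (!exist)
                    return true;
            /* otherwise the free range must abut the existing allocation */
            return valid->start == exist->end + 1
                    || valid->end == exist->start - 1;
    }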
Signed-off-by: Dan Williams --- drivers/nvdimm/dimm_devs.c | 32 ++++++-- drivers/nvdimm/namespace_devs.c | 128 ++++++++++++++++++++++---------- drivers/nvdimm/nd-core.h | 18 +++++ 3 files changed, 133 insertions(+), 45 deletions(-) diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index 4b0296ccb375..d614493ad5ac 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -386,13 +386,7 @@ struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, } EXPORT_SYMBOL_GPL(nvdimm_create); -struct blk_alloc_info { - struct nd_mapping *nd_mapping; - resource_size_t available, busy; - struct resource *res; -}; - -static int alias_dpa_busy(struct device *dev, void *data) +int alias_dpa_busy(struct device *dev, void *data) { resource_size_t map_end, blk_start, new, busy; struct blk_alloc_info *info = data; @@ -418,6 +412,20 @@ static int alias_dpa_busy(struct device *dev, void *data) ndd = to_ndd(nd_mapping); map_end = nd_mapping->start + nd_mapping->size - 1; blk_start = nd_mapping->start; + + /* + * In the allocation case ->res is set to free space that we are + * looking to validate against PMEM aliasing collision rules + * (i.e. BLK is allocated after all aliased PMEM). + */ + if (info->res) { + if (info->res->start >= nd_mapping->start + && info->res->start < map_end) + /* pass */; + else + return 0; + } + retry: /* * Find the free dpa from the end of the last pmem allocation to @@ -447,7 +455,16 @@ static int alias_dpa_busy(struct device *dev, void *data) } } + /* update the free space range with the probed blk_start */ + if (info->res && blk_start > info->res->start) { + info->res->start = max(info->res->start, blk_start); + if (info->res->start > info->res->end) + info->res->end = info->res->start - 1; + return 1; + } + info->available -= blk_start - nd_mapping->start + busy; + return 0; } @@ -508,6 +525,7 @@ resource_size_t nd_blk_available_dpa(struct nd_region *nd_region) struct blk_alloc_info info = { .nd_mapping = nd_mapping, .available = nd_mapping->size, + .res = NULL, }; struct resource *res; diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 132c5b8b5366..81451c74b01c 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -529,19 +529,68 @@ static resource_size_t init_dpa_allocation(struct nd_label_id *label_id, return rc ? n : 0; } -static bool space_valid(bool is_pmem, bool is_reserve, - struct nd_label_id *label_id, struct resource *res) + +/** + * space_valid() - validate free dpa space against constraints + * @nd_region: hosting region of the free space + * @ndd: dimm device data for debug + * @label_id: namespace id to allocate space + * @prev: potential allocation that precedes free space + * @next: allocation that follows the given free space range + * @exist: first allocation with same id in the mapping + * @n: range that must satisfied for pmem allocations + * @valid: free space range to validate + * + * BLK-space is valid as long as it does not precede a PMEM + * allocation in a given region. PMEM-space must be contiguous + * and adjacent to an existing existing allocation (if one + * exists). If reserving PMEM any space is valid. 
+ */ +static void space_valid(struct nd_region *nd_region, struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id, struct resource *prev, + struct resource *next, struct resource *exist, + resource_size_t n, struct resource *valid) { - /* - * For BLK-space any space is valid, for PMEM-space, it must be - * contiguous with an existing allocation unless we are - * reserving pmem. - */ - if (is_reserve || !is_pmem) - return true; - if (!res || strcmp(res->name, label_id->id) == 0) - return true; - return false; + bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0; + bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0; + + if (valid->start >= valid->end) + goto invalid; + + if (is_reserve) + return; + + if (!is_pmem) { + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_bus *nvdimm_bus; + struct blk_alloc_info info = { + .nd_mapping = nd_mapping, + .available = nd_mapping->size, + .res = valid, + }; + + WARN_ON(!is_nd_blk(&nd_region->dev)); + nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); + device_for_each_child(&nvdimm_bus->dev, &info, alias_dpa_busy); + return; + } + + /* allocation needs to be contiguous, so this is all or nothing */ + if (resource_size(valid) < n) + goto invalid; + + /* we've got all the space we need and no existing allocation */ + if (!exist) + return; + + /* allocation needs to be contiguous with the existing namespace */ + if (valid->start == exist->end + 1 + || valid->end == exist->start - 1) + return; + + invalid: + /* truncate @valid size to 0 */ + valid->end = valid->start - 1; } enum alloc_loc { @@ -553,18 +602,24 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, resource_size_t n) { resource_size_t mapping_end = nd_mapping->start + nd_mapping->size - 1; - bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0; bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0; struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res, *exist = NULL, valid; const resource_size_t to_allocate = n; - struct resource *res; int first; + for_each_dpa_resource(ndd, res) + if (strcmp(label_id->id, res->name) == 0) + exist = res; + + valid.start = nd_mapping->start; + valid.end = mapping_end; + valid.name = "free space"; retry: first = 0; for_each_dpa_resource(ndd, res) { - resource_size_t allocate, available = 0, free_start, free_end; struct resource *next = res->sibling, *new_res = NULL; + resource_size_t allocate, available = 0; enum alloc_loc loc = ALLOC_ERR; const char *action; int rc = 0; @@ -577,32 +632,35 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, /* space at the beginning of the mapping */ if (!first++ && res->start > nd_mapping->start) { - free_start = nd_mapping->start; - available = res->start - free_start; - if (space_valid(is_pmem, is_reserve, label_id, NULL)) + valid.start = nd_mapping->start; + valid.end = res->start - 1; + space_valid(nd_region, ndd, label_id, NULL, next, exist, + to_allocate, &valid); + available = resource_size(&valid); + if (available) loc = ALLOC_BEFORE; } /* space between allocations */ if (!loc && next) { - free_start = res->start + resource_size(res); - free_end = min(mapping_end, next->start - 1); - if (space_valid(is_pmem, is_reserve, label_id, res) - && free_start < free_end) { - available = free_end + 1 - free_start; + valid.start = res->start + resource_size(res); + valid.end = min(mapping_end, next->start - 1); + space_valid(nd_region, ndd, label_id, res, next, exist, + to_allocate, &valid); + available = resource_size(&valid); + if 
(available) loc = ALLOC_MID; - } } /* space at the end of the mapping */ if (!loc && !next) { - free_start = res->start + resource_size(res); - free_end = mapping_end; - if (space_valid(is_pmem, is_reserve, label_id, res) - && free_start < free_end) { - available = free_end + 1 - free_start; + valid.start = res->start + resource_size(res); + valid.end = mapping_end; + space_valid(nd_region, ndd, label_id, res, next, exist, + to_allocate, &valid); + available = resource_size(&valid); + if (available) loc = ALLOC_AFTER; - } } if (!loc || !available) @@ -612,8 +670,6 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, case ALLOC_BEFORE: if (strcmp(res->name, label_id->id) == 0) { /* adjust current resource up */ - if (is_pmem && !is_reserve) - return n; rc = adjust_resource(res, res->start - allocate, resource_size(res) + allocate); action = "cur grow up"; @@ -623,8 +679,6 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, case ALLOC_MID: if (strcmp(next->name, label_id->id) == 0) { /* adjust next resource up */ - if (is_pmem && !is_reserve) - return n; rc = adjust_resource(next, next->start - allocate, resource_size(next) + allocate); @@ -648,12 +702,10 @@ static resource_size_t scan_allocate(struct nd_region *nd_region, if (strcmp(action, "allocate") == 0) { /* BLK allocate bottom up */ if (!is_pmem) - free_start += available - allocate; - else if (!is_reserve && free_start != nd_mapping->start) - return n; + valid.start += available - allocate; new_res = nvdimm_allocate_dpa(ndd, label_id, - free_start, allocate); + valid.start, allocate); if (!new_res) rc = -EBUSY; } else if (strcmp(action, "grow down") == 0) { diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 7c2196a1d56f..3ba0b96ce7de 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -44,6 +44,23 @@ struct nvdimm { struct resource *flush_wpq; }; +/** + * struct blk_alloc_info - tracking info for BLK dpa scanning + * @nd_mapping: blk region mapping boundaries + * @available: decremented in alias_dpa_busy as aliased PMEM is scanned + * @busy: decremented in blk_dpa_busy to account for ranges already + * handled by alias_dpa_busy + * @res: alias_dpa_busy interprets this as a free space range that needs to + * be truncated to the valid BLK allocation starting DPA, blk_dpa_busy + * treats it as a busy range that needs the aliased PMEM ranges + * truncated. + */ +struct blk_alloc_info { + struct nd_mapping *nd_mapping; + resource_size_t available, busy; + struct resource *res; +}; + bool is_nvdimm(struct device *dev); bool is_nd_pmem(struct device *dev); bool is_nd_blk(struct device *dev); @@ -80,6 +97,7 @@ resource_size_t nd_blk_available_dpa(struct nd_region *nd_region); resource_size_t nd_region_available_dpa(struct nd_region *nd_region); resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd, struct nd_label_id *label_id); +int alias_dpa_busy(struct device *dev, void *data); struct resource *nsblk_add_resource(struct nd_region *nd_region, struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk, resource_size_t start); From c969e24c1b696f347c08b4beb73007bc39865b0e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 5 Oct 2016 15:54:46 -0700 Subject: [PATCH 34/39] libnvdimm, namespace: filter out of range labels in scan_labels() Short-circuit doomed-to-fail label validation attempts by skipping labels that are outside the given region.
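In code terms the skip is a simple inclusive bounds test on the label's starting DPA. The sketch below is a standalone illustration in plain C, not driver code: label_in_mapping() is a hypothetical helper, and ordinary integer types stand in for the driver's structures.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical helper mirroring the test the patch adds: the mapping's
 * inclusive end is start + size - 1, and a label is skipped when its
 * DPA falls outside [start, map_end].
 */
static bool label_in_mapping(uint64_t label_dpa, uint64_t map_start,
		uint64_t map_size)
{
	uint64_t map_end = map_start + map_size - 1;

	return label_dpa >= map_start && label_dpa <= map_end;
}

int main(void)
{
	const uint64_t start = 4ULL << 30, size = 1ULL << 30; /* 1G at 4G */

	printf("%d\n", label_in_mapping(start + size - 1, start, size)); /* 1 */
	printf("%d\n", label_in_mapping(start + size, start, size));     /* 0 */
	return 0;
}

Without the early skip, every such label still pays for full interleave-set validation before it is rejected.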
For example a DIMM that has multiple PMEM regions will waste time attempting to create namespaces only to find that the interleave-set-cookie does not validate, e.g.: nd_region region6: invalid cookie in label: 73e608dc-47b9-4b2a-b5c7-2d55a32e0c2 Similar to how we skip BLK labels when performing PMEM validation we can skip out-of-range labels early. Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 81451c74b01c..54babc3a80ca 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -2013,10 +2013,11 @@ static int cmp_dpa(const void *a, const void *b) static struct device **scan_labels(struct nd_region *nd_region) { - struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + int i, count = 0; struct device *dev, **devs = NULL; struct nd_label_ent *label_ent, *e; - int i, count = 0; + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + resource_size_t map_end = nd_mapping->start + nd_mapping->size - 1; /* "safe" because create_namespace_pmem() might list_move() label_ent */ list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) { @@ -2033,6 +2034,10 @@ static struct device **scan_labels(struct nd_region *nd_region) else continue; + /* skip labels that describe extents outside of the region */ + if (nd_label->dpa < nd_mapping->start || nd_label->dpa > map_end) + continue; + i = add_namespace_resource(nd_region, nd_label, devs, count); if (i < 0) goto err; From 991d9020f3e0447ea00c7c7f11fed364d977320a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 5 Oct 2016 15:54:46 -0700 Subject: [PATCH 35/39] libnvdimm, namespace: lift single pmem limit in scan_labels() Now that the rest of the infrastructure has been converted to handle multi-pmem configurations, lift the artificial barrier at scan time. Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 54babc3a80ca..fa51d751ccf7 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -2070,9 +2070,6 @@ static struct device **scan_labels(struct nd_region *nd_region) } } else devs[count++] = dev; - - /* we only expect one valid pmem label set per region */ - break; } } From 98a29c39dc689298d2f834f40102cba752eb49c0 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 30 Sep 2016 15:28:27 -0700 Subject: [PATCH 36/39] libnvdimm, namespace: allow creation of multiple pmem-namespaces per region Similar to BLK regions, publish new seed namespace devices to allow unused PMEM region capacity to be consumed by additional namespaces. 
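The seed pattern itself is easy to model in isolation. Below is a toy sketch in plain C; every name in it is hypothetical and merely stands in for struct nd_region, its ns_seed pointer, and the nd_region_create_ns_seed() call added in the diff that follows. The idea: as soon as the current seed namespace is claimed, a fresh empty one is published, so spare region capacity can always be provisioned into another namespace.

#include <stdio.h>

/* Toy stand-in for a region and the empty "seed" namespace it tracks. */
struct toy_region {
	int id;			/* region number */
	int provisioned;	/* namespaces already configured */
	int seed_id;		/* id of the currently published empty seed */
};

/* Called when the current seed gets configured: account for it, then
 * publish the next empty namespace device as the new seed. */
static void toy_claim_seed(struct toy_region *r)
{
	printf("namespace%d.%d configured\n", r->id, r->seed_id);
	r->provisioned++;
	r->seed_id++;
	printf("namespace%d.%d published as new seed\n", r->id, r->seed_id);
}

int main(void)
{
	struct toy_region region = { .id = 0, .provisioned = 0, .seed_id = 0 };

	toy_claim_seed(&region);	/* first pmem namespace in the region */
	toy_claim_seed(&region);	/* a second one in the same region */
	return 0;
}

The namespace%d.%d naming above mirrors the dev_set_name() call in the patch below.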
Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 48 ++++++++++++++++++++++++++++++--- drivers/nvdimm/nd-core.h | 2 +- drivers/nvdimm/region_devs.c | 18 +++++++++---- 3 files changed, 59 insertions(+), 9 deletions(-) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index fa51d751ccf7..3509cff68ef9 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -1860,16 +1860,58 @@ static struct device *nd_namespace_blk_create(struct nd_region *nd_region) return &nsblk->common.dev; } -void nd_region_create_blk_seed(struct nd_region *nd_region) +static struct device *nd_namespace_pmem_create(struct nd_region *nd_region) +{ + struct nd_namespace_pmem *nspm; + struct resource *res; + struct device *dev; + + if (!is_nd_pmem(&nd_region->dev)) + return NULL; + + nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); + if (!nspm) + return NULL; + + dev = &nspm->nsio.common.dev; + dev->type = &namespace_pmem_device_type; + dev->parent = &nd_region->dev; + res = &nspm->nsio.res; + res->name = dev_name(&nd_region->dev); + res->flags = IORESOURCE_MEM; + + nspm->id = ida_simple_get(&nd_region->ns_ida, 0, 0, GFP_KERNEL); + if (nspm->id < 0) { + kfree(nspm); + return NULL; + } + dev_set_name(dev, "namespace%d.%d", nd_region->id, nspm->id); + dev->parent = &nd_region->dev; + dev->groups = nd_namespace_attribute_groups; + nd_namespace_pmem_set_resource(nd_region, nspm, 0); + + return dev; +} + +void nd_region_create_ns_seed(struct nd_region *nd_region) { WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); - nd_region->ns_seed = nd_namespace_blk_create(nd_region); + + if (nd_region_to_nstype(nd_region) == ND_DEVICE_NAMESPACE_IO) + return; + + if (is_nd_blk(&nd_region->dev)) + nd_region->ns_seed = nd_namespace_blk_create(nd_region); + else + nd_region->ns_seed = nd_namespace_pmem_create(nd_region); + /* * Seed creation failures are not fatal, provisioning is simply * disabled until memory becomes available */ if (!nd_region->ns_seed) - dev_err(&nd_region->dev, "failed to create blk namespace\n"); + dev_err(&nd_region->dev, "failed to create %s namespace\n", + is_nd_blk(&nd_region->dev) ? 
"blk" : "pmem"); else nd_device_register(nd_region->ns_seed); } diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 3ba0b96ce7de..8623e57c2ce3 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -71,7 +71,7 @@ void nvdimm_devs_exit(void); void nd_region_devs_exit(void); void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev); struct nd_region; -void nd_region_create_blk_seed(struct nd_region *nd_region); +void nd_region_create_ns_seed(struct nd_region *nd_region); void nd_region_create_btt_seed(struct nd_region *nd_region); void nd_region_create_pfn_seed(struct nd_region *nd_region); void nd_region_create_dax_seed(struct nd_region *nd_region); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 3ac534aec60c..4f74e009b135 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -530,11 +530,12 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, if (is_nd_pmem(dev)) return; } - if (dev->parent && is_nd_blk(dev->parent) && probe) { + if (dev->parent && (is_nd_blk(dev->parent) || is_nd_pmem(dev->parent)) + && probe) { nd_region = to_nd_region(dev->parent); nvdimm_bus_lock(dev); if (nd_region->ns_seed == dev) - nd_region_create_blk_seed(nd_region); + nd_region_create_ns_seed(nd_region); nvdimm_bus_unlock(dev); } if (is_nd_btt(dev) && probe) { @@ -544,23 +545,30 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, nvdimm_bus_lock(dev); if (nd_region->btt_seed == dev) nd_region_create_btt_seed(nd_region); - if (nd_region->ns_seed == &nd_btt->ndns->dev && - is_nd_blk(dev->parent)) - nd_region_create_blk_seed(nd_region); + if (nd_region->ns_seed == &nd_btt->ndns->dev) + nd_region_create_ns_seed(nd_region); nvdimm_bus_unlock(dev); } if (is_nd_pfn(dev) && probe) { + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + nd_region = to_nd_region(dev->parent); nvdimm_bus_lock(dev); if (nd_region->pfn_seed == dev) nd_region_create_pfn_seed(nd_region); + if (nd_region->ns_seed == &nd_pfn->ndns->dev) + nd_region_create_ns_seed(nd_region); nvdimm_bus_unlock(dev); } if (is_nd_dax(dev) && probe) { + struct nd_dax *nd_dax = to_nd_dax(dev); + nd_region = to_nd_region(dev->parent); nvdimm_bus_lock(dev); if (nd_region->dax_seed == dev) nd_region_create_dax_seed(nd_region); + if (nd_region->ns_seed == &nd_dax->nd_pfn.ndns->dev) + nd_region_create_ns_seed(nd_region); nvdimm_bus_unlock(dev); } } From d76911ee933a64c9dfc453e580e7ad612b394e83 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 19 Jul 2016 17:51:40 -0700 Subject: [PATCH 37/39] dax: convert devm_create_dax_dev to PTR_ERR For sub-division support we need access to the dax_dev created by devm_create_dax_dev(). 
Signed-off-by: Dan Williams --- drivers/dax/dax.c | 16 ++++++++++------ drivers/dax/dax.h | 5 +++-- drivers/dax/pmem.c | 5 +++-- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 03bb54f7f58f..e7d8a3902437 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -558,8 +558,8 @@ static void unregister_dax_dev(void *dev) device_unregister(dev); } -int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, - int count) +struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, + struct resource *res, int count) { struct device *parent = dax_region->dev; struct dax_dev *dax_dev; @@ -570,7 +570,7 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL); if (!dax_dev) - return -ENOMEM; + return ERR_PTR(-ENOMEM); for (i = 0; i < count; i++) { if (!IS_ALIGNED(res[i].start, dax_region->align) @@ -632,10 +632,14 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, rc = device_add(dev); if (rc) { put_device(dev); - return rc; + return ERR_PTR(rc); } - return devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); + rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); + if (rc) + return ERR_PTR(rc); + + return dax_dev; err_cdev: iput(dax_dev->inode); @@ -646,7 +650,7 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, err_id: kfree(dax_dev); - return rc; + return ERR_PTR(rc); } EXPORT_SYMBOL_GPL(devm_create_dax_dev); diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h index d8b8f1f25054..ddd829ab58c0 100644 --- a/drivers/dax/dax.h +++ b/drivers/dax/dax.h @@ -13,12 +13,13 @@ #ifndef __DAX_H__ #define __DAX_H__ struct device; +struct dax_dev; struct resource; struct dax_region; void dax_region_put(struct dax_region *dax_region); struct dax_region *alloc_dax_region(struct device *parent, int region_id, struct resource *res, unsigned int align, void *addr, unsigned long flags); -int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, - int count); +struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, + struct resource *res, int count); #endif /* __DAX_H__ */ diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c index 59b75c5972bb..c24d32ec9ce6 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem.c @@ -61,6 +61,7 @@ static int dax_pmem_probe(struct device *dev) int rc; void *addr; struct resource res; + struct dax_dev *dax_dev; struct nd_pfn_sb *pfn_sb; struct dax_pmem *dax_pmem; struct nd_region *nd_region; @@ -123,12 +124,12 @@ static int dax_pmem_probe(struct device *dev) return -ENOMEM; /* TODO: support for subdividing a dax region... 
*/ - rc = devm_create_dax_dev(dax_region, &res, 1); + dax_dev = devm_create_dax_dev(dax_region, &res, 1); /* child dax_dev instances now own the lifetime of the dax_region */ dax_region_put(dax_region); - return rc; + return PTR_ERR_OR_ZERO(dax_dev); } static struct nd_device_driver dax_pmem_driver = { From bc0a0fe94f33dd15edf2ed555bfc4d6dbb5e1995 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 8 Sep 2016 15:53:28 +0200 Subject: [PATCH 38/39] dax: use correct dev_t value The dev_t variable in devm_create_dax_dev() is used before it's first set: drivers/dax/dax.c: In function 'devm_create_dax_dev': drivers/dax/dax.c:205:39: error: 'dev_t' may be used uninitialized in this function [-Werror=maybe-uninitialized] inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/dax/dax.c:688:8: note: 'dev_t' was declared here This reorders the code to how it looks correct to me. Signed-off-by: Arnd Bergmann Fixes: 3bc52c45bac2 ("dax: define a unified inode/address_space for device-dax mappings") Signed-off-by: Dan Williams --- drivers/dax/dax.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index e7d8a3902437..b917e4d66ad0 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -598,6 +598,8 @@ struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, goto err_minor; } + dev_t = MKDEV(MAJOR(dax_devt), minor); + dev = &dax_dev->dev; dax_dev->inode = dax_inode_get(&dax_dev->cdev, dev_t); if (!dax_dev->inode) { rc = -ENOMEM; @@ -605,8 +607,6 @@ struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, } /* device_initialize() so cdev can reference kobj parent */ - dev_t = MKDEV(MAJOR(dax_devt), minor); - dev = &dax_dev->dev; device_initialize(dev); cdev = &dax_dev->cdev; From 4e65e9381c7ac211ec2133e473fcbdd8656c779a Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Mon, 12 Sep 2016 10:15:07 -0600 Subject: [PATCH 39/39] /dev/dax: fix Kconfig dependency build breakage The function dax_pmem_probe() in drivers/dax/pmem.c is compiled under the CONFIG_DEV_DAX_PMEM tri-state config option. This config option currently only depends on CONFIG_NVDIMM_DAX, a bool, which means that the following configuration is possible: CONFIG_LIBNVDIMM=m ... CONFIG_NVDIMM_DAX=y CONFIG_DEV_DAX=y CONFIG_DEV_DAX_PMEM=y With this config LIBNVDIMM is compiled as a module with NVDIMM_DAX=y just meaning that we will compile drivers/nvdimm/dax_devs.c into that module. However, dax_pmem_probe() depends on several symbols defined in drivers/nvdimm/dax_devs.c, which results in the following build errors: drivers/built-in.o: In function `dax_pmem_probe': linux/drivers/dax/pmem.c:70: undefined reference to `to_nd_dax' linux/drivers/dax/pmem.c:74: undefined reference to `nvdimm_namespace_common_probe' linux/drivers/dax/pmem.c:80: undefined reference to `devm_nsio_enable' linux/drivers/dax/pmem.c:81: undefined reference to `nvdimm_setup_pfn' linux/drivers/dax/pmem.c:84: undefined reference to `devm_nsio_disable' linux/drivers/dax/pmem.c:122: undefined reference to `to_nd_region' drivers/built-in.o: In function `dax_pmem_init': linux/drivers/dax/pmem.c:147: undefined reference to `__nd_driver_register' Fix this by making NVDIMM_DAX a tristate. DEV_DAX_PMEM depends on NVDIMM_DAX which depends on LIBNVDIMM. Since they are all now tristates, if LIBNVDIMM is built as a kernel module DEV_DAX_PMEM will be as well. 
This prevents dax_devs.c from being built as a built-in while its dependencies are in the libnvdimm.ko module. Signed-off-by: Ross Zwisler Signed-off-by: Dan Williams --- drivers/nvdimm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index 124c2432ac9c..8b2b740d6679 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig @@ -89,7 +89,7 @@ config NVDIMM_PFN Select Y if unsure config NVDIMM_DAX - bool "NVDIMM DAX: Raw access to persistent memory" + tristate "NVDIMM DAX: Raw access to persistent memory" default LIBNVDIMM depends on NVDIMM_PFN help