1c221c0b0SDave Hansen // SPDX-License-Identifier: GPL-2.0 2c221c0b0SDave Hansen /* Copyright(c) 2016-2019 Intel Corporation. All rights reserved. */ 3c221c0b0SDave Hansen #include <linux/memremap.h> 4c221c0b0SDave Hansen #include <linux/pagemap.h> 5c221c0b0SDave Hansen #include <linux/memory.h> 6c221c0b0SDave Hansen #include <linux/module.h> 7c221c0b0SDave Hansen #include <linux/device.h> 8c221c0b0SDave Hansen #include <linux/pfn_t.h> 9c221c0b0SDave Hansen #include <linux/slab.h> 10c221c0b0SDave Hansen #include <linux/dax.h> 11c221c0b0SDave Hansen #include <linux/fs.h> 12c221c0b0SDave Hansen #include <linux/mm.h> 13c221c0b0SDave Hansen #include <linux/mman.h> 147b88bda3SAneesh Kumar K.V #include <linux/memory-tiers.h> 15c221c0b0SDave Hansen #include "dax-private.h" 16c221c0b0SDave Hansen #include "bus.h" 17c221c0b0SDave Hansen 187b88bda3SAneesh Kumar K.V /* 197b88bda3SAneesh Kumar K.V * Default abstract distance assigned to the NUMA node onlined 207b88bda3SAneesh Kumar K.V * by DAX/kmem if the low level platform driver didn't initialize 217b88bda3SAneesh Kumar K.V * one for this NUMA node. 227b88bda3SAneesh Kumar K.V */ 237b88bda3SAneesh Kumar K.V #define MEMTIER_DEFAULT_DAX_ADISTANCE (MEMTIER_ADISTANCE_DRAM * 5) 247b88bda3SAneesh Kumar K.V 258a725e46SDavid Hildenbrand /* Memory resource name used for add_memory_driver_managed(). */ 268a725e46SDavid Hildenbrand static const char *kmem_name; 278a725e46SDavid Hildenbrand /* Set if any memory will remain added when the driver will be unloaded. */ 288a725e46SDavid Hildenbrand static bool any_hotremove_failed; 298a725e46SDavid Hildenbrand 3060e93dc0SDan Williams static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r) 3159bc8d10SDan Williams { 3260e93dc0SDan Williams struct dev_dax_range *dax_range = &dev_dax->ranges[i]; 3360e93dc0SDan Williams struct range *range = &dax_range->range; 3459bc8d10SDan Williams 3559bc8d10SDan Williams /* memory-block align the hotplug range */ 3660e93dc0SDan Williams r->start = ALIGN(range->start, memory_block_size_bytes()); 3760e93dc0SDan Williams r->end = ALIGN_DOWN(range->end + 1, memory_block_size_bytes()) - 1; 3860e93dc0SDan Williams if (r->start >= r->end) { 3960e93dc0SDan Williams r->start = range->start; 4060e93dc0SDan Williams r->end = range->end; 4160e93dc0SDan Williams return -ENOSPC; 4260e93dc0SDan Williams } 4360e93dc0SDan Williams return 0; 4459bc8d10SDan Williams } 4559bc8d10SDan Williams 46a455aa72SDan Williams struct dax_kmem_data { 47a455aa72SDan Williams const char *res_name; 48eedf634aSDavid Hildenbrand int mgid; 49a455aa72SDan Williams struct resource *res[]; 50a455aa72SDan Williams }; 51a455aa72SDan Williams 527b88bda3SAneesh Kumar K.V static struct memory_dev_type *dax_slowmem_type; 53f11cf813SDan Williams static int dev_dax_kmem_probe(struct dev_dax *dev_dax) 54c221c0b0SDave Hansen { 55f11cf813SDan Williams struct device *dev = &dev_dax->dev; 56eedf634aSDavid Hildenbrand unsigned long total_len = 0; 57a455aa72SDan Williams struct dax_kmem_data *data; 58eedf634aSDavid Hildenbrand int i, rc, mapped = 0; 59c221c0b0SDave Hansen int numa_node; 60c221c0b0SDave Hansen 61c221c0b0SDave Hansen /* 62c221c0b0SDave Hansen * Ensure good NUMA information for the persistent memory. 63c221c0b0SDave Hansen * Without this check, there is a risk that slow memory 64c221c0b0SDave Hansen * could be mixed in a node with faster memory, causing 65c221c0b0SDave Hansen * unavoidable performance issues. 66c221c0b0SDave Hansen */ 67c221c0b0SDave Hansen numa_node = dev_dax->target_node; 68c221c0b0SDave Hansen if (numa_node < 0) { 69f5516ec5SDan Williams dev_warn(dev, "rejecting DAX region with invalid node: %d\n", 70f5516ec5SDan Williams numa_node); 71c221c0b0SDave Hansen return -EINVAL; 72c221c0b0SDave Hansen } 73c221c0b0SDave Hansen 7460e93dc0SDan Williams for (i = 0; i < dev_dax->nr_range; i++) { 7560e93dc0SDan Williams struct range range; 7660e93dc0SDan Williams 7760e93dc0SDan Williams rc = dax_kmem_range(dev_dax, i, &range); 7860e93dc0SDan Williams if (rc) { 7960e93dc0SDan Williams dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n", 8060e93dc0SDan Williams i, range.start, range.end); 8160e93dc0SDan Williams continue; 8260e93dc0SDan Williams } 83eedf634aSDavid Hildenbrand total_len += range_len(&range); 84eedf634aSDavid Hildenbrand } 85eedf634aSDavid Hildenbrand 86eedf634aSDavid Hildenbrand if (!total_len) { 87eedf634aSDavid Hildenbrand dev_warn(dev, "rejecting DAX region without any memory after alignment\n"); 88eedf634aSDavid Hildenbrand return -EINVAL; 89eedf634aSDavid Hildenbrand } 90eedf634aSDavid Hildenbrand 917b88bda3SAneesh Kumar K.V init_node_memory_type(numa_node, dax_slowmem_type); 92eedf634aSDavid Hildenbrand 93eedf634aSDavid Hildenbrand rc = -ENOMEM; 947b88bda3SAneesh Kumar K.V data = kzalloc(struct_size(data, res, dev_dax->nr_range), GFP_KERNEL); 957b88bda3SAneesh Kumar K.V if (!data) 967b88bda3SAneesh Kumar K.V goto err_dax_kmem_data; 977b88bda3SAneesh Kumar K.V 98eedf634aSDavid Hildenbrand data->res_name = kstrdup(dev_name(dev), GFP_KERNEL); 99eedf634aSDavid Hildenbrand if (!data->res_name) 100eedf634aSDavid Hildenbrand goto err_res_name; 101eedf634aSDavid Hildenbrand 102*46e66dabSTarun Sahu rc = memory_group_register_static(numa_node, PFN_UP(total_len)); 103eedf634aSDavid Hildenbrand if (rc < 0) 104eedf634aSDavid Hildenbrand goto err_reg_mgid; 105eedf634aSDavid Hildenbrand data->mgid = rc; 106eedf634aSDavid Hildenbrand 107eedf634aSDavid Hildenbrand for (i = 0; i < dev_dax->nr_range; i++) { 108eedf634aSDavid Hildenbrand struct resource *res; 109eedf634aSDavid Hildenbrand struct range range; 110eedf634aSDavid Hildenbrand 111eedf634aSDavid Hildenbrand rc = dax_kmem_range(dev_dax, i, &range); 112eedf634aSDavid Hildenbrand if (rc) 113eedf634aSDavid Hildenbrand continue; 11460e93dc0SDan Williams 11560858c00SDavid Hildenbrand /* Region is permanently reserved if hotremove fails. */ 116a455aa72SDan Williams res = request_mem_region(range.start, range_len(&range), data->res_name); 1170513bd5bSDan Williams if (!res) { 11860e93dc0SDan Williams dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve region\n", 11960e93dc0SDan Williams i, range.start, range.end); 12060e93dc0SDan Williams /* 12160e93dc0SDan Williams * Once some memory has been onlined we can't 12260e93dc0SDan Williams * assume that it can be un-onlined safely. 12360e93dc0SDan Williams */ 12460e93dc0SDan Williams if (mapped) 12560e93dc0SDan Williams continue; 126a455aa72SDan Williams rc = -EBUSY; 127a455aa72SDan Williams goto err_request_mem; 128c221c0b0SDave Hansen } 129a455aa72SDan Williams data->res[i] = res; 130c221c0b0SDave Hansen 131c221c0b0SDave Hansen /* 132c221c0b0SDave Hansen * Set flags appropriate for System RAM. Leave ..._BUSY clear 133c221c0b0SDave Hansen * so that add_memory() can add a child resource. Do not 134c221c0b0SDave Hansen * inherit flags from the parent since it may set new flags 135c221c0b0SDave Hansen * unknown to us that will break add_memory() below. 136c221c0b0SDave Hansen */ 1370513bd5bSDan Williams res->flags = IORESOURCE_SYSTEM_RAM; 138c221c0b0SDave Hansen 1398a725e46SDavid Hildenbrand /* 14060e93dc0SDan Williams * Ensure that future kexec'd kernels will not treat 14160e93dc0SDan Williams * this as RAM automatically. 1428a725e46SDavid Hildenbrand */ 143eedf634aSDavid Hildenbrand rc = add_memory_driver_managed(data->mgid, range.start, 144eedf634aSDavid Hildenbrand range_len(&range), kmem_name, MHP_NID_IS_MGID); 14560e93dc0SDan Williams 14631e4ca92SPavel Tatashin if (rc) { 14760e93dc0SDan Williams dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n", 14860e93dc0SDan Williams i, range.start, range.end); 149e686c325SDan Williams remove_resource(res); 150a455aa72SDan Williams kfree(res); 151a455aa72SDan Williams data->res[i] = NULL; 15260e93dc0SDan Williams if (mapped) 15360e93dc0SDan Williams continue; 154a455aa72SDan Williams goto err_request_mem; 15531e4ca92SPavel Tatashin } 15660e93dc0SDan Williams mapped++; 15760e93dc0SDan Williams } 1587e6b431aSDan Williams 159a455aa72SDan Williams dev_set_drvdata(dev, data); 160c221c0b0SDave Hansen 161c221c0b0SDave Hansen return 0; 162a455aa72SDan Williams 163a455aa72SDan Williams err_request_mem: 164eedf634aSDavid Hildenbrand memory_group_unregister(data->mgid); 165eedf634aSDavid Hildenbrand err_reg_mgid: 166a455aa72SDan Williams kfree(data->res_name); 167a455aa72SDan Williams err_res_name: 168a455aa72SDan Williams kfree(data); 1697b88bda3SAneesh Kumar K.V err_dax_kmem_data: 1707b88bda3SAneesh Kumar K.V clear_node_memory_type(numa_node, dax_slowmem_type); 171a455aa72SDan Williams return rc; 172c221c0b0SDave Hansen } 173c221c0b0SDave Hansen 1749f960da7SPavel Tatashin #ifdef CONFIG_MEMORY_HOTREMOVE 1750d519e0dSUwe Kleine-König static void dev_dax_kmem_remove(struct dev_dax *dev_dax) 1769f960da7SPavel Tatashin { 17760e93dc0SDan Williams int i, success = 0; 1787b88bda3SAneesh Kumar K.V int node = dev_dax->target_node; 179f11cf813SDan Williams struct device *dev = &dev_dax->dev; 180a455aa72SDan Williams struct dax_kmem_data *data = dev_get_drvdata(dev); 1819f960da7SPavel Tatashin 1829f960da7SPavel Tatashin /* 1839f960da7SPavel Tatashin * We have one shot for removing memory, if some memory blocks were not 1849f960da7SPavel Tatashin * offline prior to calling this function remove_memory() will fail, and 1859f960da7SPavel Tatashin * there is no way to hotremove this memory until reboot because device 1869f960da7SPavel Tatashin * unbind will succeed even if we return failure. 1879f960da7SPavel Tatashin */ 18860e93dc0SDan Williams for (i = 0; i < dev_dax->nr_range; i++) { 18960e93dc0SDan Williams struct range range; 19060e93dc0SDan Williams int rc; 19160e93dc0SDan Williams 19260e93dc0SDan Williams rc = dax_kmem_range(dev_dax, i, &range); 19360e93dc0SDan Williams if (rc) 19460e93dc0SDan Williams continue; 19560e93dc0SDan Williams 196e1c158e4SDavid Hildenbrand rc = remove_memory(range.start, range_len(&range)); 19760e93dc0SDan Williams if (rc == 0) { 198e686c325SDan Williams remove_resource(data->res[i]); 199a455aa72SDan Williams kfree(data->res[i]); 200a455aa72SDan Williams data->res[i] = NULL; 20160e93dc0SDan Williams success++; 20260e93dc0SDan Williams continue; 20360e93dc0SDan Williams } 2048a725e46SDavid Hildenbrand any_hotremove_failed = true; 20560e93dc0SDan Williams dev_err(dev, 20660e93dc0SDan Williams "mapping%d: %#llx-%#llx cannot be hotremoved until the next reboot\n", 20760e93dc0SDan Williams i, range.start, range.end); 2089f960da7SPavel Tatashin } 2099f960da7SPavel Tatashin 21060e93dc0SDan Williams if (success >= dev_dax->nr_range) { 211eedf634aSDavid Hildenbrand memory_group_unregister(data->mgid); 212a455aa72SDan Williams kfree(data->res_name); 213a455aa72SDan Williams kfree(data); 21460e93dc0SDan Williams dev_set_drvdata(dev, NULL); 2157b88bda3SAneesh Kumar K.V /* 2167b88bda3SAneesh Kumar K.V * Clear the memtype association on successful unplug. 2177b88bda3SAneesh Kumar K.V * If not, we have memory blocks left which can be 2187b88bda3SAneesh Kumar K.V * offlined/onlined later. We need to keep memory_dev_type 2197b88bda3SAneesh Kumar K.V * for that. This implies this reference will be around 2207b88bda3SAneesh Kumar K.V * till next reboot. 2217b88bda3SAneesh Kumar K.V */ 2227b88bda3SAneesh Kumar K.V clear_node_memory_type(node, dax_slowmem_type); 22360e93dc0SDan Williams } 2249f960da7SPavel Tatashin } 2259f960da7SPavel Tatashin #else 2260d519e0dSUwe Kleine-König static void dev_dax_kmem_remove(struct dev_dax *dev_dax) 227c221c0b0SDave Hansen { 228c221c0b0SDave Hansen /* 2299f960da7SPavel Tatashin * Without hotremove purposely leak the request_mem_region() for the 2309f960da7SPavel Tatashin * device-dax range and return '0' to ->remove() attempts. The removal 2319f960da7SPavel Tatashin * of the device from the driver always succeeds, but the region is 2329f960da7SPavel Tatashin * permanently pinned as reserved by the unreleased 233c221c0b0SDave Hansen * request_mem_region(). 234c221c0b0SDave Hansen */ 2358a725e46SDavid Hildenbrand any_hotremove_failed = true; 236c221c0b0SDave Hansen } 2379f960da7SPavel Tatashin #endif /* CONFIG_MEMORY_HOTREMOVE */ 238c221c0b0SDave Hansen 239c221c0b0SDave Hansen static struct dax_device_driver device_dax_kmem_driver = { 240c221c0b0SDave Hansen .probe = dev_dax_kmem_probe, 241c221c0b0SDave Hansen .remove = dev_dax_kmem_remove, 242e9ee9fe3SDan Williams .type = DAXDRV_KMEM_TYPE, 243c221c0b0SDave Hansen }; 244c221c0b0SDave Hansen 245c221c0b0SDave Hansen static int __init dax_kmem_init(void) 246c221c0b0SDave Hansen { 2478a725e46SDavid Hildenbrand int rc; 2488a725e46SDavid Hildenbrand 2498a725e46SDavid Hildenbrand /* Resource name is permanently allocated if any hotremove fails. */ 2508a725e46SDavid Hildenbrand kmem_name = kstrdup_const("System RAM (kmem)", GFP_KERNEL); 2518a725e46SDavid Hildenbrand if (!kmem_name) 2528a725e46SDavid Hildenbrand return -ENOMEM; 2538a725e46SDavid Hildenbrand 2547b88bda3SAneesh Kumar K.V dax_slowmem_type = alloc_memory_type(MEMTIER_DEFAULT_DAX_ADISTANCE); 2557b88bda3SAneesh Kumar K.V if (IS_ERR(dax_slowmem_type)) { 2567b88bda3SAneesh Kumar K.V rc = PTR_ERR(dax_slowmem_type); 2577b88bda3SAneesh Kumar K.V goto err_dax_slowmem_type; 2587b88bda3SAneesh Kumar K.V } 2597b88bda3SAneesh Kumar K.V 2608a725e46SDavid Hildenbrand rc = dax_driver_register(&device_dax_kmem_driver); 2618a725e46SDavid Hildenbrand if (rc) 2627b88bda3SAneesh Kumar K.V goto error_dax_driver; 2637b88bda3SAneesh Kumar K.V 2647b88bda3SAneesh Kumar K.V return rc; 2657b88bda3SAneesh Kumar K.V 2667b88bda3SAneesh Kumar K.V error_dax_driver: 2677b88bda3SAneesh Kumar K.V destroy_memory_type(dax_slowmem_type); 2687b88bda3SAneesh Kumar K.V err_dax_slowmem_type: 2698a725e46SDavid Hildenbrand kfree_const(kmem_name); 2708a725e46SDavid Hildenbrand return rc; 271c221c0b0SDave Hansen } 272c221c0b0SDave Hansen 273c221c0b0SDave Hansen static void __exit dax_kmem_exit(void) 274c221c0b0SDave Hansen { 275c221c0b0SDave Hansen dax_driver_unregister(&device_dax_kmem_driver); 2768a725e46SDavid Hildenbrand if (!any_hotremove_failed) 2778a725e46SDavid Hildenbrand kfree_const(kmem_name); 2787b88bda3SAneesh Kumar K.V destroy_memory_type(dax_slowmem_type); 279c221c0b0SDave Hansen } 280c221c0b0SDave Hansen 281c221c0b0SDave Hansen MODULE_AUTHOR("Intel Corporation"); 282c221c0b0SDave Hansen MODULE_LICENSE("GPL v2"); 283c221c0b0SDave Hansen module_init(dax_kmem_init); 284c221c0b0SDave Hansen module_exit(dax_kmem_exit); 285c221c0b0SDave Hansen MODULE_ALIAS_DAX_DEVICE(0); 286