// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-2019 Intel Corporation. All rights reserved. */
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/memory.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/memory-tiers.h>
#include "dax-private.h"
#include "bus.h"

/*
 * Default abstract distance assigned to the NUMA node onlined by
 * DAX/kmem if the low-level platform driver didn't initialize one
 * for this NUMA node.
 */
#define MEMTIER_DEFAULT_DAX_ADISTANCE	(MEMTIER_ADISTANCE_DRAM * 5)

/* Memory resource name used for add_memory_driver_managed(). */
static const char *kmem_name;
/* Set if any memory remains added when the driver is unloaded. */
static bool any_hotremove_failed;

static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r)
{
	struct dev_dax_range *dax_range = &dev_dax->ranges[i];
	struct range *range = &dax_range->range;

	/* memory-block align the hotplug range */
	r->start = ALIGN(range->start, memory_block_size_bytes());
	r->end = ALIGN_DOWN(range->end + 1, memory_block_size_bytes()) - 1;
	if (r->start >= r->end) {
		r->start = range->start;
		r->end = range->end;
		return -ENOSPC;
	}
	return 0;
}
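/*
 * Worked example of the alignment above (assuming a 128 MiB
 * memory_block_size_bytes(), a common x86 configuration; the actual
 * block size is platform dependent): a device range of
 * 0x104000000-0x10bffffff is 128 MiB long but offset 64 MiB from a
 * block boundary. ALIGN() rounds the start up to 0x108000000 and
 * ALIGN_DOWN() rounds the end down to 0x107ffffff, leaving
 * start >= end, so the range is rejected with -ENOSPC. A
 * block-aligned range such as 0x100000000-0x10fffffff passes
 * through unchanged.
 */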
struct dax_kmem_data {
	const char *res_name;
	int mgid;
	struct resource *res[];
};

static struct memory_dev_type *dax_slowmem_type;

static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
{
	struct device *dev = &dev_dax->dev;
	unsigned long total_len = 0;
	struct dax_kmem_data *data;
	int i, rc, mapped = 0;
	int numa_node;

	/*
	 * Ensure good NUMA information for the persistent memory.
	 * Without this check, there is a risk that slow memory
	 * could be mixed in a node with faster memory, causing
	 * unavoidable performance issues.
	 */
	numa_node = dev_dax->target_node;
	if (numa_node < 0) {
		dev_warn(dev, "rejecting DAX region with invalid node: %d\n",
				numa_node);
		return -EINVAL;
	}

	for (i = 0; i < dev_dax->nr_range; i++) {
		struct range range;

		rc = dax_kmem_range(dev_dax, i, &range);
		if (rc) {
			dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n",
					i, range.start, range.end);
			continue;
		}
		total_len += range_len(&range);
	}

	if (!total_len) {
		dev_warn(dev, "rejecting DAX region without any memory after alignment\n");
		return -EINVAL;
	}

	init_node_memory_type(numa_node, dax_slowmem_type);

	rc = -ENOMEM;
	data = kzalloc(struct_size(data, res, dev_dax->nr_range), GFP_KERNEL);
	if (!data)
		goto err_dax_kmem_data;

	data->res_name = kstrdup(dev_name(dev), GFP_KERNEL);
	if (!data->res_name)
		goto err_res_name;

	rc = memory_group_register_static(numa_node, PFN_UP(total_len));
	if (rc < 0)
		goto err_reg_mgid;
	data->mgid = rc;

	for (i = 0; i < dev_dax->nr_range; i++) {
		struct resource *res;
		struct range range;

		rc = dax_kmem_range(dev_dax, i, &range);
		if (rc)
			continue;

		/* Region is permanently reserved if hotremove fails. */
		res = request_mem_region(range.start, range_len(&range), data->res_name);
		if (!res) {
			dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve region\n",
					i, range.start, range.end);
			/*
			 * Once some memory has been onlined we can't
			 * assume that it can be un-onlined safely.
			 */
			if (mapped)
				continue;
			rc = -EBUSY;
			goto err_request_mem;
		}
		data->res[i] = res;

		/*
		 * Set flags appropriate for System RAM. Leave ..._BUSY clear
		 * so that add_memory() can add a child resource. Do not
		 * inherit flags from the parent since it may set new flags
		 * unknown to us that will break add_memory() below.
		 */
		res->flags = IORESOURCE_SYSTEM_RAM;

		/*
		 * Ensure that future kexec'd kernels will not treat
		 * this as RAM automatically.
		 */
		rc = add_memory_driver_managed(data->mgid, range.start,
				range_len(&range), kmem_name, MHP_NID_IS_MGID);

		if (rc) {
			dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n",
					i, range.start, range.end);
			remove_resource(res);
			kfree(res);
			data->res[i] = NULL;
			if (mapped)
				continue;
			goto err_request_mem;
		}
		mapped++;
	}

	dev_set_drvdata(dev, data);

	return 0;

err_request_mem:
	memory_group_unregister(data->mgid);
err_reg_mgid:
	kfree(data->res_name);
err_res_name:
	kfree(data);
err_dax_kmem_data:
	clear_node_memory_type(numa_node, dax_slowmem_type);
	return rc;
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
{
	int i, success = 0;
	int node = dev_dax->target_node;
	struct device *dev = &dev_dax->dev;
	struct dax_kmem_data *data = dev_get_drvdata(dev);

	/*
	 * We have one shot for removing memory: if some memory blocks
	 * were not offlined prior to calling this function,
	 * remove_memory() will fail, and there is no way to hotremove
	 * this memory until reboot because device unbind will succeed
	 * even if we return failure.
	 */
	for (i = 0; i < dev_dax->nr_range; i++) {
		struct range range;
		int rc;

		rc = dax_kmem_range(dev_dax, i, &range);
		if (rc)
			continue;

		rc = remove_memory(range.start, range_len(&range));
		if (rc == 0) {
			remove_resource(data->res[i]);
			kfree(data->res[i]);
			data->res[i] = NULL;
			success++;
			continue;
		}
		any_hotremove_failed = true;
		dev_err(dev,
			"mapping%d: %#llx-%#llx cannot be hotremoved until the next reboot\n",
			i, range.start, range.end);
	}

	if (success >= dev_dax->nr_range) {
		memory_group_unregister(data->mgid);
		kfree(data->res_name);
		kfree(data);
		dev_set_drvdata(dev, NULL);
		/*
		 * Clear the memtype association on successful unplug.
		 * If not, we have memory blocks left which can be
		 * offlined/onlined later. We need to keep memory_dev_type
		 * for that. This implies this reference will be around
		 * till next reboot.
		 */
		clear_node_memory_type(node, dax_slowmem_type);
	}
}
#else
static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
{
	/*
	 * Without hotremove, purposely leak the request_mem_region()
	 * for the device-dax range. Removal of the device from the
	 * driver always succeeds, but the region stays permanently
	 * pinned as reserved by the unreleased request_mem_region().
	 */
	any_hotremove_failed = true;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
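/*
 * A minimal sketch of the expected hot-unplug sequence from userspace
 * (assuming the device is dax0.0 and its blocks can be offlined; the
 * memoryN block numbers here are hypothetical):
 *
 *   echo offline > /sys/devices/system/memory/memoryN/state
 *   echo dax0.0 > /sys/bus/dax/drivers/kmem/unbind
 *
 * If any block stays online, dev_dax_kmem_remove() above leaves the
 * corresponding range pinned until the next reboot.
 */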
static struct dax_device_driver device_dax_kmem_driver = {
	.probe = dev_dax_kmem_probe,
	.remove = dev_dax_kmem_remove,
	.type = DAXDRV_KMEM_TYPE,
};

static int __init dax_kmem_init(void)
{
	int rc;

	/* Resource name is permanently allocated if any hotremove fails. */
	kmem_name = kstrdup_const("System RAM (kmem)", GFP_KERNEL);
	if (!kmem_name)
		return -ENOMEM;

	dax_slowmem_type = alloc_memory_type(MEMTIER_DEFAULT_DAX_ADISTANCE);
	if (IS_ERR(dax_slowmem_type)) {
		rc = PTR_ERR(dax_slowmem_type);
		goto err_dax_slowmem_type;
	}

	rc = dax_driver_register(&device_dax_kmem_driver);
	if (rc)
		goto error_dax_driver;

	return rc;

error_dax_driver:
	destroy_memory_type(dax_slowmem_type);
err_dax_slowmem_type:
	kfree_const(kmem_name);
	return rc;
}

static void __exit dax_kmem_exit(void)
{
	dax_driver_unregister(&device_dax_kmem_driver);
	if (!any_hotremove_failed)
		kfree_const(kmem_name);
	destroy_memory_type(dax_slowmem_type);
}

MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
module_init(dax_kmem_init);
module_exit(dax_kmem_exit);
MODULE_ALIAS_DAX_DEVICE(0);
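/*
 * Typical way to hand a device-dax range to this driver from
 * userspace (assuming the daxctl tool from the ndctl project is
 * installed and the device is dax0.0):
 *
 *   daxctl reconfigure-device --mode=system-ram dax0.0
 *
 * This rebinds dax0.0 from the device_dax driver to kmem and onlines
 * the resulting memory blocks as system RAM.
 */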