xref: /openbmc/linux/drivers/dax/kmem.c (revision bded67f8)
1c221c0b0SDave Hansen // SPDX-License-Identifier: GPL-2.0
2c221c0b0SDave Hansen /* Copyright(c) 2016-2019 Intel Corporation. All rights reserved. */
3c221c0b0SDave Hansen #include <linux/memremap.h>
4c221c0b0SDave Hansen #include <linux/pagemap.h>
5c221c0b0SDave Hansen #include <linux/memory.h>
6c221c0b0SDave Hansen #include <linux/module.h>
7c221c0b0SDave Hansen #include <linux/device.h>
8c221c0b0SDave Hansen #include <linux/pfn_t.h>
9c221c0b0SDave Hansen #include <linux/slab.h>
10c221c0b0SDave Hansen #include <linux/dax.h>
11c221c0b0SDave Hansen #include <linux/fs.h>
12c221c0b0SDave Hansen #include <linux/mm.h>
13c221c0b0SDave Hansen #include <linux/mman.h>
147b88bda3SAneesh Kumar K.V #include <linux/memory-tiers.h>
15c221c0b0SDave Hansen #include "dax-private.h"
16c221c0b0SDave Hansen #include "bus.h"
17c221c0b0SDave Hansen 
187b88bda3SAneesh Kumar K.V /*
197b88bda3SAneesh Kumar K.V  * Default abstract distance assigned to the NUMA node onlined
207b88bda3SAneesh Kumar K.V  * by DAX/kmem if the low level platform driver didn't initialize
217b88bda3SAneesh Kumar K.V  * one for this NUMA node.
227b88bda3SAneesh Kumar K.V  */
237b88bda3SAneesh Kumar K.V #define MEMTIER_DEFAULT_DAX_ADISTANCE	(MEMTIER_ADISTANCE_DRAM * 5)
247b88bda3SAneesh Kumar K.V 
258a725e46SDavid Hildenbrand /* Memory resource name used for add_memory_driver_managed(). */
268a725e46SDavid Hildenbrand static const char *kmem_name;
278a725e46SDavid Hildenbrand /* Set if any memory will remain added when the driver will be unloaded. */
288a725e46SDavid Hildenbrand static bool any_hotremove_failed;
298a725e46SDavid Hildenbrand 
dax_kmem_range(struct dev_dax * dev_dax,int i,struct range * r)3060e93dc0SDan Williams static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r)
3159bc8d10SDan Williams {
3260e93dc0SDan Williams 	struct dev_dax_range *dax_range = &dev_dax->ranges[i];
3360e93dc0SDan Williams 	struct range *range = &dax_range->range;
3459bc8d10SDan Williams 
3559bc8d10SDan Williams 	/* memory-block align the hotplug range */
3660e93dc0SDan Williams 	r->start = ALIGN(range->start, memory_block_size_bytes());
3760e93dc0SDan Williams 	r->end = ALIGN_DOWN(range->end + 1, memory_block_size_bytes()) - 1;
3860e93dc0SDan Williams 	if (r->start >= r->end) {
3960e93dc0SDan Williams 		r->start = range->start;
4060e93dc0SDan Williams 		r->end = range->end;
4160e93dc0SDan Williams 		return -ENOSPC;
4260e93dc0SDan Williams 	}
4360e93dc0SDan Williams 	return 0;
4459bc8d10SDan Williams }
4559bc8d10SDan Williams 
/* Per-device driver data, stored via dev_set_drvdata() by probe. */
struct dax_kmem_data {
	/* Copy of the device name, used as name of the reserved regions. */
	const char *res_name;
	/* Memory group id returned by memory_group_register_static(). */
	int mgid;
	/* One slot per device range; NULL where nothing is currently mapped. */
	struct resource *res[];
};

51a455aa72SDan Williams 
/* Memory type allocated in dax_kmem_init() and attached to nodes in probe. */
static struct memory_dev_type *dax_slowmem_type;
dev_dax_kmem_probe(struct dev_dax * dev_dax)53f11cf813SDan Williams static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
54c221c0b0SDave Hansen {
55f11cf813SDan Williams 	struct device *dev = &dev_dax->dev;
56eedf634aSDavid Hildenbrand 	unsigned long total_len = 0;
57a455aa72SDan Williams 	struct dax_kmem_data *data;
58eedf634aSDavid Hildenbrand 	int i, rc, mapped = 0;
59c221c0b0SDave Hansen 	int numa_node;
60c221c0b0SDave Hansen 
61c221c0b0SDave Hansen 	/*
62c221c0b0SDave Hansen 	 * Ensure good NUMA information for the persistent memory.
63c221c0b0SDave Hansen 	 * Without this check, there is a risk that slow memory
64c221c0b0SDave Hansen 	 * could be mixed in a node with faster memory, causing
65c221c0b0SDave Hansen 	 * unavoidable performance issues.
66c221c0b0SDave Hansen 	 */
67c221c0b0SDave Hansen 	numa_node = dev_dax->target_node;
68c221c0b0SDave Hansen 	if (numa_node < 0) {
69f5516ec5SDan Williams 		dev_warn(dev, "rejecting DAX region with invalid node: %d\n",
70f5516ec5SDan Williams 				numa_node);
71c221c0b0SDave Hansen 		return -EINVAL;
72c221c0b0SDave Hansen 	}
73c221c0b0SDave Hansen 
7460e93dc0SDan Williams 	for (i = 0; i < dev_dax->nr_range; i++) {
7560e93dc0SDan Williams 		struct range range;
7660e93dc0SDan Williams 
7760e93dc0SDan Williams 		rc = dax_kmem_range(dev_dax, i, &range);
7860e93dc0SDan Williams 		if (rc) {
7960e93dc0SDan Williams 			dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n",
8060e93dc0SDan Williams 					i, range.start, range.end);
8160e93dc0SDan Williams 			continue;
8260e93dc0SDan Williams 		}
83eedf634aSDavid Hildenbrand 		total_len += range_len(&range);
84eedf634aSDavid Hildenbrand 	}
85eedf634aSDavid Hildenbrand 
86eedf634aSDavid Hildenbrand 	if (!total_len) {
87eedf634aSDavid Hildenbrand 		dev_warn(dev, "rejecting DAX region without any memory after alignment\n");
88eedf634aSDavid Hildenbrand 		return -EINVAL;
89eedf634aSDavid Hildenbrand 	}
90eedf634aSDavid Hildenbrand 
917b88bda3SAneesh Kumar K.V 	init_node_memory_type(numa_node, dax_slowmem_type);
92eedf634aSDavid Hildenbrand 
93eedf634aSDavid Hildenbrand 	rc = -ENOMEM;
947b88bda3SAneesh Kumar K.V 	data = kzalloc(struct_size(data, res, dev_dax->nr_range), GFP_KERNEL);
957b88bda3SAneesh Kumar K.V 	if (!data)
967b88bda3SAneesh Kumar K.V 		goto err_dax_kmem_data;
977b88bda3SAneesh Kumar K.V 
98eedf634aSDavid Hildenbrand 	data->res_name = kstrdup(dev_name(dev), GFP_KERNEL);
99eedf634aSDavid Hildenbrand 	if (!data->res_name)
100eedf634aSDavid Hildenbrand 		goto err_res_name;
101eedf634aSDavid Hildenbrand 
10246e66dabSTarun Sahu 	rc = memory_group_register_static(numa_node, PFN_UP(total_len));
103eedf634aSDavid Hildenbrand 	if (rc < 0)
104eedf634aSDavid Hildenbrand 		goto err_reg_mgid;
105eedf634aSDavid Hildenbrand 	data->mgid = rc;
106eedf634aSDavid Hildenbrand 
107eedf634aSDavid Hildenbrand 	for (i = 0; i < dev_dax->nr_range; i++) {
108eedf634aSDavid Hildenbrand 		struct resource *res;
109eedf634aSDavid Hildenbrand 		struct range range;
110eedf634aSDavid Hildenbrand 
111eedf634aSDavid Hildenbrand 		rc = dax_kmem_range(dev_dax, i, &range);
112eedf634aSDavid Hildenbrand 		if (rc)
113eedf634aSDavid Hildenbrand 			continue;
11460e93dc0SDan Williams 
11560858c00SDavid Hildenbrand 		/* Region is permanently reserved if hotremove fails. */
116a455aa72SDan Williams 		res = request_mem_region(range.start, range_len(&range), data->res_name);
1170513bd5bSDan Williams 		if (!res) {
11860e93dc0SDan Williams 			dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve region\n",
11960e93dc0SDan Williams 					i, range.start, range.end);
12060e93dc0SDan Williams 			/*
12160e93dc0SDan Williams 			 * Once some memory has been onlined we can't
12260e93dc0SDan Williams 			 * assume that it can be un-onlined safely.
12360e93dc0SDan Williams 			 */
12460e93dc0SDan Williams 			if (mapped)
12560e93dc0SDan Williams 				continue;
126a455aa72SDan Williams 			rc = -EBUSY;
127a455aa72SDan Williams 			goto err_request_mem;
128c221c0b0SDave Hansen 		}
129a455aa72SDan Williams 		data->res[i] = res;
130c221c0b0SDave Hansen 
131c221c0b0SDave Hansen 		/*
132c221c0b0SDave Hansen 		 * Set flags appropriate for System RAM.  Leave ..._BUSY clear
133c221c0b0SDave Hansen 		 * so that add_memory() can add a child resource.  Do not
134c221c0b0SDave Hansen 		 * inherit flags from the parent since it may set new flags
135c221c0b0SDave Hansen 		 * unknown to us that will break add_memory() below.
136c221c0b0SDave Hansen 		 */
1370513bd5bSDan Williams 		res->flags = IORESOURCE_SYSTEM_RAM;
138c221c0b0SDave Hansen 
1398a725e46SDavid Hildenbrand 		/*
14060e93dc0SDan Williams 		 * Ensure that future kexec'd kernels will not treat
14160e93dc0SDan Williams 		 * this as RAM automatically.
1428a725e46SDavid Hildenbrand 		 */
143eedf634aSDavid Hildenbrand 		rc = add_memory_driver_managed(data->mgid, range.start,
144eedf634aSDavid Hildenbrand 				range_len(&range), kmem_name, MHP_NID_IS_MGID);
14560e93dc0SDan Williams 
14631e4ca92SPavel Tatashin 		if (rc) {
14760e93dc0SDan Williams 			dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n",
14860e93dc0SDan Williams 					i, range.start, range.end);
149e686c325SDan Williams 			remove_resource(res);
150a455aa72SDan Williams 			kfree(res);
151a455aa72SDan Williams 			data->res[i] = NULL;
15260e93dc0SDan Williams 			if (mapped)
15360e93dc0SDan Williams 				continue;
154a455aa72SDan Williams 			goto err_request_mem;
15531e4ca92SPavel Tatashin 		}
15660e93dc0SDan Williams 		mapped++;
15760e93dc0SDan Williams 	}
1587e6b431aSDan Williams 
159a455aa72SDan Williams 	dev_set_drvdata(dev, data);
160c221c0b0SDave Hansen 
161c221c0b0SDave Hansen 	return 0;
162a455aa72SDan Williams 
163a455aa72SDan Williams err_request_mem:
164eedf634aSDavid Hildenbrand 	memory_group_unregister(data->mgid);
165eedf634aSDavid Hildenbrand err_reg_mgid:
166a455aa72SDan Williams 	kfree(data->res_name);
167a455aa72SDan Williams err_res_name:
168a455aa72SDan Williams 	kfree(data);
1697b88bda3SAneesh Kumar K.V err_dax_kmem_data:
1707b88bda3SAneesh Kumar K.V 	clear_node_memory_type(numa_node, dax_slowmem_type);
171a455aa72SDan Williams 	return rc;
172c221c0b0SDave Hansen }
173c221c0b0SDave Hansen 
1749f960da7SPavel Tatashin #ifdef CONFIG_MEMORY_HOTREMOVE
dev_dax_kmem_remove(struct dev_dax * dev_dax)1750d519e0dSUwe Kleine-König static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
1769f960da7SPavel Tatashin {
17760e93dc0SDan Williams 	int i, success = 0;
1787b88bda3SAneesh Kumar K.V 	int node = dev_dax->target_node;
179f11cf813SDan Williams 	struct device *dev = &dev_dax->dev;
180a455aa72SDan Williams 	struct dax_kmem_data *data = dev_get_drvdata(dev);
1819f960da7SPavel Tatashin 
1829f960da7SPavel Tatashin 	/*
1839f960da7SPavel Tatashin 	 * We have one shot for removing memory, if some memory blocks were not
1849f960da7SPavel Tatashin 	 * offline prior to calling this function remove_memory() will fail, and
1859f960da7SPavel Tatashin 	 * there is no way to hotremove this memory until reboot because device
1869f960da7SPavel Tatashin 	 * unbind will succeed even if we return failure.
1879f960da7SPavel Tatashin 	 */
18860e93dc0SDan Williams 	for (i = 0; i < dev_dax->nr_range; i++) {
18960e93dc0SDan Williams 		struct range range;
19060e93dc0SDan Williams 		int rc;
19160e93dc0SDan Williams 
19260e93dc0SDan Williams 		rc = dax_kmem_range(dev_dax, i, &range);
19360e93dc0SDan Williams 		if (rc)
19460e93dc0SDan Williams 			continue;
19560e93dc0SDan Williams 
196e1c158e4SDavid Hildenbrand 		rc = remove_memory(range.start, range_len(&range));
19760e93dc0SDan Williams 		if (rc == 0) {
198e686c325SDan Williams 			remove_resource(data->res[i]);
199a455aa72SDan Williams 			kfree(data->res[i]);
200a455aa72SDan Williams 			data->res[i] = NULL;
20160e93dc0SDan Williams 			success++;
20260e93dc0SDan Williams 			continue;
20360e93dc0SDan Williams 		}
2048a725e46SDavid Hildenbrand 		any_hotremove_failed = true;
20560e93dc0SDan Williams 		dev_err(dev,
20660e93dc0SDan Williams 			"mapping%d: %#llx-%#llx cannot be hotremoved until the next reboot\n",
20760e93dc0SDan Williams 				i, range.start, range.end);
2089f960da7SPavel Tatashin 	}
2099f960da7SPavel Tatashin 
21060e93dc0SDan Williams 	if (success >= dev_dax->nr_range) {
211eedf634aSDavid Hildenbrand 		memory_group_unregister(data->mgid);
212a455aa72SDan Williams 		kfree(data->res_name);
213a455aa72SDan Williams 		kfree(data);
21460e93dc0SDan Williams 		dev_set_drvdata(dev, NULL);
2157b88bda3SAneesh Kumar K.V 		/*
2167b88bda3SAneesh Kumar K.V 		 * Clear the memtype association on successful unplug.
2177b88bda3SAneesh Kumar K.V 		 * If not, we have memory blocks left which can be
2187b88bda3SAneesh Kumar K.V 		 * offlined/onlined later. We need to keep memory_dev_type
2197b88bda3SAneesh Kumar K.V 		 * for that. This implies this reference will be around
2207b88bda3SAneesh Kumar K.V 		 * till next reboot.
2217b88bda3SAneesh Kumar K.V 		 */
2227b88bda3SAneesh Kumar K.V 		clear_node_memory_type(node, dax_slowmem_type);
22360e93dc0SDan Williams 	}
2249f960da7SPavel Tatashin }
2259f960da7SPavel Tatashin #else
dev_dax_kmem_remove(struct dev_dax * dev_dax)2260d519e0dSUwe Kleine-König static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
227c221c0b0SDave Hansen {
228c221c0b0SDave Hansen 	/*
2299f960da7SPavel Tatashin 	 * Without hotremove purposely leak the request_mem_region() for the
2309f960da7SPavel Tatashin 	 * device-dax range and return '0' to ->remove() attempts. The removal
2319f960da7SPavel Tatashin 	 * of the device from the driver always succeeds, but the region is
2329f960da7SPavel Tatashin 	 * permanently pinned as reserved by the unreleased
233c221c0b0SDave Hansen 	 * request_mem_region().
234c221c0b0SDave Hansen 	 */
2358a725e46SDavid Hildenbrand 	any_hotremove_failed = true;
236c221c0b0SDave Hansen }
2379f960da7SPavel Tatashin #endif /* CONFIG_MEMORY_HOTREMOVE */
238c221c0b0SDave Hansen 
239c221c0b0SDave Hansen static struct dax_device_driver device_dax_kmem_driver = {
240c221c0b0SDave Hansen 	.probe = dev_dax_kmem_probe,
241c221c0b0SDave Hansen 	.remove = dev_dax_kmem_remove,
242e9ee9fe3SDan Williams 	.type = DAXDRV_KMEM_TYPE,
243c221c0b0SDave Hansen };
244c221c0b0SDave Hansen 
dax_kmem_init(void)245c221c0b0SDave Hansen static int __init dax_kmem_init(void)
246c221c0b0SDave Hansen {
2478a725e46SDavid Hildenbrand 	int rc;
2488a725e46SDavid Hildenbrand 
2498a725e46SDavid Hildenbrand 	/* Resource name is permanently allocated if any hotremove fails. */
2508a725e46SDavid Hildenbrand 	kmem_name = kstrdup_const("System RAM (kmem)", GFP_KERNEL);
2518a725e46SDavid Hildenbrand 	if (!kmem_name)
2528a725e46SDavid Hildenbrand 		return -ENOMEM;
2538a725e46SDavid Hildenbrand 
2547b88bda3SAneesh Kumar K.V 	dax_slowmem_type = alloc_memory_type(MEMTIER_DEFAULT_DAX_ADISTANCE);
2557b88bda3SAneesh Kumar K.V 	if (IS_ERR(dax_slowmem_type)) {
2567b88bda3SAneesh Kumar K.V 		rc = PTR_ERR(dax_slowmem_type);
2577b88bda3SAneesh Kumar K.V 		goto err_dax_slowmem_type;
2587b88bda3SAneesh Kumar K.V 	}
2597b88bda3SAneesh Kumar K.V 
2608a725e46SDavid Hildenbrand 	rc = dax_driver_register(&device_dax_kmem_driver);
2618a725e46SDavid Hildenbrand 	if (rc)
2627b88bda3SAneesh Kumar K.V 		goto error_dax_driver;
2637b88bda3SAneesh Kumar K.V 
2647b88bda3SAneesh Kumar K.V 	return rc;
2657b88bda3SAneesh Kumar K.V 
2667b88bda3SAneesh Kumar K.V error_dax_driver:
267*bded67f8SMiaohe Lin 	put_memory_type(dax_slowmem_type);
2687b88bda3SAneesh Kumar K.V err_dax_slowmem_type:
2698a725e46SDavid Hildenbrand 	kfree_const(kmem_name);
2708a725e46SDavid Hildenbrand 	return rc;
271c221c0b0SDave Hansen }
272c221c0b0SDave Hansen 
dax_kmem_exit(void)273c221c0b0SDave Hansen static void __exit dax_kmem_exit(void)
274c221c0b0SDave Hansen {
275c221c0b0SDave Hansen 	dax_driver_unregister(&device_dax_kmem_driver);
2768a725e46SDavid Hildenbrand 	if (!any_hotremove_failed)
2778a725e46SDavid Hildenbrand 		kfree_const(kmem_name);
278*bded67f8SMiaohe Lin 	put_memory_type(dax_slowmem_type);
279c221c0b0SDave Hansen }
280c221c0b0SDave Hansen 
281c221c0b0SDave Hansen MODULE_AUTHOR("Intel Corporation");
282c221c0b0SDave Hansen MODULE_LICENSE("GPL v2");
283c221c0b0SDave Hansen module_init(dax_kmem_init);
284c221c0b0SDave Hansen module_exit(dax_kmem_exit);
285c221c0b0SDave Hansen MODULE_ALIAS_DAX_DEVICE(0);
286