xref: /openbmc/linux/mm/memory-tiers.c (revision 32008027)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/slab.h>
3 #include <linux/lockdep.h>
4 #include <linux/sysfs.h>
5 #include <linux/kobject.h>
6 #include <linux/memory.h>
7 #include <linux/memory-tiers.h>
8 
9 #include "internal.h"
10 
11 struct memory_tier {
12 	/* hierarchy of memory tiers */
13 	struct list_head list;
14 	/* list of all memory types part of this tier */
15 	struct list_head memory_types;
16 	/*
17 	 * Start value of abstract distance. A memory tier maps
18 	 * an abstract distance range,
19 	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE.
20 	 */
21 	int adistance_start;
22 	/* All the nodes that are part of all the lower memory tiers. */
23 	nodemask_t lower_tier_mask;
24 };
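
/*
 * Illustrative note (a sketch derived from the definitions above and in
 * include/linux/memory-tiers.h; the concrete constant values may differ):
 * each tier covers one MEMTIER_CHUNK_SIZE-wide abstract distance range, so
 * a memory_dev_type is placed in the tier whose range contains its
 * adistance:
 *
 *	adistance_start = round_down(memtype->adistance, MEMTIER_CHUNK_SIZE);
 *	// memtype belongs to the tier covering
 *	// [adistance_start, adistance_start + MEMTIER_CHUNK_SIZE)
 *
 * Device types whose adistance values fall into the same chunk therefore
 * share a memory tier.
 */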
25 
26 struct demotion_nodes {
27 	nodemask_t preferred;
28 };
29 
30 struct node_memory_type_map {
31 	struct memory_dev_type *memtype;
32 	int map_count;
33 };
34 
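/*
 * memory_tier_lock serialises updates to everything below: the memory_tiers
 * list, the per-node memory types, default_dram_type and (under
 * CONFIG_MIGRATION) node_demotion[]. Hot-path readers such as
 * next_demotion_node() and node_get_allowed_targets() use RCU instead.
 */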
35 static DEFINE_MUTEX(memory_tier_lock);
36 static LIST_HEAD(memory_tiers);
37 static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
38 static struct memory_dev_type *default_dram_type;
39 #ifdef CONFIG_MIGRATION
40 /*
41  * node_demotion[] examples:
42  *
43  * Example 1:
44  *
45  * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
46  *
47  * node distances:
48  * node   0    1    2    3
49  *    0  10   20   30   40
50  *    1  20   10   40   30
51  *    2  30   40   10   40
52  *    3  40   30   40   10
53  *
54  * memory_tiers0 = 0-1
55  * memory_tiers1 = 2-3
56  *
57  * node_demotion[0].preferred = 2
58  * node_demotion[1].preferred = 3
59  * node_demotion[2].preferred = <empty>
60  * node_demotion[3].preferred = <empty>
61  *
62  * Example 2:
63  *
64  * Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node.
65  *
66  * node distances:
67  * node   0    1    2
68  *    0  10   20   30
69  *    1  20   10   30
70  *    2  30   30   10
71  *
72  * memory_tiers0 = 0-2
73  *
74  * node_demotion[0].preferred = <empty>
75  * node_demotion[1].preferred = <empty>
76  * node_demotion[2].preferred = <empty>
77  *
78  * Example 3:
79  *
80  * Node 0 is CPU + DRAM nodes, Node 1 is HBM node, node 2 is PMEM node.
81  *
82  * node distances:
83  * node   0    1    2
84  *    0  10   20   30
85  *    1  20   10   40
86  *    2  30   40   10
87  *
88  * memory_tiers0 = 1
89  * memory_tiers1 = 0
90  * memory_tiers2 = 2
91  *
92  * node_demotion[0].preferred = 2
93  * node_demotion[1].preferred = 0
94  * node_demotion[2].preferred = <empty>
95  *
96  */
97 static struct demotion_nodes *node_demotion __read_mostly;
98 #endif /* CONFIG_MIGRATION */
99 
100 static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
101 {
102 	bool found_slot = false;
103 	struct memory_tier *memtier, *new_memtier;
104 	int adistance = memtype->adistance;
105 	unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;
106 
107 	lockdep_assert_held_once(&memory_tier_lock);
108 
109 	adistance = round_down(adistance, memtier_adistance_chunk_size);
110 	/*
111 	 * If the memtype is already part of a memory tier,
112 	 * just return that.
113 	 */
114 	if (!list_empty(&memtype->tier_sibiling)) {
115 		list_for_each_entry(memtier, &memory_tiers, list) {
116 			if (adistance == memtier->adistance_start)
117 				return memtier;
118 		}
119 		WARN_ON(1);
120 		return ERR_PTR(-EINVAL);
121 	}
122 
123 	list_for_each_entry(memtier, &memory_tiers, list) {
124 		if (adistance == memtier->adistance_start) {
125 			list_add(&memtype->tier_sibiling, &memtier->memory_types);
126 			return memtier;
127 		} else if (adistance < memtier->adistance_start) {
128 			found_slot = true;
129 			break;
130 		}
131 	}
132 
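	/*
	 * No existing tier covers this abstract distance: allocate a new tier
	 * and insert it so that the memory_tiers list stays sorted by
	 * adistance_start in ascending order (fastest tiers first).
	 */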
133 	new_memtier = kmalloc(sizeof(struct memory_tier), GFP_KERNEL);
134 	if (!new_memtier)
135 		return ERR_PTR(-ENOMEM);
136 
137 	new_memtier->adistance_start = adistance;
138 	INIT_LIST_HEAD(&new_memtier->list);
139 	INIT_LIST_HEAD(&new_memtier->memory_types);
140 	if (found_slot)
141 		list_add_tail(&new_memtier->list, &memtier->list);
142 	else
143 		list_add_tail(&new_memtier->list, &memory_tiers);
144 	list_add(&memtype->tier_sibiling, &new_memtier->memory_types);
145 	return new_memtier;
146 }
147 
148 static struct memory_tier *__node_get_memory_tier(int node)
149 {
150 	pg_data_t *pgdat;
151 
152 	pgdat = NODE_DATA(node);
153 	if (!pgdat)
154 		return NULL;
155 	/*
156 	 * Since we hold memory_tier_lock, we can avoid
157 	 * RCU read locks when accessing the details. No
158 	 * parallel updates are possible here.
159 	 */
160 	return rcu_dereference_check(pgdat->memtier,
161 				     lockdep_is_held(&memory_tier_lock));
162 }
163 
164 #ifdef CONFIG_MIGRATION
165 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
166 {
167 	struct memory_tier *memtier;
168 
169 	/*
170 	 * pg_data_t.memtier updates include a synchronize_rcu(),
171 	 * which ensures that we either find NULL or a valid memtier
172 	 * in NODE_DATA. Protect the access via rcu_read_lock().
173 	 */
174 	rcu_read_lock();
175 	memtier = rcu_dereference(pgdat->memtier);
176 	if (memtier)
177 		*targets = memtier->lower_tier_mask;
178 	else
179 		*targets = NODE_MASK_NONE;
180 	rcu_read_unlock();
181 }
182 
183 /**
184  * next_demotion_node() - Get the next node in the demotion path
185  * @node: The starting node to lookup the next node
186  *
187  * Return: node id for next memory node in the demotion path hierarchy
188  * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
189  * @node online or guarantee that it *continues* to be the next demotion
190  * target.
191  */
192 int next_demotion_node(int node)
193 {
194 	struct demotion_nodes *nd;
195 	int target;
196 
197 	if (!node_demotion)
198 		return NUMA_NO_NODE;
199 
200 	nd = &node_demotion[node];
201 
202 	/*
203 	 * node_demotion[] is updated without excluding this
204 	 * function from running.
205 	 *
206 	 * Make sure to use RCU over entire code blocks if
207 	 * node_demotion[] reads need to be consistent.
208 	 */
209 	rcu_read_lock();
210 	/*
211 	 * If there are multiple target nodes, just select one
212 	 * target node randomly.
213 	 *
214 	 * In addition, we can also use round-robin to select
215 	 * target node, but we should introduce another variable
216 	 * for node_demotion[] to record last selected target node,
217 	 * that may cause cache ping-pong due to the changing of
218 	 * last target node. Or introducing per-cpu data to avoid
219 	 * caching issue, which seems more complicated. So selecting
220 	 * target node randomly seems better until now.
221 	 */
222 	target = node_random(&nd->preferred);
223 	rcu_read_unlock();
224 
225 	return target;
226 }
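
/*
 * Simplified usage sketch (loosely modelled on the reclaim/demotion path;
 * not a verbatim copy of that code):
 *
 *	int target_nid = next_demotion_node(folio_nid(folio));
 *
 *	if (target_nid == NUMA_NO_NODE)
 *		return false;		// no lower tier, cannot demote
 *	// otherwise allocate the destination folio on target_nid
 */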
227 
228 static void disable_all_demotion_targets(void)
229 {
230 	struct memory_tier *memtier;
231 	int node;
232 
233 	for_each_node_state(node, N_MEMORY) {
234 		node_demotion[node].preferred = NODE_MASK_NONE;
235 		/*
236 		 * We are holding memory_tier_lock, so it is safe
237 		 * to access pgdat->memtier.
238 		 */
239 		memtier = __node_get_memory_tier(node);
240 		if (memtier)
241 			memtier->lower_tier_mask = NODE_MASK_NONE;
242 	}
243 	/*
244 	 * Ensure that the "disable" is visible across the system.
245 	 * Readers will see either a combination of before+disable
246 	 * state or disable+after.  They will never see before and
247 	 * after state together.
248 	 */
249 	synchronize_rcu();
250 }
251 
252 static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
253 {
254 	nodemask_t nodes = NODE_MASK_NONE;
255 	struct memory_dev_type *memtype;
256 
257 	list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
258 		nodes_or(nodes, nodes, memtype->nodes);
259 
260 	return nodes;
261 }
262 
263 /*
264  * Find an automatic demotion target for all memory
265  * nodes. Failing here is OK.  It might just indicate
266  * being at the end of a chain.
267  */
268 static void establish_demotion_targets(void)
269 {
270 	struct memory_tier *memtier;
271 	struct demotion_nodes *nd;
272 	int target = NUMA_NO_NODE, node;
273 	int distance, best_distance;
274 	nodemask_t tier_nodes, lower_tier;
275 
276 	lockdep_assert_held_once(&memory_tier_lock);
277 
278 	if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION))
279 		return;
280 
281 	disable_all_demotion_targets();
282 
283 	for_each_node_state(node, N_MEMORY) {
284 		best_distance = -1;
285 		nd = &node_demotion[node];
286 
287 		memtier = __node_get_memory_tier(node);
288 		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
289 			continue;
290 		/*
291 		 * Get the lower memtier to find the demotion node list.
292 		 */
293 		memtier = list_next_entry(memtier, list);
294 		tier_nodes = get_memtier_nodemask(memtier);
295 		/*
296 		 * find_next_best_node() uses the 'used' nodemask as a skip list.
297 		 * Add all memory nodes except the selected memory tier
298 		 * nodelist to the skip list so that we find the best node
299 		 * from the memtier nodelist.
300 		 */
301 		nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);
302 
303 		/*
304 		 * Find all the nodes in the memory tier node list with the same
305 		 * best distance and add them to the preferred mask. We randomly
306 		 * select between preferred nodes when allocating pages during demotion.
307 		 */
308 		do {
309 			target = find_next_best_node(node, &tier_nodes);
310 			if (target == NUMA_NO_NODE)
311 				break;
312 
313 			distance = node_distance(node, target);
314 			if (distance == best_distance || best_distance == -1) {
315 				best_distance = distance;
316 				node_set(target, nd->preferred);
317 			} else {
318 				break;
319 			}
320 		} while (1);
321 	}
322 	/*
323 	 * Now build the lower_tier mask for each node, collecting the node mask
324 	 * from all memory tiers below it. This allows us to fall back demotion
325 	 * page allocation to a set of nodes that is close to the above selected
326 	 * preferred node.
327 	 */
328 	lower_tier = node_states[N_MEMORY];
329 	list_for_each_entry(memtier, &memory_tiers, list) {
330 		/*
331 		 * Keep removing the current tier from the lower_tier nodes.
332 		 * This removes all nodes in the current and above
333 		 * memory tiers from the lower_tier mask.
334 		 */
335 		tier_nodes = get_memtier_nodemask(memtier);
336 		nodes_andnot(lower_tier, lower_tier, tier_nodes);
337 		memtier->lower_tier_mask = lower_tier;
338 	}
339 }
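
/*
 * Worked illustration using Example 1 from the node_demotion[] comment at
 * the top of this file (nodes 0-1 are DRAM, nodes 2-3 are PMEM in the next
 * lower tier). After establish_demotion_targets():
 *
 *	node_demotion[0].preferred == {2}	(distance 30 beats 40)
 *	node_demotion[1].preferred == {3}
 *	DRAM tier lower_tier_mask  == {2,3}
 *	PMEM tier lower_tier_mask  == {}	(no tier below it)
 */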
340 
341 #else
342 static inline void disable_all_demotion_targets(void) {}
343 static inline void establish_demotion_targets(void) {}
344 #endif /* CONFIG_MIGRATION */
345 
346 static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
347 {
348 	if (!node_memory_types[node].memtype)
349 		node_memory_types[node].memtype = memtype;
350 	/*
351 	 * For each device added to the same NUMA node with this
352 	 * specific memtype, bump the map count. We only take a
353 	 * memtype device reference once, so that changing a node's
354 	 * memtype can be done by dropping the single reference
355 	 * count taken here.
356 	 */
357 
358 	if (node_memory_types[node].memtype == memtype) {
359 		if (!node_memory_types[node].map_count++)
360 			kref_get(&memtype->kref);
361 	}
362 }
363 
364 static struct memory_tier *set_node_memory_tier(int node)
365 {
366 	struct memory_tier *memtier;
367 	struct memory_dev_type *memtype;
368 	pg_data_t *pgdat = NODE_DATA(node);
369 
370 
371 	lockdep_assert_held_once(&memory_tier_lock);
372 
373 	if (!node_state(node, N_MEMORY))
374 		return ERR_PTR(-EINVAL);
375 
376 	__init_node_memory_type(node, default_dram_type);
377 
378 	memtype = node_memory_types[node].memtype;
379 	node_set(node, memtype->nodes);
380 	memtier = find_create_memory_tier(memtype);
381 	if (!IS_ERR(memtier))
382 		rcu_assign_pointer(pgdat->memtier, memtier);
383 	return memtier;
384 }
385 
386 static void destroy_memory_tier(struct memory_tier *memtier)
387 {
388 	list_del(&memtier->list);
389 	/*
390 	 * synchronize_rcu in clear_node_memory_tier makes sure
391 	 * we don't have rcu access to this memory tier.
392 	 */
393 	kfree(memtier);
394 }
395 
396 static bool clear_node_memory_tier(int node)
397 {
398 	bool cleared = false;
399 	pg_data_t *pgdat;
400 	struct memory_tier *memtier;
401 
402 	pgdat = NODE_DATA(node);
403 	if (!pgdat)
404 		return false;
405 
406 	/*
407 	 * Make sure that anybody looking at NODE_DATA who finds
408 	 * a valid memtier finds memory_dev_types with nodes still
409 	 * linked to the memtier. We achieve this by waiting for
410 	 * the RCU read section to finish using synchronize_rcu().
411 	 * This also enables us to free the destroyed memory tier
412 	 * with kfree() instead of kfree_rcu().
413 	 */
414 	memtier = __node_get_memory_tier(node);
415 	if (memtier) {
416 		struct memory_dev_type *memtype;
417 
418 		rcu_assign_pointer(pgdat->memtier, NULL);
419 		synchronize_rcu();
420 		memtype = node_memory_types[node].memtype;
421 		node_clear(node, memtype->nodes);
422 		if (nodes_empty(memtype->nodes)) {
423 			list_del_init(&memtype->tier_sibiling);
424 			if (list_empty(&memtier->memory_types))
425 				destroy_memory_tier(memtier);
426 		}
427 		cleared = true;
428 	}
429 	return cleared;
430 }
431 
432 static void release_memtype(struct kref *kref)
433 {
434 	struct memory_dev_type *memtype;
435 
436 	memtype = container_of(kref, struct memory_dev_type, kref);
437 	kfree(memtype);
438 }
439 
440 struct memory_dev_type *alloc_memory_type(int adistance)
441 {
442 	struct memory_dev_type *memtype;
443 
444 	memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
445 	if (!memtype)
446 		return ERR_PTR(-ENOMEM);
447 
448 	memtype->adistance = adistance;
449 	INIT_LIST_HEAD(&memtype->tier_sibiling);
450 	memtype->nodes = NODE_MASK_NONE;
451 	kref_init(&memtype->kref);
452 	return memtype;
453 }
454 EXPORT_SYMBOL_GPL(alloc_memory_type);
455 
456 void destroy_memory_type(struct memory_dev_type *memtype)
457 {
458 	kref_put(&memtype->kref, release_memtype);
459 }
460 EXPORT_SYMBOL_GPL(destroy_memory_type);
461 
462 void init_node_memory_type(int node, struct memory_dev_type *memtype)
463 {
464 
465 	mutex_lock(&memory_tier_lock);
466 	__init_node_memory_type(node, memtype);
467 	mutex_unlock(&memory_tier_lock);
468 }
469 EXPORT_SYMBOL_GPL(init_node_memory_type);
470 
471 void clear_node_memory_type(int node, struct memory_dev_type *memtype)
472 {
473 	mutex_lock(&memory_tier_lock);
474 	if (node_memory_types[node].memtype == memtype)
475 		node_memory_types[node].map_count--;
476 	/*
477 	 * If we unmapped all the attached devices to this node,
478 	 * clear the node memory type.
479 	 */
480 	if (!node_memory_types[node].map_count) {
481 		node_memory_types[node].memtype = NULL;
482 		kref_put(&memtype->kref, release_memtype);
483 	}
484 	mutex_unlock(&memory_tier_lock);
485 }
486 EXPORT_SYMBOL_GPL(clear_node_memory_type);
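
/*
 * Sketch of how a memory device driver is expected to pair the exported
 * helpers above (loosely modelled on drivers/dax/kmem.c; "my_adistance",
 * "mtype" and "nid" are illustrative placeholders, not kernel symbols):
 *
 *	probe:
 *		mtype = alloc_memory_type(my_adistance);
 *		if (IS_ERR(mtype))
 *			return PTR_ERR(mtype);
 *		init_node_memory_type(nid, mtype);
 *		// ... hot-add/online the device memory on nid ...
 *
 *	remove:
 *		clear_node_memory_type(nid, mtype);
 *		destroy_memory_type(mtype);
 */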
487 
488 static int __meminit memtier_hotplug_callback(struct notifier_block *self,
489 					      unsigned long action, void *_arg)
490 {
491 	struct memory_tier *memtier;
492 	struct memory_notify *arg = _arg;
493 
494 	/*
495 	 * Only update the node migration order when a node is
496 	 * changing status, like online->offline.
497 	 */
498 	if (arg->status_change_nid < 0)
499 		return notifier_from_errno(0);
500 
501 	switch (action) {
502 	case MEM_OFFLINE:
503 		mutex_lock(&memory_tier_lock);
504 		if (clear_node_memory_tier(arg->status_change_nid))
505 			establish_demotion_targets();
506 		mutex_unlock(&memory_tier_lock);
507 		break;
508 	case MEM_ONLINE:
509 		mutex_lock(&memory_tier_lock);
510 		memtier = set_node_memory_tier(arg->status_change_nid);
511 		if (!IS_ERR(memtier))
512 			establish_demotion_targets();
513 		mutex_unlock(&memory_tier_lock);
514 		break;
515 	}
516 
517 	return notifier_from_errno(0);
518 }
519 
520 static int __init memory_tier_init(void)
521 {
522 	int node;
523 	struct memory_tier *memtier;
524 
525 #ifdef CONFIG_MIGRATION
526 	node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
527 				GFP_KERNEL);
528 	WARN_ON(!node_demotion);
529 #endif
530 	mutex_lock(&memory_tier_lock);
531 	/*
532 	 * For now we can have 4 faster memory tiers with smaller adistance
533 	 * than the default DRAM tier.
534 	 */
535 	default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
536 	if (IS_ERR(default_dram_type))
537 		panic("%s() failed to allocate default DRAM tier\n", __func__);
538 
539 	/*
540 	 * Look at all the existing N_MEMORY nodes and add them to
541 	 * default memory tier or to a tier if we already have memory
542 	 * types assigned.
543 	 */
544 	for_each_node_state(node, N_MEMORY) {
545 		memtier = set_node_memory_tier(node);
546 		if (IS_ERR(memtier))
547 			/*
548 			 * Stop here and continue with the memtiers we were able to set up.
549 			 */
550 			break;
551 	}
552 	establish_demotion_targets();
553 	mutex_unlock(&memory_tier_lock);
554 
555 	hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRIO);
556 	return 0;
557 }
558 subsys_initcall(memory_tier_init);
559 
560 bool numa_demotion_enabled = false;
561 
562 #ifdef CONFIG_MIGRATION
563 #ifdef CONFIG_SYSFS
564 static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
565 					  struct kobj_attribute *attr, char *buf)
566 {
567 	return sysfs_emit(buf, "%s\n",
568 			  numa_demotion_enabled ? "true" : "false");
569 }
570 
571 static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
572 					   struct kobj_attribute *attr,
573 					   const char *buf, size_t count)
574 {
575 	ssize_t ret;
576 
577 	ret = kstrtobool(buf, &numa_demotion_enabled);
578 	if (ret)
579 		return ret;
580 
581 	return count;
582 }
583 
584 static struct kobj_attribute numa_demotion_enabled_attr =
585 	__ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
586 	       numa_demotion_enabled_store);
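
/*
 * Together with the "numa" kobject created below, this attribute shows up
 * as /sys/kernel/mm/numa/demotion_enabled. Example (shell, illustrative):
 *
 *	# cat /sys/kernel/mm/numa/demotion_enabled
 *	false
 *	# echo true > /sys/kernel/mm/numa/demotion_enabled
 */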
587 
588 static struct attribute *numa_attrs[] = {
589 	&numa_demotion_enabled_attr.attr,
590 	NULL,
591 };
592 
593 static const struct attribute_group numa_attr_group = {
594 	.attrs = numa_attrs,
595 };
596 
597 static int __init numa_init_sysfs(void)
598 {
599 	int err;
600 	struct kobject *numa_kobj;
601 
602 	numa_kobj = kobject_create_and_add("numa", mm_kobj);
603 	if (!numa_kobj) {
604 		pr_err("failed to create numa kobject\n");
605 		return -ENOMEM;
606 	}
607 	err = sysfs_create_group(numa_kobj, &numa_attr_group);
608 	if (err) {
609 		pr_err("failed to register numa group\n");
610 		goto delete_obj;
611 	}
612 	return 0;
613 
614 delete_obj:
615 	kobject_put(numa_kobj);
616 	return err;
617 }
618 subsys_initcall(numa_init_sysfs);
619 #endif /* CONFIG_SYSFS */
620 #endif
621