xref: /openbmc/linux/mm/memory-tiers.c (revision 467b171af881282fc627328e6c164f044a6df888)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/slab.h>
3 #include <linux/lockdep.h>
4 #include <linux/sysfs.h>
5 #include <linux/kobject.h>
6 #include <linux/memory.h>
7 #include <linux/memory-tiers.h>
8 
9 #include "internal.h"
10 
11 struct memory_tier {
12 	/* hierarchy of memory tiers */
13 	struct list_head list;
14 	/* list of all memory types part of this tier */
15 	struct list_head memory_types;
16 	/*
17 	 * Start value of abstract distance. A memory tier maps
18 	 * an abstract distance range,
19 	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
20 	 */
21 	int adistance_start;
22 	/* All the nodes that are part of all the lower memory tiers. */
23 	nodemask_t lower_tier_mask;
24 };
25 
26 struct demotion_nodes {
27 	nodemask_t preferred;
28 };
29 
30 struct node_memory_type_map {
31 	struct memory_dev_type *memtype;
32 	int map_count;
33 };
34 
35 static DEFINE_MUTEX(memory_tier_lock);
36 static LIST_HEAD(memory_tiers);
37 static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
38 static struct memory_dev_type *default_dram_type;
39 #ifdef CONFIG_MIGRATION
40 static int top_tier_adistance;
41 /*
42  * node_demotion[] examples:
43  *
44  * Example 1:
45  *
46  * Nodes 0 & 1 are CPU + DRAM nodes, nodes 2 & 3 are PMEM nodes.
47  *
48  * node distances:
49  * node   0    1    2    3
50  *    0  10   20   30   40
51  *    1  20   10   40   30
52  *    2  30   40   10   40
53  *    3  40   30   40   10
54  *
55  * memory_tiers0 = 0-1
56  * memory_tiers1 = 2-3
57  *
58  * node_demotion[0].preferred = 2
59  * node_demotion[1].preferred = 3
60  * node_demotion[2].preferred = <empty>
61  * node_demotion[3].preferred = <empty>
62  *
63  * Example 2:
64  *
65  * Nodes 0 & 1 are CPU + DRAM nodes, node 2 is a memory-only DRAM node.
66  *
67  * node distances:
68  * node   0    1    2
69  *    0  10   20   30
70  *    1  20   10   30
71  *    2  30   30   10
72  *
73  * memory_tiers0 = 0-2
74  *
75  * node_demotion[0].preferred = <empty>
76  * node_demotion[1].preferred = <empty>
77  * node_demotion[2].preferred = <empty>
78  *
79  * Example 3:
80  *
81  * Node 0 is a CPU + DRAM node, node 1 is an HBM node, node 2 is a PMEM node.
82  *
83  * node distances:
84  * node   0    1    2
85  *    0  10   20   30
86  *    1  20   10   40
87  *    2  30   40   10
88  *
89  * memory_tiers0 = 1
90  * memory_tiers1 = 0
91  * memory_tiers2 = 2
92  *
93  * node_demotion[0].preferred = 2
94  * node_demotion[1].preferred = 0
95  * node_demotion[2].preferred = <empty>
96  *
97  */
98 static struct demotion_nodes *node_demotion __read_mostly;
99 #endif /* CONFIG_MIGRATION */
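/*
 * Illustrative sketch (not compiled here): with the node layout of
 * Example 1 above, the state built by establish_demotion_targets()
 * would be consumed roughly like this:
 *
 *	int target = next_demotion_node(0);	// returns 2, the only node in
 *						// node_demotion[0].preferred
 *	nodemask_t allowed;
 *
 *	node_get_allowed_targets(NODE_DATA(0), &allowed);	// allowed = 2-3
 *	next_demotion_node(2);	// NUMA_NO_NODE: node 2 is in the lowest tier
 */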
100 
101 static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
102 {
103 	bool found_slot = false;
104 	struct memory_tier *memtier, *new_memtier;
105 	int adistance = memtype->adistance;
106 	unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;
107 
108 	lockdep_assert_held_once(&memory_tier_lock);
109 
110 	adistance = round_down(adistance, memtier_adistance_chunk_size);
111 	/*
112 	 * If the memtype is already part of a memory tier,
113 	 * just return that.
114 	 */
115 	if (!list_empty(&memtype->tier_sibiling)) {
116 		list_for_each_entry(memtier, &memory_tiers, list) {
117 			if (adistance == memtier->adistance_start)
118 				return memtier;
119 		}
120 		WARN_ON(1);
121 		return ERR_PTR(-EINVAL);
122 	}
123 
124 	list_for_each_entry(memtier, &memory_tiers, list) {
125 		if (adistance == memtier->adistance_start) {
126 			list_add(&memtype->tier_sibiling, &memtier->memory_types);
127 			return memtier;
128 		} else if (adistance < memtier->adistance_start) {
129 			found_slot = true;
130 			break;
131 		}
132 	}
133 
134 	new_memtier = kmalloc(sizeof(struct memory_tier), GFP_KERNEL);
135 	if (!new_memtier)
136 		return ERR_PTR(-ENOMEM);
137 
138 	new_memtier->adistance_start = adistance;
139 	INIT_LIST_HEAD(&new_memtier->list);
140 	INIT_LIST_HEAD(&new_memtier->memory_types);
141 	if (found_slot)
142 		list_add_tail(&new_memtier->list, &memtier->list);
143 	else
144 		list_add_tail(&new_memtier->list, &memory_tiers);
145 	list_add(&memtype->tier_sibiling, &new_memtier->memory_types);
146 	return new_memtier;
147 }
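/*
 * Illustrative note: because of the round_down() above, every
 * memory_dev_type whose abstract distance falls inside the same
 * MEMTIER_CHUNK_SIZE-aligned window shares one memory tier. A
 * hypothetical device type registered as
 *
 *	struct memory_dev_type *fast = alloc_memory_type(MEMTIER_ADISTANCE_DRAM - 1);
 *
 * would land on the same tier as default_dram_type, while an adistance
 * a full chunk larger would create (or join) the next, slower tier.
 */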
148 
149 static struct memory_tier *__node_get_memory_tier(int node)
150 {
151 	pg_data_t *pgdat;
152 
153 	pgdat = NODE_DATA(node);
154 	if (!pgdat)
155 		return NULL;
156 	/*
157 	 * Since we hold memory_tier_lock, we can avoid
158 	 * RCU read locks when accessing the details. No
159 	 * parallel updates are possible here.
160 	 */
161 	return rcu_dereference_check(pgdat->memtier,
162 				     lockdep_is_held(&memory_tier_lock));
163 }
164 
165 #ifdef CONFIG_MIGRATION
166 bool node_is_toptier(int node)
167 {
168 	bool toptier;
169 	pg_data_t *pgdat;
170 	struct memory_tier *memtier;
171 
172 	pgdat = NODE_DATA(node);
173 	if (!pgdat)
174 		return false;
175 
176 	rcu_read_lock();
177 	memtier = rcu_dereference(pgdat->memtier);
178 	if (!memtier) {
179 		toptier = true;
180 		goto out;
181 	}
182 	if (memtier->adistance_start <= top_tier_adistance)
183 		toptier = true;
184 	else
185 		toptier = false;
186 out:
187 	rcu_read_unlock();
188 	return toptier;
189 }
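/*
 * Illustrative note: with Example 3 above, the reverse scan in
 * establish_demotion_targets() stops at the DRAM tier because node 0
 * has CPUs, so top_tier_adistance covers both the HBM and DRAM tiers:
 *
 *	node_is_toptier(0);	// true  - DRAM tier, ends at top_tier_adistance
 *	node_is_toptier(1);	// true  - HBM tier, even smaller adistance
 *	node_is_toptier(2);	// false - PMEM tier sits below the top tier
 */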
190 
191 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
192 {
193 	struct memory_tier *memtier;
194 
195 	/*
196 	 * pg_data_t.memtier updates include a synchronize_rcu()
197 	 * which ensures that we either find NULL or a valid memtier
198 	 * in NODE_DATA. Protect the access via rcu_read_lock().
199 	 */
200 	rcu_read_lock();
201 	memtier = rcu_dereference(pgdat->memtier);
202 	if (memtier)
203 		*targets = memtier->lower_tier_mask;
204 	else
205 		*targets = NODE_MASK_NONE;
206 	rcu_read_unlock();
207 }
208 
209 /**
210  * next_demotion_node() - Get the next node in the demotion path
211  * @node: The starting node to lookup the next node
212  *
213  * Return: node id for next memory node in the demotion path hierarchy
214  * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
215  * @node online or guarantee that it *continues* to be the next demotion
216  * target.
217  */
218 int next_demotion_node(int node)
219 {
220 	struct demotion_nodes *nd;
221 	int target;
222 
223 	if (!node_demotion)
224 		return NUMA_NO_NODE;
225 
226 	nd = &node_demotion[node];
227 
228 	/*
229 	 * node_demotion[] is updated without excluding this
230 	 * function from running.
231 	 *
232 	 * Make sure to use RCU over entire code blocks if
233 	 * node_demotion[] reads need to be consistent.
234 	 */
235 	rcu_read_lock();
236 	/*
237 	 * If there are multiple target nodes, just select one
238 	 * target node randomly.
239 	 *
240 	 * We could also use round-robin to select the target
241 	 * node, but that would require another variable in
242 	 * node_demotion[] to record the last selected target node,
243 	 * which may cause cache ping-pong because that recorded
244 	 * node keeps changing. Per-CPU data could avoid the caching
245 	 * issue, but that seems more complicated. So selecting the
246 	 * target node randomly seems better for now.
247 	 */
248 	target = node_random(&nd->preferred);
249 	rcu_read_unlock();
250 
251 	return target;
252 }
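/*
 * Rough usage sketch (not the actual reclaim code): the demotion path
 * on the reclaim side is expected to pick a preferred target and use
 * the lower-tier mask as the allocation fallback, along these lines:
 *
 *	int nid = next_demotion_node(pgdat->node_id);
 *	nodemask_t allowed;
 *
 *	if (nid == NUMA_NO_NODE)
 *		return 0;	// terminal node, nothing to demote to
 *	node_get_allowed_targets(pgdat, &allowed);
 *	// allocate demotion targets on @nid, falling back to @allowed
 */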
253 
254 static void disable_all_demotion_targets(void)
255 {
256 	struct memory_tier *memtier;
257 	int node;
258 
259 	for_each_node_state(node, N_MEMORY) {
260 		node_demotion[node].preferred = NODE_MASK_NONE;
261 		/*
262 		 * We are holding memory_tier_lock, so it is safe
263 		 * to access pgdat->memtier.
264 		 */
265 		memtier = __node_get_memory_tier(node);
266 		if (memtier)
267 			memtier->lower_tier_mask = NODE_MASK_NONE;
268 	}
269 	/*
270 	 * Ensure that the "disable" is visible across the system.
271 	 * Readers will see either a combination of before+disable
272 	 * state or disable+after.  They will never see before and
273 	 * after state together.
274 	 */
275 	synchronize_rcu();
276 }
277 
278 static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
279 {
280 	nodemask_t nodes = NODE_MASK_NONE;
281 	struct memory_dev_type *memtype;
282 
283 	list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
284 		nodes_or(nodes, nodes, memtype->nodes);
285 
286 	return nodes;
287 }
288 
289 /*
290  * Find an automatic demotion target for all memory
291  * nodes. Failing here is OK.  It might just indicate
292  * being at the end of a chain.
293  */
294 static void establish_demotion_targets(void)
295 {
296 	struct memory_tier *memtier;
297 	struct demotion_nodes *nd;
298 	int target = NUMA_NO_NODE, node;
299 	int distance, best_distance;
300 	nodemask_t tier_nodes, lower_tier;
301 
302 	lockdep_assert_held_once(&memory_tier_lock);
303 
304 	if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION))
305 		return;
306 
307 	disable_all_demotion_targets();
308 
309 	for_each_node_state(node, N_MEMORY) {
310 		best_distance = -1;
311 		nd = &node_demotion[node];
312 
313 		memtier = __node_get_memory_tier(node);
314 		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
315 			continue;
316 		/*
317 		 * Get the lower memtier to find the demotion node list.
318 		 */
319 		memtier = list_next_entry(memtier, list);
320 		tier_nodes = get_memtier_nodemask(memtier);
321 		/*
322 		 * find_next_best_node() uses the 'used' nodemask as a skip list.
323 		 * Add all memory nodes except the selected memory tier's
324 		 * nodelist to the skip list so that we find the best node from
325 		 * the memtier nodelist.
326 		 */
327 		nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);
328 
329 		/*
330 		 * Find all the nodes in the memory tier node list with the same best
331 		 * distance and add them to the preferred mask. We randomly select between nodes
332 		 * in the preferred mask when allocating pages during demotion.
333 		 */
334 		do {
335 			target = find_next_best_node(node, &tier_nodes);
336 			if (target == NUMA_NO_NODE)
337 				break;
338 
339 			distance = node_distance(node, target);
340 			if (distance == best_distance || best_distance == -1) {
341 				best_distance = distance;
342 				node_set(target, nd->preferred);
343 			} else {
344 				break;
345 			}
346 		} while (1);
347 	}
348 	/*
349 	 * Promotion is allowed from a memory tier to a higher
350 	 * memory tier only if the memory tier doesn't include
351 	 * compute. We want to skip promotion from a memory tier
352 	 * if any node that is part of that memory tier has CPUs.
353 	 * Once we detect such a memory tier, we consider that tier
354 	 * the top tier, from which promotion is not allowed.
355 	 */
356 	list_for_each_entry_reverse(memtier, &memory_tiers, list) {
357 		tier_nodes = get_memtier_nodemask(memtier);
358 		nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
359 		if (!nodes_empty(tier_nodes)) {
360 			/*
361 			 * An abstract distance below the max value of this memtier
362 			 * is considered toptier.
363 			 */
364 			top_tier_adistance = memtier->adistance_start +
365 						MEMTIER_CHUNK_SIZE - 1;
366 			break;
367 		}
368 	}
369 	/*
370 	 * Now build the lower_tier mask for each node, collecting the node mask
371 	 * from all memory tiers below it. This allows us to fall back demotion
372 	 * page allocation to a set of nodes that is closer to the above selected
373 	 * preferred node.
374 	 */
375 	lower_tier = node_states[N_MEMORY];
376 	list_for_each_entry(memtier, &memory_tiers, list) {
377 		/*
378 		 * Keep removing the current tier from the lower_tier nodes.
379 		 * This will remove all nodes in the current and above
380 		 * memory tiers from the lower_tier mask.
381 		 */
382 		tier_nodes = get_memtier_nodemask(memtier);
383 		nodes_andnot(lower_tier, lower_tier, tier_nodes);
384 		memtier->lower_tier_mask = lower_tier;
385 	}
386 }
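/*
 * Worked example, using Example 1 from the top of this file (nodes 0-1
 * form the top DRAM tier, nodes 2-3 the PMEM tier). The loops above
 * produce:
 *
 *	node_demotion[0].preferred = 2, node_demotion[1].preferred = 3
 *	top_tier_adistance = DRAM tier adistance_start + MEMTIER_CHUNK_SIZE - 1
 *	DRAM tier lower_tier_mask = 2-3
 *	PMEM tier lower_tier_mask = <empty>
 *
 * so demotion from node 0 prefers node 2 but may fall back to node 3,
 * and nothing is ever demoted out of the PMEM tier.
 */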
387 
388 #else
389 static inline void disable_all_demotion_targets(void) {}
390 static inline void establish_demotion_targets(void) {}
391 #endif /* CONFIG_MIGRATION */
392 
393 static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
394 {
395 	if (!node_memory_types[node].memtype)
396 		node_memory_types[node].memtype = memtype;
397 	/*
398 	 * For each device getting added in the same NUMA node
399 	 * with this specific memtype, bump the map count. We
400 	 * only take the memtype device reference once, so that
401 	 * changing a node's memtype can be done by dropping the
402 	 * only reference count taken here.
403 	 */
404 
405 	if (node_memory_types[node].memtype == memtype) {
406 		if (!node_memory_types[node].map_count++)
407 			kref_get(&memtype->kref);
408 	}
409 }
410 
411 static struct memory_tier *set_node_memory_tier(int node)
412 {
413 	struct memory_tier *memtier;
414 	struct memory_dev_type *memtype;
415 	pg_data_t *pgdat = NODE_DATA(node);
416 
417 
418 	lockdep_assert_held_once(&memory_tier_lock);
419 
420 	if (!node_state(node, N_MEMORY))
421 		return ERR_PTR(-EINVAL);
422 
423 	__init_node_memory_type(node, default_dram_type);
424 
425 	memtype = node_memory_types[node].memtype;
426 	node_set(node, memtype->nodes);
427 	memtier = find_create_memory_tier(memtype);
428 	if (!IS_ERR(memtier))
429 		rcu_assign_pointer(pgdat->memtier, memtier);
430 	return memtier;
431 }
432 
433 static void destroy_memory_tier(struct memory_tier *memtier)
434 {
435 	list_del(&memtier->list);
436 	/*
437 	 * synchronize_rcu() in clear_node_memory_tier() makes sure
438 	 * no RCU reader is still accessing this memory tier.
439 	 */
440 	kfree(memtier);
441 }
442 
443 static bool clear_node_memory_tier(int node)
444 {
445 	bool cleared = false;
446 	pg_data_t *pgdat;
447 	struct memory_tier *memtier;
448 
449 	pgdat = NODE_DATA(node);
450 	if (!pgdat)
451 		return false;
452 
453 	/*
454 	 * Make sure that anybody looking at NODE_DATA who finds
455 	 * a valid memtier finds memory_dev_types with nodes still
456 	 * linked to the memtier. We achieve this by waiting for
457 	 * the RCU read section to finish using synchronize_rcu().
458 	 * This also enables us to free the destroyed memory tier
459 	 * with kfree() instead of kfree_rcu().
460 	 */
461 	memtier = __node_get_memory_tier(node);
462 	if (memtier) {
463 		struct memory_dev_type *memtype;
464 
465 		rcu_assign_pointer(pgdat->memtier, NULL);
466 		synchronize_rcu();
467 		memtype = node_memory_types[node].memtype;
468 		node_clear(node, memtype->nodes);
469 		if (nodes_empty(memtype->nodes)) {
470 			list_del_init(&memtype->tier_sibiling);
471 			if (list_empty(&memtier->memory_types))
472 				destroy_memory_tier(memtier);
473 		}
474 		cleared = true;
475 	}
476 	return cleared;
477 }
478 
479 static void release_memtype(struct kref *kref)
480 {
481 	struct memory_dev_type *memtype;
482 
483 	memtype = container_of(kref, struct memory_dev_type, kref);
484 	kfree(memtype);
485 }
486 
487 struct memory_dev_type *alloc_memory_type(int adistance)
488 {
489 	struct memory_dev_type *memtype;
490 
491 	memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
492 	if (!memtype)
493 		return ERR_PTR(-ENOMEM);
494 
495 	memtype->adistance = adistance;
496 	INIT_LIST_HEAD(&memtype->tier_sibiling);
497 	memtype->nodes  = NODE_MASK_NONE;
498 	kref_init(&memtype->kref);
499 	return memtype;
500 }
501 EXPORT_SYMBOL_GPL(alloc_memory_type);
502 
503 void destroy_memory_type(struct memory_dev_type *memtype)
504 {
505 	kref_put(&memtype->kref, release_memtype);
506 }
507 EXPORT_SYMBOL_GPL(destroy_memory_type);
508 
509 void init_node_memory_type(int node, struct memory_dev_type *memtype)
510 {
511 
512 	mutex_lock(&memory_tier_lock);
513 	__init_node_memory_type(node, memtype);
514 	mutex_unlock(&memory_tier_lock);
515 }
516 EXPORT_SYMBOL_GPL(init_node_memory_type);
517 
518 void clear_node_memory_type(int node, struct memory_dev_type *memtype)
519 {
520 	mutex_lock(&memory_tier_lock);
521 	if (node_memory_types[node].memtype == memtype)
522 		node_memory_types[node].map_count--;
523 	/*
524 	 * If we unmapped all the devices attached to this node,
525 	 * clear the node memory type.
526 	 */
527 	if (!node_memory_types[node].map_count) {
528 		node_memory_types[node].memtype = NULL;
529 		kref_put(&memtype->kref, release_memtype);
530 	}
531 	mutex_unlock(&memory_tier_lock);
532 }
533 EXPORT_SYMBOL_GPL(clear_node_memory_type);
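/*
 * Rough lifecycle sketch of how a memory device driver such as dax/kmem
 * is expected to use the exported interface. The abstract distance below
 * is only an illustrative slower-than-DRAM value; real drivers pick
 * their own:
 *
 *	struct memory_dev_type *memtype = alloc_memory_type(MEMTIER_ADISTANCE_DRAM * 5);
 *
 *	if (IS_ERR(memtype))
 *		return PTR_ERR(memtype);
 *	init_node_memory_type(nid, memtype);	// before onlining the memory
 *	...
 *	clear_node_memory_type(nid, memtype);	// after the memory is removed
 *	destroy_memory_type(memtype);		// drops the allocation reference
 */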
534 
535 static int __meminit memtier_hotplug_callback(struct notifier_block *self,
536 					      unsigned long action, void *_arg)
537 {
538 	struct memory_tier *memtier;
539 	struct memory_notify *arg = _arg;
540 
541 	/*
542 	 * Only update the node migration order when a node is
543 	 * changing status, like online->offline.
544 	 */
545 	if (arg->status_change_nid < 0)
546 		return notifier_from_errno(0);
547 
548 	switch (action) {
549 	case MEM_OFFLINE:
550 		mutex_lock(&memory_tier_lock);
551 		if (clear_node_memory_tier(arg->status_change_nid))
552 			establish_demotion_targets();
553 		mutex_unlock(&memory_tier_lock);
554 		break;
555 	case MEM_ONLINE:
556 		mutex_lock(&memory_tier_lock);
557 		memtier = set_node_memory_tier(arg->status_change_nid);
558 		if (!IS_ERR(memtier))
559 			establish_demotion_targets();
560 		mutex_unlock(&memory_tier_lock);
561 		break;
562 	}
563 
564 	return notifier_from_errno(0);
565 }
566 
567 static int __init memory_tier_init(void)
568 {
569 	int node;
570 	struct memory_tier *memtier;
571 
572 #ifdef CONFIG_MIGRATION
573 	node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
574 				GFP_KERNEL);
575 	WARN_ON(!node_demotion);
576 #endif
577 	mutex_lock(&memory_tier_lock);
578 	/*
579 	 * For now we can have 4 faster memory tiers with smaller adistance
580 	 * than the default DRAM tier.
581 	 */
582 	default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
583 	if (IS_ERR(default_dram_type))
584 		panic("%s() failed to allocate default DRAM tier\n", __func__);
585 
586 	/*
587 	 * Look at all the existing N_MEMORY nodes and add them to the
588 	 * default memory tier, or to another tier if memory types have
589 	 * already been assigned to them.
590 	 */
591 	for_each_node_state(node, N_MEMORY) {
592 		memtier = set_node_memory_tier(node);
593 		if (IS_ERR(memtier))
594 			/*
595 			 * Continue with the memtiers we were able to set up.
596 			 */
597 			break;
598 	}
599 	establish_demotion_targets();
600 	mutex_unlock(&memory_tier_lock);
601 
602 	hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRIO);
603 	return 0;
604 }
605 subsys_initcall(memory_tier_init);
606 
607 bool numa_demotion_enabled = false;
608 
609 #ifdef CONFIG_MIGRATION
610 #ifdef CONFIG_SYSFS
611 static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
612 					  struct kobj_attribute *attr, char *buf)
613 {
614 	return sysfs_emit(buf, "%s\n",
615 			  numa_demotion_enabled ? "true" : "false");
616 }
617 
618 static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
619 					   struct kobj_attribute *attr,
620 					   const char *buf, size_t count)
621 {
622 	ssize_t ret;
623 
624 	ret = kstrtobool(buf, &numa_demotion_enabled);
625 	if (ret)
626 		return ret;
627 
628 	return count;
629 }
630 
631 static struct kobj_attribute numa_demotion_enabled_attr =
632 	__ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
633 	       numa_demotion_enabled_store);
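/*
 * Note: with the "numa" kobject created below under mm_kobj, this
 * attribute shows up as /sys/kernel/mm/numa/demotion_enabled; writing
 * any value accepted by kstrtobool() ("1"/"0", "y"/"n", "on"/"off")
 * toggles numa_demotion_enabled at runtime.
 */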
634 
635 static struct attribute *numa_attrs[] = {
636 	&numa_demotion_enabled_attr.attr,
637 	NULL,
638 };
639 
640 static const struct attribute_group numa_attr_group = {
641 	.attrs = numa_attrs,
642 };
643 
644 static int __init numa_init_sysfs(void)
645 {
646 	int err;
647 	struct kobject *numa_kobj;
648 
649 	numa_kobj = kobject_create_and_add("numa", mm_kobj);
650 	if (!numa_kobj) {
651 		pr_err("failed to create numa kobject\n");
652 		return -ENOMEM;
653 	}
654 	err = sysfs_create_group(numa_kobj, &numa_attr_group);
655 	if (err) {
656 		pr_err("failed to register numa group\n");
657 		goto delete_obj;
658 	}
659 	return 0;
660 
661 delete_obj:
662 	kobject_put(numa_kobj);
663 	return err;
664 }
665 subsys_initcall(numa_init_sysfs);
666 #endif /* CONFIG_SYSFS */
667 #endif
668