// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/lockdep.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/memory.h>
#include <linux/memory-tiers.h>

#include "internal.h"

struct memory_tier {
	/* hierarchy of memory tiers */
	struct list_head list;
	/* list of all memory types part of this tier */
	struct list_head memory_types;
	/*
	 * start value of abstract distance. memory tier maps
	 * an abstract distance range,
	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
	 */
	int adistance_start;
	struct device dev;
	/* All the nodes that are part of all the lower memory tiers. */
	nodemask_t lower_tier_mask;
};
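
/*
 * Worked example (informational; it follows directly from how
 * find_create_memory_tier() below uses MEMTIER_CHUNK_SIZE): a memory
 * type's abstract distance is rounded down to a chunk boundary to pick
 * its tier, and the tier's sysfs device id is derived from that same
 * boundary:
 *
 *	adistance_start = round_down(memtype->adistance, MEMTIER_CHUNK_SIZE);
 *	dev.id          = adistance_start >> MEMTIER_CHUNK_BITS;
 *
 * so all memory types whose abstract distance falls inside one chunk
 * share a memory tier.
 */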

struct demotion_nodes {
	nodemask_t preferred;
};

struct node_memory_type_map {
	struct memory_dev_type *memtype;
	int map_count;
};

static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
static struct memory_dev_type *default_dram_type;

static struct bus_type memory_tier_subsys = {
	.name = "memory_tiering",
	.dev_name = "memory_tier",
};

#ifdef CONFIG_MIGRATION
static int top_tier_adistance;
/*
 * node_demotion[] examples:
 *
 * Example 1:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
 *
 * node distances:
 * node   0    1    2    3
 *    0  10   20   30   40
 *    1  20   10   40   30
 *    2  30   40   10   40
 *    3  40   30   40   10
 *
 * memory_tiers0 = 0-1
 * memory_tiers1 = 2-3
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 3
 * node_demotion[2].preferred = <empty>
 * node_demotion[3].preferred = <empty>
 *
 * Example 2:
 *
 * Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   30
 *    2  30   30   10
 *
 * memory_tiers0 = 0-2
 *
 * node_demotion[0].preferred = <empty>
 * node_demotion[1].preferred = <empty>
 * node_demotion[2].preferred = <empty>
 *
 * Example 3:
 *
 * Node 0 is CPU + DRAM nodes, Node 1 is HBM node, node 2 is PMEM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   40
 *    2  30   40   10
 *
 * memory_tiers0 = 1
 * memory_tiers1 = 0
 * memory_tiers2 = 2
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 0
 * node_demotion[2].preferred = <empty>
 *
 */
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */

static inline struct memory_tier *to_memory_tier(struct device *device)
{
	return container_of(device, struct memory_tier, dev);
}

static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
	nodemask_t nodes = NODE_MASK_NONE;
	struct memory_dev_type *memtype;

	list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
		nodes_or(nodes, nodes, memtype->nodes);

	return nodes;
}

static void memory_tier_device_release(struct device *dev)
{
	struct memory_tier *tier = to_memory_tier(dev);
	/*
	 * synchronize_rcu in clear_node_memory_tier makes sure
	 * we don't have rcu access to this memory tier.
	 */
	kfree(tier);
}

static ssize_t nodelist_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	int ret;
	nodemask_t nmask;

	mutex_lock(&memory_tier_lock);
	nmask = get_memtier_nodemask(to_memory_tier(dev));
	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
	mutex_unlock(&memory_tier_lock);
	return ret;
}
static DEVICE_ATTR_RO(nodelist);

static struct attribute *memtier_dev_attrs[] = {
	&dev_attr_nodelist.attr,
	NULL
};

static const struct attribute_group memtier_dev_group = {
	.attrs = memtier_dev_attrs,
};

static const struct attribute_group *memtier_dev_groups[] = {
	&memtier_dev_group,
	NULL
};
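
/*
 * With the bus/dev names set at the top of this file and the virtual
 * device registration in memory_tier_init(), the nodelist attribute is
 * expected to show up in sysfs roughly like this (path shown only as an
 * illustration, assuming the usual layout for virtual devices):
 *
 *	$ cat /sys/devices/virtual/memory_tiering/memory_tier<N>/nodelist
 *	0-1
 */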

static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
	int ret;
	bool found_slot = false;
	struct memory_tier *memtier, *new_memtier;
	int adistance = memtype->adistance;
	unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;

	lockdep_assert_held_once(&memory_tier_lock);

	adistance = round_down(adistance, memtier_adistance_chunk_size);
	/*
	 * If the memtype is already part of a memory tier,
	 * just return that.
	 */
	if (!list_empty(&memtype->tier_sibiling)) {
		list_for_each_entry(memtier, &memory_tiers, list) {
			if (adistance == memtier->adistance_start)
				return memtier;
		}
		WARN_ON(1);
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(memtier, &memory_tiers, list) {
		if (adistance == memtier->adistance_start) {
			goto link_memtype;
		} else if (adistance < memtier->adistance_start) {
			found_slot = true;
			break;
		}
	}

	new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
	if (!new_memtier)
		return ERR_PTR(-ENOMEM);

	new_memtier->adistance_start = adistance;
	INIT_LIST_HEAD(&new_memtier->list);
	INIT_LIST_HEAD(&new_memtier->memory_types);
	if (found_slot)
		list_add_tail(&new_memtier->list, &memtier->list);
	else
		list_add_tail(&new_memtier->list, &memory_tiers);

	new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
	new_memtier->dev.bus = &memory_tier_subsys;
	new_memtier->dev.release = memory_tier_device_release;
	new_memtier->dev.groups = memtier_dev_groups;

	ret = device_register(&new_memtier->dev);
	if (ret) {
		list_del(&new_memtier->list);
		put_device(&new_memtier->dev);
		return ERR_PTR(ret);
	}
	memtier = new_memtier;

link_memtype:
	list_add(&memtype->tier_sibiling, &memtier->memory_types);
	return memtier;
}

static struct memory_tier *__node_get_memory_tier(int node)
{
	pg_data_t *pgdat;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return NULL;
	/*
	 * Since we hold memory_tier_lock, we can avoid
	 * RCU read locks when accessing the details. No
	 * parallel updates are possible here.
	 */
	return rcu_dereference_check(pgdat->memtier,
				     lockdep_is_held(&memory_tier_lock));
}

#ifdef CONFIG_MIGRATION
bool node_is_toptier(int node)
{
	bool toptier;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (!memtier) {
		toptier = true;
		goto out;
	}
	if (memtier->adistance_start <= top_tier_adistance)
		toptier = true;
	else
		toptier = false;
out:
	rcu_read_unlock();
	return toptier;
}

void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
{
	struct memory_tier *memtier;

	/*
	 * pg_data_t.memtier updates include a synchronize_rcu(),
	 * which ensures that we either find NULL or a valid memtier
	 * in NODE_DATA. Protect the access via rcu_read_lock().
	 */
	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (memtier)
		*targets = memtier->lower_tier_mask;
	else
		*targets = NODE_MASK_NONE;
	rcu_read_unlock();
}

/**
 * next_demotion_node() - Get the next node in the demotion path
 * @node: The starting node to look up the next node from
 *
 * Return: node id for next memory node in the demotion path hierarchy
 * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
 * @node online or guarantee that it *continues* to be the next demotion
 * target.
 */
int next_demotion_node(int node)
{
	struct demotion_nodes *nd;
	int target;

	if (!node_demotion)
		return NUMA_NO_NODE;

	nd = &node_demotion[node];

	/*
	 * node_demotion[] is updated without excluding this
	 * function from running.
	 *
	 * Make sure to use RCU over entire code blocks if
	 * node_demotion[] reads need to be consistent.
	 */
	rcu_read_lock();
	/*
	 * If there are multiple target nodes, just select one
	 * target node randomly.
	 *
	 * We could also use round-robin to select the target node, but
	 * that would require another field in node_demotion[] to record
	 * the last selected target node, and updating it may cause cache
	 * ping-pong. Per-CPU state would avoid that caching issue but is
	 * more complicated, so random selection is good enough for now.
	 */
	target = node_random(&nd->preferred);
	rcu_read_unlock();

	return target;
}
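
/*
 * Illustrative caller-side sketch (not part of this file): a demotion
 * path that wants a destination for pages currently on @nid could do
 * something like the following, treating the result only as a hint:
 *
 *	int target = next_demotion_node(nid);
 *
 *	if (target == NUMA_NO_NODE)
 *		return false;	// bottom tier: nothing to demote to
 *	// otherwise allocate on @target and migrate the pages there
 */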

static void disable_all_demotion_targets(void)
{
	struct memory_tier *memtier;
	int node;

	for_each_node_state(node, N_MEMORY) {
		node_demotion[node].preferred = NODE_MASK_NONE;
		/*
		 * We are holding memory_tier_lock, it is safe
		 * to access pgdat->memtier.
		 */
		memtier = __node_get_memory_tier(node);
		if (memtier)
			memtier->lower_tier_mask = NODE_MASK_NONE;
	}
	/*
	 * Ensure that the "disable" is visible across the system.
	 * Readers will see either a combination of before+disable
	 * state or disable+after. They will never see before and
	 * after state together.
	 */
	synchronize_rcu();
}

/*
 * Find an automatic demotion target for all memory
 * nodes. Failing here is OK. It might just indicate
 * being at the end of a chain.
 */
static void establish_demotion_targets(void)
{
	struct memory_tier *memtier;
	struct demotion_nodes *nd;
	int target = NUMA_NO_NODE, node;
	int distance, best_distance;
	nodemask_t tier_nodes, lower_tier;

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_demotion)
		return;

	disable_all_demotion_targets();

	for_each_node_state(node, N_MEMORY) {
		best_distance = -1;
		nd = &node_demotion[node];

		memtier = __node_get_memory_tier(node);
		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
			continue;
		/*
		 * Get the lower memtier to find the demotion node list.
		 */
		memtier = list_next_entry(memtier, list);
		tier_nodes = get_memtier_nodemask(memtier);
		/*
		 * find_next_best_node() uses the 'used' nodemask as a skip
		 * list. Add all memory nodes except the selected memory tier
		 * nodelist to the skip list so that we find the best node
		 * from the memtier nodelist.
		 */
		nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);

		/*
		 * Find all the nodes in the memory tier node list with the
		 * same best distance and add them to the preferred mask. We
		 * randomly select between nodes in the preferred mask when
		 * allocating pages during demotion.
		 */
		do {
			target = find_next_best_node(node, &tier_nodes);
			if (target == NUMA_NO_NODE)
				break;

			distance = node_distance(node, target);
			if (distance == best_distance || best_distance == -1) {
				best_distance = distance;
				node_set(target, nd->preferred);
			} else {
				break;
			}
		} while (1);
	}
	/*
	 * Promotion is allowed from a memory tier to a higher
	 * memory tier only if the memory tier doesn't include
	 * compute. We want to skip promotion from a memory tier
	 * if any node that is part of the memory tier has CPUs.
	 * Once we detect such a memory tier, we consider that tier
	 * as the top tier from which promotion is not allowed.
	 */
	list_for_each_entry_reverse(memtier, &memory_tiers, list) {
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
		if (!nodes_empty(tier_nodes)) {
			/*
			 * abstract distance below the max value of this memtier
			 * is considered toptier.
			 */
			top_tier_adistance = memtier->adistance_start +
					     MEMTIER_CHUNK_SIZE - 1;
			break;
		}
	}
	/*
	 * Now build the lower_tier mask for each node, collecting the node
	 * masks of all memory tiers below it. This allows us to fall back
	 * demotion page allocation to a set of nodes that is closer to the
	 * above selected preferred node.
	 */
	lower_tier = node_states[N_MEMORY];
	list_for_each_entry(memtier, &memory_tiers, list) {
		/*
		 * Keep removing the current tier from lower_tier nodes.
		 * This removes all nodes in the current and above
		 * memory tiers from the lower_tier mask.
		 */
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_andnot(lower_tier, lower_tier, tier_nodes);
		memtier->lower_tier_mask = lower_tier;
	}
}
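
/*
 * Worked example, reusing Example 1 from the node_demotion[] comment
 * above (nodes 0-1 in the DRAM tier, nodes 2-3 in the PMEM tier): the
 * final loop walks tiers from fastest to slowest while shrinking a copy
 * of N_MEMORY, which yields
 *
 *	DRAM tier: lower_tier_mask = 2-3
 *	PMEM tier: lower_tier_mask = <empty>
 *
 * so demotion from nodes 0-1 may fall back to any of nodes 2-3, while
 * nodes 2-3 have nowhere further to demote to.
 */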

#else
static inline void establish_demotion_targets(void) {}
#endif /* CONFIG_MIGRATION */

static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	if (!node_memory_types[node].memtype)
		node_memory_types[node].memtype = memtype;
	/*
	 * For each device getting added in the same NUMA node
	 * with this specific memtype, bump the map count. We
	 * only take a memtype device reference once, so that
	 * changing a node's memtype can be done by dropping the
	 * only reference count taken here.
	 */

	if (node_memory_types[node].memtype == memtype) {
		if (!node_memory_types[node].map_count++)
			kref_get(&memtype->kref);
	}
}

static struct memory_tier *set_node_memory_tier(int node)
{
	struct memory_tier *memtier;
	struct memory_dev_type *memtype;
	pg_data_t *pgdat = NODE_DATA(node);

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_state(node, N_MEMORY))
		return ERR_PTR(-EINVAL);

	__init_node_memory_type(node, default_dram_type);

	memtype = node_memory_types[node].memtype;
	node_set(node, memtype->nodes);
	memtier = find_create_memory_tier(memtype);
	if (!IS_ERR(memtier))
		rcu_assign_pointer(pgdat->memtier, memtier);
	return memtier;
}
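
/*
 * set_node_memory_tier() is called with memory_tier_lock held from two
 * places in this file: memory_tier_init() at boot, for every node that
 * already has memory, and the MEM_ONLINE leg of memtier_hotplug_callback()
 * when a node gains memory later.
 */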

static void destroy_memory_tier(struct memory_tier *memtier)
{
	list_del(&memtier->list);
	device_unregister(&memtier->dev);
}

static bool clear_node_memory_tier(int node)
{
	bool cleared = false;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	/*
	 * Make sure that anybody looking at NODE_DATA who finds
	 * a valid memtier finds memory_dev_types with nodes still
	 * linked to the memtier. We achieve this by waiting for
	 * rcu read section to finish using synchronize_rcu.
	 * This also enables us to free the destroyed memory tier
	 * with kfree instead of kfree_rcu.
	 */
	memtier = __node_get_memory_tier(node);
	if (memtier) {
		struct memory_dev_type *memtype;

		rcu_assign_pointer(pgdat->memtier, NULL);
		synchronize_rcu();
		memtype = node_memory_types[node].memtype;
		node_clear(node, memtype->nodes);
		if (nodes_empty(memtype->nodes)) {
			list_del_init(&memtype->tier_sibiling);
			if (list_empty(&memtier->memory_types))
				destroy_memory_tier(memtier);
		}
		cleared = true;
	}
	return cleared;
}

static void release_memtype(struct kref *kref)
{
	struct memory_dev_type *memtype;

	memtype = container_of(kref, struct memory_dev_type, kref);
	kfree(memtype);
}

struct memory_dev_type *alloc_memory_type(int adistance)
{
	struct memory_dev_type *memtype;

	memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
	if (!memtype)
		return ERR_PTR(-ENOMEM);

	memtype->adistance = adistance;
	INIT_LIST_HEAD(&memtype->tier_sibiling);
	memtype->nodes = NODE_MASK_NONE;
	kref_init(&memtype->kref);
	return memtype;
}
EXPORT_SYMBOL_GPL(alloc_memory_type);

void put_memory_type(struct memory_dev_type *memtype)
{
	kref_put(&memtype->kref, release_memtype);
}
EXPORT_SYMBOL_GPL(put_memory_type);

void init_node_memory_type(int node, struct memory_dev_type *memtype)
{

	mutex_lock(&memory_tier_lock);
	__init_node_memory_type(node, memtype);
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(init_node_memory_type);

void clear_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	if (node_memory_types[node].memtype == memtype)
		node_memory_types[node].map_count--;
	/*
	 * If we unmapped all the attached devices to this node,
	 * clear the node memory type.
	 */
	if (!node_memory_types[node].map_count) {
		node_memory_types[node].memtype = NULL;
		put_memory_type(memtype);
	}
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(clear_node_memory_type);
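
/*
 * Sketch of how a memory-device driver might use the exported
 * memory_dev_type API above (illustrative only; error handling is
 * trimmed and MY_DRIVER_ADISTANCE is a made-up placeholder for the
 * driver's chosen abstract distance):
 *
 *	static struct memory_dev_type *mtype;
 *
 *	// probe: one memory_dev_type shared by all of the driver's nodes
 *	mtype = alloc_memory_type(MY_DRIVER_ADISTANCE);
 *	if (IS_ERR(mtype))
 *		goto err;
 *	init_node_memory_type(nid, mtype);	// before onlining the node's memory
 *
 *	// remove: undo in reverse order
 *	clear_node_memory_type(nid, mtype);
 *	put_memory_type(mtype);
 */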

static int __meminit memtier_hotplug_callback(struct notifier_block *self,
					      unsigned long action, void *_arg)
{
	struct memory_tier *memtier;
	struct memory_notify *arg = _arg;

	/*
	 * Only update the node migration order when a node is
	 * changing status, like online->offline.
	 */
	if (arg->status_change_nid < 0)
		return notifier_from_errno(0);

	switch (action) {
	case MEM_OFFLINE:
		mutex_lock(&memory_tier_lock);
		if (clear_node_memory_tier(arg->status_change_nid))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	case MEM_ONLINE:
		mutex_lock(&memory_tier_lock);
		memtier = set_node_memory_tier(arg->status_change_nid);
		if (!IS_ERR(memtier))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	}

	return notifier_from_errno(0);
}

static int __init memory_tier_init(void)
{
	int ret, node;
	struct memory_tier *memtier;

	ret = subsys_virtual_register(&memory_tier_subsys, NULL);
	if (ret)
		panic("%s() failed to register memory tier subsystem\n", __func__);

#ifdef CONFIG_MIGRATION
	node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
				GFP_KERNEL);
	WARN_ON(!node_demotion);
#endif
	mutex_lock(&memory_tier_lock);
	/*
	 * For now we can have 4 faster memory tiers with smaller adistance
	 * than default DRAM tier.
	 */
	default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
	if (IS_ERR(default_dram_type))
		panic("%s() failed to allocate default DRAM tier\n", __func__);

	/*
	 * Look at all the existing N_MEMORY nodes and add them to
	 * default memory tier or to a tier if we already have memory
	 * types assigned.
	 */
	for_each_node_state(node, N_MEMORY) {
		memtier = set_node_memory_tier(node);
		if (IS_ERR(memtier))
			/*
			 * Continue with memtiers we are able to set up.
			 */
			break;
	}
	establish_demotion_targets();
	mutex_unlock(&memory_tier_lock);

	hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
	return 0;
}
subsys_initcall(memory_tier_init);

bool numa_demotion_enabled = false;

#ifdef CONFIG_MIGRATION
#ifdef CONFIG_SYSFS
static ssize_t demotion_enabled_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  numa_demotion_enabled ? "true" : "false");
}

static ssize_t demotion_enabled_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	ssize_t ret;

	ret = kstrtobool(buf, &numa_demotion_enabled);
	if (ret)
		return ret;

	return count;
}

static struct kobj_attribute numa_demotion_enabled_attr =
	__ATTR_RW(demotion_enabled);

static struct attribute *numa_attrs[] = {
	&numa_demotion_enabled_attr.attr,
	NULL,
};

static const struct attribute_group numa_attr_group = {
	.attrs = numa_attrs,
};
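
/*
 * The knob defined above is expected to appear as
 * /sys/kernel/mm/numa/demotion_enabled, since numa_kobj below is created
 * under mm_kobj. Example usage (illustrative):
 *
 *	$ echo true > /sys/kernel/mm/numa/demotion_enabled
 *	$ cat /sys/kernel/mm/numa/demotion_enabled
 *	true
 */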

static int __init numa_init_sysfs(void)
{
	int err;
	struct kobject *numa_kobj;

	numa_kobj = kobject_create_and_add("numa", mm_kobj);
	if (!numa_kobj) {
		pr_err("failed to create numa kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(numa_kobj, &numa_attr_group);
	if (err) {
		pr_err("failed to register numa group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(numa_kobj);
	return err;
}
subsys_initcall(numa_init_sysfs);
#endif /* CONFIG_SYSFS */
#endif