// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/lockdep.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/memory.h>
#include <linux/memory-tiers.h>

#include "internal.h"

struct memory_tier {
	/* hierarchy of memory tiers */
	struct list_head list;
	/* list of all memory types that are part of this tier */
	struct list_head memory_types;
	/*
	 * start value of the abstract distance. A memory tier maps
	 * an abstract distance range:
	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
	 */
	int adistance_start;
	/* All the nodes that are part of all the lower memory tiers. */
	nodemask_t lower_tier_mask;
};

struct demotion_nodes {
	nodemask_t preferred;
};

struct node_memory_type_map {
	struct memory_dev_type *memtype;
	int map_count;
};

static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
static struct memory_dev_type *default_dram_type;
#ifdef CONFIG_MIGRATION
/*
 * node_demotion[] examples:
 *
 * Example 1:
 *
 * Nodes 0 & 1 are CPU + DRAM nodes, nodes 2 & 3 are PMEM nodes.
 *
 * node distances:
 * node   0    1    2    3
 *    0  10   20   30   40
 *    1  20   10   40   30
 *    2  30   40   10   40
 *    3  40   30   40   10
 *
 * memory_tiers0 = 0-1
 * memory_tiers1 = 2-3
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 3
 * node_demotion[2].preferred = <empty>
 * node_demotion[3].preferred = <empty>
 *
 * Example 2:
 *
 * Nodes 0 & 1 are CPU + DRAM nodes, node 2 is a memory-only DRAM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   30
 *    2  30   30   10
 *
 * memory_tiers0 = 0-2
 *
 * node_demotion[0].preferred = <empty>
 * node_demotion[1].preferred = <empty>
 * node_demotion[2].preferred = <empty>
 *
 * Example 3:
 *
 * Node 0 is a CPU + DRAM node, node 1 is an HBM node, node 2 is a PMEM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   40
 *    2  30   40   10
 *
 * memory_tiers0 = 1
 * memory_tiers1 = 0
 * memory_tiers2 = 2
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 0
 * node_demotion[2].preferred = <empty>
 */
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */
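/*
 * Worked example of the abstract distance -> tier mapping used below
 * (a sketch; the concrete numbers assume the MEMTIER_CHUNK_BITS and
 * MEMTIER_ADISTANCE_DRAM definitions from <linux/memory-tiers.h>,
 * i.e. a chunk size of 1 << 10):
 *
 *	MEMTIER_CHUNK_SIZE         = 1024
 *	memtype->adistance         = 4608  (MEMTIER_ADISTANCE_DRAM)
 *	round_down(4608, 1024)     = 4096  -> tier adistance_start
 *
 * So every memtype whose adistance falls in [4096, 5120) shares one
 * memory tier, and a smaller adistance means a faster tier that sorts
 * earlier in the memory_tiers list.
 */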
static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
	bool found_slot = false;
	struct memory_tier *memtier, *new_memtier;
	int adistance = memtype->adistance;
	unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;

	lockdep_assert_held_once(&memory_tier_lock);

	adistance = round_down(adistance, memtier_adistance_chunk_size);
	/*
	 * If the memtype is already part of a memory tier,
	 * just return that.
	 */
	if (!list_empty(&memtype->tier_sibiling)) {
		list_for_each_entry(memtier, &memory_tiers, list) {
			if (adistance == memtier->adistance_start)
				return memtier;
		}
		WARN_ON(1);
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(memtier, &memory_tiers, list) {
		if (adistance == memtier->adistance_start) {
			list_add(&memtype->tier_sibiling, &memtier->memory_types);
			return memtier;
		} else if (adistance < memtier->adistance_start) {
			found_slot = true;
			break;
		}
	}

	new_memtier = kmalloc(sizeof(struct memory_tier), GFP_KERNEL);
	if (!new_memtier)
		return ERR_PTR(-ENOMEM);

	new_memtier->adistance_start = adistance;
	INIT_LIST_HEAD(&new_memtier->list);
	INIT_LIST_HEAD(&new_memtier->memory_types);
	if (found_slot)
		list_add_tail(&new_memtier->list, &memtier->list);
	else
		list_add_tail(&new_memtier->list, &memory_tiers);
	list_add(&memtype->tier_sibiling, &new_memtier->memory_types);
	return new_memtier;
}

static struct memory_tier *__node_get_memory_tier(int node)
{
	pg_data_t *pgdat;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return NULL;
	/*
	 * Since we hold memory_tier_lock, we can avoid
	 * RCU read locks when accessing the details. No
	 * parallel updates are possible here.
	 */
	return rcu_dereference_check(pgdat->memtier,
				     lockdep_is_held(&memory_tier_lock));
}

#ifdef CONFIG_MIGRATION
void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
{
	struct memory_tier *memtier;

	/*
	 * pg_data_t.memtier updates include a synchronize_rcu(),
	 * which ensures that we either find NULL or a valid memtier
	 * in NODE_DATA. Protect the access via rcu_read_lock().
	 */
	rcu_read_lock();
	memtier = rcu_dereference(pgdat->memtier);
	if (memtier)
		*targets = memtier->lower_tier_mask;
	else
		*targets = NODE_MASK_NONE;
	rcu_read_unlock();
}

/**
 * next_demotion_node() - Get the next node in the demotion path
 * @node: The starting node to look up the next node from
 *
 * Return: node id for next memory node in the demotion path hierarchy
 * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
 * @node online or guarantee that it *continues* to be the next demotion
 * target.
 */
int next_demotion_node(int node)
{
	struct demotion_nodes *nd;
	int target;

	if (!node_demotion)
		return NUMA_NO_NODE;

	nd = &node_demotion[node];

	/*
	 * node_demotion[] is updated without excluding this
	 * function from running.
	 *
	 * Make sure to use RCU over entire code blocks if
	 * node_demotion[] reads need to be consistent.
	 */
	rcu_read_lock();
	/*
	 * If there are multiple target nodes, just select one
	 * target node randomly.
	 *
	 * Round-robin selection would also work, but it would need an
	 * extra field in node_demotion[] to record the last selected
	 * target node, and updating that field could cause cache
	 * ping-pong. Per-CPU data would avoid the caching issue but
	 * seems more complicated. So random selection looks better
	 * for now.
	 */
	target = node_random(&nd->preferred);
	rcu_read_unlock();

	return target;
}
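/*
 * A minimal caller sketch for next_demotion_node() (hypothetical, for
 * illustration only; migrate_to_node() stands in for whatever
 * migrate_pages()-based helper the caller provides):
 *
 *	int nid = folio_nid(folio);
 *	int target = next_demotion_node(nid);
 *
 *	if (target != NUMA_NO_NODE)
 *		migrate_to_node(folio, target);
 *	else
 *		... fall back to regular reclaim ...
 *
 * Because the answer is only advisory (see the kernel-doc above),
 * callers must tolerate the target going offline after this returns.
 */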
static void disable_all_demotion_targets(void)
{
	struct memory_tier *memtier;
	int node;

	for_each_node_state(node, N_MEMORY) {
		node_demotion[node].preferred = NODE_MASK_NONE;
		/*
		 * We are holding memory_tier_lock, so it is safe
		 * to access pgdat->memtier.
		 */
		memtier = __node_get_memory_tier(node);
		if (memtier)
			memtier->lower_tier_mask = NODE_MASK_NONE;
	}
	/*
	 * Ensure that the "disable" is visible across the system.
	 * Readers will see either a combination of before+disable
	 * state or disable+after. They will never see before and
	 * after state together.
	 */
	synchronize_rcu();
}

static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
	nodemask_t nodes = NODE_MASK_NONE;
	struct memory_dev_type *memtype;

	list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
		nodes_or(nodes, nodes, memtype->nodes);

	return nodes;
}

/*
 * Find an automatic demotion target for all memory
 * nodes. Failing here is OK. It might just indicate
 * being at the end of a chain.
 */
static void establish_demotion_targets(void)
{
	struct memory_tier *memtier;
	struct demotion_nodes *nd;
	int target = NUMA_NO_NODE, node;
	int distance, best_distance;
	nodemask_t tier_nodes, lower_tier;

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION))
		return;

	disable_all_demotion_targets();

	for_each_node_state(node, N_MEMORY) {
		best_distance = -1;
		nd = &node_demotion[node];

		memtier = __node_get_memory_tier(node);
		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
			continue;
		/*
		 * Get the lower memtier to find the demotion node list.
		 */
		memtier = list_next_entry(memtier, list);
		tier_nodes = get_memtier_nodemask(memtier);
		/*
		 * find_next_best_node() uses the 'used' nodemask as a
		 * skip list. Add all memory nodes except the selected
		 * memory tier nodelist to the skip list, so that we find
		 * the best node from the memtier nodelist.
		 */
		nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);

		/*
		 * Find all the nodes in the memory tier node list with the
		 * same best distance and add them to the preferred mask.
		 * We randomly select between nodes in the preferred mask
		 * when allocating pages during demotion.
		 */
		do {
			target = find_next_best_node(node, &tier_nodes);
			if (target == NUMA_NO_NODE)
				break;

			distance = node_distance(node, target);
			if (distance == best_distance || best_distance == -1) {
				best_distance = distance;
				node_set(target, nd->preferred);
			} else {
				break;
			}
		} while (1);
	}
	/*
	 * Now build the lower_tier mask for each node, collecting the node
	 * mask from all memory tiers below it. This allows us to fall back
	 * demotion page allocation to a set of nodes that is closer to the
	 * preferred node selected above.
	 */
	lower_tier = node_states[N_MEMORY];
	list_for_each_entry(memtier, &memory_tiers, list) {
		/*
		 * Keep removing the current tier from the lower_tier nodes;
		 * this removes all nodes in the current and above memory
		 * tiers from the lower_tier mask.
		 */
		tier_nodes = get_memtier_nodemask(memtier);
		nodes_andnot(lower_tier, lower_tier, tier_nodes);
		memtier->lower_tier_mask = lower_tier;
	}
}

#else
static inline void disable_all_demotion_targets(void) {}
static inline void establish_demotion_targets(void) {}
#endif /* CONFIG_MIGRATION */
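/*
 * Worked example of the lower_tier_mask computation at the end of
 * establish_demotion_targets(), using Example 1 from the
 * node_demotion[] comment (tier0 = nodes 0-1, tier1 = nodes 2-3):
 *
 *	lower_tier starts as N_MEMORY            = {0,1,2,3}
 *	tier0: remove {0,1} -> lower_tier_mask   = {2,3}
 *	tier1: remove {2,3} -> lower_tier_mask   = {}
 *
 * So demotion from a top-tier node may fall back to any PMEM node,
 * while the PMEM tier itself has nowhere lower to demote to.
 */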
static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	if (!node_memory_types[node].memtype)
		node_memory_types[node].memtype = memtype;
	/*
	 * For each device getting added in the same NUMA node
	 * with this specific memtype, bump the map count. We
	 * only take a memtype device reference once, so that
	 * changing a node's memtype can be done by dropping the
	 * only reference count taken here.
	 */
	if (node_memory_types[node].memtype == memtype) {
		if (!node_memory_types[node].map_count++)
			kref_get(&memtype->kref);
	}
}

static struct memory_tier *set_node_memory_tier(int node)
{
	struct memory_tier *memtier;
	struct memory_dev_type *memtype;
	pg_data_t *pgdat = NODE_DATA(node);

	lockdep_assert_held_once(&memory_tier_lock);

	if (!node_state(node, N_MEMORY))
		return ERR_PTR(-EINVAL);

	__init_node_memory_type(node, default_dram_type);

	memtype = node_memory_types[node].memtype;
	node_set(node, memtype->nodes);
	memtier = find_create_memory_tier(memtype);
	if (!IS_ERR(memtier))
		rcu_assign_pointer(pgdat->memtier, memtier);
	return memtier;
}

static void destroy_memory_tier(struct memory_tier *memtier)
{
	list_del(&memtier->list);
	/*
	 * synchronize_rcu() in clear_node_memory_tier() makes sure
	 * we don't have RCU access to this memory tier.
	 */
	kfree(memtier);
}

static bool clear_node_memory_tier(int node)
{
	bool cleared = false;
	pg_data_t *pgdat;
	struct memory_tier *memtier;

	pgdat = NODE_DATA(node);
	if (!pgdat)
		return false;

	/*
	 * Make sure that anybody looking at NODE_DATA who finds
	 * a valid memtier finds memory_dev_types with nodes still
	 * linked to the memtier. We achieve this by waiting for
	 * RCU read sections to finish using synchronize_rcu().
	 * This also enables us to free the destroyed memory tier
	 * with kfree() instead of kfree_rcu().
	 */
	memtier = __node_get_memory_tier(node);
	if (memtier) {
		struct memory_dev_type *memtype;

		rcu_assign_pointer(pgdat->memtier, NULL);
		synchronize_rcu();
		memtype = node_memory_types[node].memtype;
		node_clear(node, memtype->nodes);
		if (nodes_empty(memtype->nodes)) {
			list_del_init(&memtype->tier_sibiling);
			if (list_empty(&memtier->memory_types))
				destroy_memory_tier(memtier);
		}
		cleared = true;
	}
	return cleared;
}
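/*
 * The teardown above is the classic RCU unpublish sequence (shown here
 * only as a sketch of the ordering, not as additional code):
 *
 *	rcu_assign_pointer(pgdat->memtier, NULL);	// unpublish
 *	synchronize_rcu();				// wait out readers
 *	kfree(memtier);					// now safe to free
 *
 * Readers such as node_get_allowed_targets() therefore see either the
 * fully linked memtier or NULL, never a half-torn-down one.
 */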
static void release_memtype(struct kref *kref)
{
	struct memory_dev_type *memtype;

	memtype = container_of(kref, struct memory_dev_type, kref);
	kfree(memtype);
}

struct memory_dev_type *alloc_memory_type(int adistance)
{
	struct memory_dev_type *memtype;

	memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
	if (!memtype)
		return ERR_PTR(-ENOMEM);

	memtype->adistance = adistance;
	INIT_LIST_HEAD(&memtype->tier_sibiling);
	memtype->nodes = NODE_MASK_NONE;
	kref_init(&memtype->kref);
	return memtype;
}
EXPORT_SYMBOL_GPL(alloc_memory_type);

void destroy_memory_type(struct memory_dev_type *memtype)
{
	kref_put(&memtype->kref, release_memtype);
}
EXPORT_SYMBOL_GPL(destroy_memory_type);

void init_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	__init_node_memory_type(node, memtype);
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(init_node_memory_type);

void clear_node_memory_type(int node, struct memory_dev_type *memtype)
{
	mutex_lock(&memory_tier_lock);
	if (node_memory_types[node].memtype == memtype)
		node_memory_types[node].map_count--;
	/*
	 * If we unmapped all the devices attached to this node,
	 * clear the node memory type.
	 */
	if (!node_memory_types[node].map_count) {
		node_memory_types[node].memtype = NULL;
		kref_put(&memtype->kref, release_memtype);
	}
	mutex_unlock(&memory_tier_lock);
}
EXPORT_SYMBOL_GPL(clear_node_memory_type);
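/*
 * Sketch of how a driver might use the exported API above (hypothetical
 * driver code; MY_DEVICE_ADISTANCE is a made-up abstract distance that
 * a real driver would derive from the device's performance
 * characteristics, in the spirit of what drivers/dax/kmem.c does for
 * slow memory):
 *
 *	static struct memory_dev_type *my_memtype;
 *
 *	// probe: describe the device memory and bind it to its node
 *	my_memtype = alloc_memory_type(MY_DEVICE_ADISTANCE);
 *	if (IS_ERR(my_memtype))
 *		return PTR_ERR(my_memtype);
 *	init_node_memory_type(dev_nid, my_memtype);
 *
 *	// remove: drop the node mapping and the final reference
 *	clear_node_memory_type(dev_nid, my_memtype);
 *	destroy_memory_type(my_memtype);
 *
 * init_node_memory_type() should run before the node is onlined so
 * that memtier_hotplug_callback() below places it in the right tier.
 */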
static int __meminit memtier_hotplug_callback(struct notifier_block *self,
					      unsigned long action, void *_arg)
{
	struct memory_tier *memtier;
	struct memory_notify *arg = _arg;

	/*
	 * Only update the node migration order when a node is
	 * changing status, like online->offline.
	 */
	if (arg->status_change_nid < 0)
		return notifier_from_errno(0);

	switch (action) {
	case MEM_OFFLINE:
		mutex_lock(&memory_tier_lock);
		if (clear_node_memory_tier(arg->status_change_nid))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	case MEM_ONLINE:
		mutex_lock(&memory_tier_lock);
		memtier = set_node_memory_tier(arg->status_change_nid);
		if (!IS_ERR(memtier))
			establish_demotion_targets();
		mutex_unlock(&memory_tier_lock);
		break;
	}

	return notifier_from_errno(0);
}

static int __init memory_tier_init(void)
{
	int node;
	struct memory_tier *memtier;

#ifdef CONFIG_MIGRATION
	node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
				GFP_KERNEL);
	WARN_ON(!node_demotion);
#endif
	mutex_lock(&memory_tier_lock);
	/*
	 * For now we can have 4 faster memory tiers with smaller adistance
	 * than the default DRAM tier.
	 */
	default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
	if (IS_ERR(default_dram_type))
		panic("%s() failed to allocate default DRAM tier\n", __func__);

	/*
	 * Look at all the existing N_MEMORY nodes and add them to
	 * the default memory tier or to a tier if we already have
	 * memory types assigned.
	 */
	for_each_node_state(node, N_MEMORY) {
		memtier = set_node_memory_tier(node);
		if (IS_ERR(memtier))
			/*
			 * Stop here, keeping the memtiers we were
			 * able to set up.
			 */
			break;
	}
	establish_demotion_targets();
	mutex_unlock(&memory_tier_lock);

	hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRIO);
	return 0;
}
subsys_initcall(memory_tier_init);

bool numa_demotion_enabled = false;

#ifdef CONFIG_MIGRATION
#ifdef CONFIG_SYSFS
static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
					  struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  numa_demotion_enabled ? "true" : "false");
}

static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   const char *buf, size_t count)
{
	ssize_t ret;

	ret = kstrtobool(buf, &numa_demotion_enabled);
	if (ret)
		return ret;

	return count;
}

static struct kobj_attribute numa_demotion_enabled_attr =
	__ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
	       numa_demotion_enabled_store);

static struct attribute *numa_attrs[] = {
	&numa_demotion_enabled_attr.attr,
	NULL,
};

static const struct attribute_group numa_attr_group = {
	.attrs = numa_attrs,
};

static int __init numa_init_sysfs(void)
{
	int err;
	struct kobject *numa_kobj;

	numa_kobj = kobject_create_and_add("numa", mm_kobj);
	if (!numa_kobj) {
		pr_err("failed to create numa kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(numa_kobj, &numa_attr_group);
	if (err) {
		pr_err("failed to register numa group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(numa_kobj);
	return err;
}
subsys_initcall(numa_init_sysfs);
#endif /* CONFIG_SYSFS */
#endif /* CONFIG_MIGRATION */
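/*
 * Usage note: the kobject created above lives under mm_kobj, so with
 * CONFIG_SYSFS demotion can be toggled at runtime from userspace
 * (illustrative shell lines):
 *
 *	cat /sys/kernel/mm/numa/demotion_enabled	-> "true"/"false"
 *	echo true > /sys/kernel/mm/numa/demotion_enabled
 *
 * kstrtobool() also accepts the "1"/"0" and "y"/"n" spellings.
 */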