// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"

static const char *const online_type_to_str[] = {
	[MMOP_OFFLINE] = "offline",
	[MMOP_ONLINE] = "online",
	[MMOP_ONLINE_KERNEL] = "online_kernel",
	[MMOP_ONLINE_MOVABLE] = "online_movable",
};

int mhp_online_type_from_str(const char *str)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
		if (sysfs_streq(str, online_type_to_str[i]))
			return i;
	}
	return -EINVAL;
}

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline unsigned long memory_block_id(unsigned long section_nr)
{
	return section_nr / sections_per_block;
}

static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
	return memory_block_id(pfn_to_section_nr(pfn));
}

static inline unsigned long phys_to_block_id(unsigned long phys)
{
	return pfn_to_block_id(PFN_DOWN(phys));
}
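
/*
 * Worked example for the block-id helpers above (illustrative, assuming the
 * common x86-64 configuration of a 128 MiB memory block, i.e.
 * sections_per_block == 1): physical address 0x100000000 (4 GiB) has
 * PFN 0x100000, falls into section 32 and therefore into memory block
 * id 0x100000000 / (128 << 20) = 32, which is exposed to userspace as
 * /sys/devices/system/memory/memory32.
 */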

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};

/*
 * Memory blocks are cached in a local radix tree to avoid
 * a costly linear search for the corresponding device on
 * the subsystem bus.
 */
static DEFINE_XARRAY(memory_blocks);

/*
 * Memory groups, indexed by memory group id (mgid).
 */
static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
#define MEMORY_GROUP_MARK_DYNAMIC	XA_MARK_1

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	/* Verify that the altmap is freed */
	WARN_ON(mem->altmap);
	kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);

/* Show the memory block ID, relative to the memory block size */
static ssize_t phys_index_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);

	return sysfs_emit(buf, "%08lx\n", memory_block_id(mem->start_section_nr));
}

/*
 * Legacy interface that we cannot remove. Always indicate "removable"
 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	const char *output;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		output = "online";
		break;
	case MEM_OFFLINE:
		output = "offline";
		break;
	case MEM_GOING_OFFLINE:
		output = "going-offline";
		break;
	default:
		WARN_ON(1);
		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
	}

	return sysfs_emit(buf, "%s\n", output);
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}
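
/*
 * Illustrative usage sketch of the notifier chain above (not part of this
 * file; the callback and notifier_block names are hypothetical). A driver
 * that needs to react to memory block state transitions could do:
 *
 *	static int example_memory_callback(struct notifier_block *nb,
 *					   unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		if (action == MEM_GOING_OFFLINE)
 *			pr_info("offlining PFN range [%lx, %lx)\n",
 *				mn->start_pfn, mn->start_pfn + mn->nr_pages);
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_memory_nb = {
 *		.notifier_call = example_memory_callback,
 *	};
 *
 *	register_memory_notifier(&example_memory_nb);
 */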

#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
static unsigned long memblk_nr_poison(struct memory_block *mem);
#else
static inline unsigned long memblk_nr_poison(struct memory_block *mem)
{
	return 0;
}
#endif

/*
 * Must acquire mem_hotplug_lock in write mode.
 */
static int memory_block_online(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = 0;
	struct zone *zone;
	int ret;

	if (memblk_nr_poison(mem))
		return -EHWPOISON;

	zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
				  start_pfn, nr_pages);

	/*
	 * Although vmemmap pages have a different lifecycle than the pages
	 * they describe (they remain until the memory is unplugged), doing
	 * their initialization and accounting at memory onlining/offlining
	 * stage helps to keep accounting easier to follow - e.g. vmemmap
	 * pages belong to the same zone as the memory they back.
	 */
	if (mem->altmap)
		nr_vmemmap_pages = mem->altmap->free;

	mem_hotplug_begin();
	if (nr_vmemmap_pages) {
		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
		if (ret)
			goto out;
	}

	ret = online_pages(start_pfn + nr_vmemmap_pages,
			   nr_pages - nr_vmemmap_pages, zone, mem->group);
	if (ret) {
		if (nr_vmemmap_pages)
			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
		goto out;
	}

	/*
	 * Account once onlining succeeded. If the zone was unpopulated, it is
	 * now already properly populated.
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  nr_vmemmap_pages);

	mem->zone = zone;
out:
	mem_hotplug_done();
	return ret;
}
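
/*
 * Worked example of the vmemmap accounting above (illustrative numbers,
 * assuming a 128 MiB block, 4 KiB base pages and a 64-byte struct page):
 * the block spans 32768 pages, whose memmap needs 32768 * 64 bytes = 2 MiB,
 * i.e. 512 pages. With memmap_on_memory, altmap->free is 512, so
 * online_pages() is called for start_pfn + 512 and 32768 - 512 = 32256
 * pages; the first 512 pages of the block hold the memmap itself.
 */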

/*
 * Must acquire mem_hotplug_lock in write mode.
 */
static int memory_block_offline(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = 0;
	int ret;

	if (!mem->zone)
		return -EINVAL;

	/*
	 * Unaccount before offlining, such that unpopulated zone and kthreads
	 * can properly be torn down in offline_pages().
	 */
	if (mem->altmap)
		nr_vmemmap_pages = mem->altmap->free;

	mem_hotplug_begin();
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  -nr_vmemmap_pages);

	ret = offline_pages(start_pfn + nr_vmemmap_pages,
			    nr_pages - nr_vmemmap_pages, mem->zone, mem->group);
	if (ret) {
		/* offline_pages() failed. Account back. */
		if (nr_vmemmap_pages)
			adjust_present_page_count(pfn_to_page(start_pfn),
						  mem->group, nr_vmemmap_pages);
		goto out;
	}

	if (nr_vmemmap_pages)
		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

	mem->zone = NULL;
out:
	mem_hotplug_done();
	return ret;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(struct memory_block *mem, unsigned long action)
{
	int ret;

	switch (action) {
	case MEM_ONLINE:
		ret = memory_block_online(mem);
		break;
	case MEM_OFFLINE:
		ret = memory_block_offline(mem);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, mem->start_section_nr, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem, to_state);
	mem->state = ret ? from_state_req : to_state;

	return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * When called via device_online() without configuring the online_type,
	 * we want to default to MMOP_ONLINE.
	 */
	if (mem->online_type == MMOP_OFFLINE)
		mem->online_type = MMOP_ONLINE;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	mem->online_type = MMOP_OFFLINE;

	return ret;
}

static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

static ssize_t state_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (online_type < 0)
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}
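
/*
 * Illustrative administrator-facing usage of the "state" attribute above
 * (block id 32 is an arbitrary example):
 *
 *	# cat /sys/devices/system/memory/memory32/state
 *	offline
 *	# echo online_movable > /sys/devices/system/memory/memory32/state
 *	# cat /sys/devices/system/memory/memory32/state
 *	online
 */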

/*
 * Legacy interface that we cannot remove: s390x exposes the storage increment
 * covered by a memory block, allowing for identifying which memory blocks
 * comprise a storage increment. Since a memory block spans complete
 * storage increments nowadays, this interface is basically unused. Other
 * archs never exposed != 0.
 */
static ssize_t phys_device_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);

	return sysfs_emit(buf, "%d\n",
			  arch_get_memory_phys_device(start_pfn));
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int print_allowed_zone(char *buf, int len, int nid,
			      struct memory_group *group,
			      unsigned long start_pfn, unsigned long nr_pages,
			      int online_type, struct zone *default_zone)
{
	struct zone *zone;

	zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
	if (zone == default_zone)
		return 0;

	return sysfs_emit_at(buf, len, " %s", zone->name);
}

static ssize_t valid_zones_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct memory_group *group = mem->group;
	struct zone *default_zone;
	int nid = mem->nid;
	int len = 0;

	/*
	 * Check the existing zone. Make sure that we do that only on
	 * online nodes, otherwise page_zone() is not reliable.
	 */
	if (mem->state == MEM_ONLINE) {
		/*
		 * If !mem->zone, the memory block spans multiple zones and
		 * cannot get offlined.
		 */
		default_zone = mem->zone;
		if (!default_zone)
			return sysfs_emit(buf, "%s\n", "none");
		len += sysfs_emit_at(buf, len, "%s", default_zone->name);
		goto out;
	}

	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
					  start_pfn, nr_pages);

	len += sysfs_emit_at(buf, len, "%s", default_zone->name);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_KERNEL, default_zone);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_MOVABLE, default_zone);
out:
	len += sysfs_emit_at(buf, len, "\n");
	return len;
}
static DEVICE_ATTR_RO(valid_zones);
#endif

static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);

/*
 * Show the memory block size (shared by all memory blocks).
 */
static ssize_t block_size_bytes_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);

/*
 * Memory auto online policy.
 */

static ssize_t auto_online_blocks_show(struct device *dev,
				       struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  online_type_to_str[mhp_default_online_type]);
}

static ssize_t auto_online_blocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);

	if (online_type < 0)
		return -EINVAL;

	mhp_default_online_type = online_type;
	return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);
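
/*
 * Illustrative policy usage (administrator-facing): making newly hot-added
 * memory blocks come up online in ZONE_MOVABLE automatically:
 *
 *	# echo online_movable > /sys/devices/system/memory/auto_online_blocks
 */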

#ifdef CONFIG_CRASH_HOTPLUG
#include <linux/kexec.h>
static ssize_t crash_hotplug_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", crash_hotplug_memory_support());
}
static DEVICE_ATTR_RO(crash_hotplug);
#endif

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace. The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = __add_memory(nid, phys_addr,
			   MIN_MEMORY_BLOCK_SIZE * sections_per_block,
			   MHP_NONE);

	if (ret)
		goto out;

	ret = count;
out:
	unlock_device_hotplug();
	return ret;
}

static DEVICE_ATTR_WO(probe);
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = soft_offline_page(pfn, 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t hard_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, MF_SW_SIMULATED);
	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
#endif

/* See phys_device_show(). */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned memory block device is acquired.
 *
 * Called under device_hotplug_lock.
 */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
	struct memory_block *mem;

	mem = xa_load(&memory_blocks, block_id);
	if (mem)
		get_device(&mem->dev);
	return mem;
}

/*
 * Called under device_hotplug_lock.
 */
struct memory_block *find_memory_block(unsigned long section_nr)
{
	unsigned long block_id = memory_block_id(section_nr);

	return find_memory_block_by_id(block_id);
}
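
/*
 * Illustrative lookup sketch (hypothetical caller, device_hotplug_lock held):
 * both lookup helpers above take a reference on the returned device, so the
 * caller must drop it when done.
 *
 *	struct memory_block *mem;
 *
 *	mem = find_memory_block(pfn_to_section_nr(pfn));
 *	if (mem) {
 *		... inspect mem->state, mem->nid, ... ...
 *		put_device(&mem->dev);
 *	}
 */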

static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static const struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};

static int __add_memory_block(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	ret = device_register(&memory->dev);
	if (ret) {
		put_device(&memory->dev);
		return ret;
	}
	ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
			      GFP_KERNEL));
	if (ret)
		device_unregister(&memory->dev);

	return ret;
}

static struct zone *early_node_zone_for_memory_block(struct memory_block *mem,
						     int nid)
{
	const unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct zone *zone, *matching_zone = NULL;
	pg_data_t *pgdat = NODE_DATA(nid);
	int i;

	/*
	 * This logic only works for early memory, when the applicable zones
	 * already span the memory block. We don't expect overlapping zones on
	 * a single node for early memory. So if we're told that some PFNs
	 * of a node fall into this memory block, we can assume that all node
	 * zones that intersect with the memory block are actually applicable.
	 * No need to look at the memmap.
	 */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		zone = pgdat->node_zones + i;
		if (!populated_zone(zone))
			continue;
		if (!zone_intersects(zone, start_pfn, nr_pages))
			continue;
		if (!matching_zone) {
			matching_zone = zone;
			continue;
		}
		/* Spans multiple zones ... */
		matching_zone = NULL;
		break;
	}
	return matching_zone;
}

#ifdef CONFIG_NUMA
/**
 * memory_block_add_nid() - Indicate that system RAM falling into this memory
 *			    block device (partially) belongs to the given node.
 * @mem: The memory block device.
 * @nid: The node id.
 * @context: The memory initialization context.
 *
 * Indicate that system RAM falling into this memory block (partially) belongs
 * to the given node. If the context indicates ("early") that we are adding the
 * node during node device subsystem initialization, this will also properly
 * set/adjust mem->zone based on the zone ranges of the given node.
 */
void memory_block_add_nid(struct memory_block *mem, int nid,
			  enum meminit_context context)
{
	if (context == MEMINIT_EARLY && mem->nid != nid) {
		/*
		 * For early memory we have to determine the zone when setting
		 * the node id and handle multiple nodes spanning a single
		 * memory block by indicating via zone == NULL that we're not
		 * dealing with a single zone. So if we're setting the node id
		 * the first time, determine if there is a single zone. If we're
		 * setting the node id a second time to a different node,
		 * invalidate the single detected zone.
		 */
		if (mem->nid == NUMA_NO_NODE)
			mem->zone = early_node_zone_for_memory_block(mem, nid);
		else
			mem->zone = NULL;
	}

	/*
	 * If this memory block spans multiple nodes, we only indicate
	 * the last processed node. If we span multiple nodes (not applicable
	 * to hotplugged memory), zone == NULL will prohibit memory offlining
	 * and consequently unplug.
	 */
	mem->nid = nid;
}
#endif

static int add_memory_block(unsigned long block_id, unsigned long state,
			    struct vmem_altmap *altmap,
			    struct memory_group *group)
{
	struct memory_block *mem;
	int ret = 0;

	mem = find_memory_block_by_id(block_id);
	if (mem) {
		put_device(&mem->dev);
		return -EEXIST;
	}
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	mem->start_section_nr = block_id * sections_per_block;
	mem->state = state;
	mem->nid = NUMA_NO_NODE;
	mem->altmap = altmap;
	INIT_LIST_HEAD(&mem->group_next);

#ifndef CONFIG_NUMA
	if (state == MEM_ONLINE)
		/*
		 * MEM_ONLINE at this point implies early memory. With NUMA,
		 * we'll determine the zone when setting the node id via
		 * memory_block_add_nid(). Memory hotplug updates the zone
		 * manually when memory onlining/offlining succeeds.
		 */
		mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE);
#endif /* CONFIG_NUMA */

	ret = __add_memory_block(mem);
	if (ret)
		return ret;

	if (group) {
		mem->group = group;
		list_add(&mem->group_next, &group->memory_blocks);
	}

	return 0;
}

static int __init add_boot_memory_block(unsigned long base_section_nr)
{
	int section_count = 0;
	unsigned long nr;

	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
	     nr++)
		if (present_section_nr(nr))
			section_count++;

	if (section_count == 0)
		return 0;
	return add_memory_block(memory_block_id(base_section_nr),
				MEM_ONLINE, NULL, NULL);
}

static int add_hotplug_memory_block(unsigned long block_id,
				    struct vmem_altmap *altmap,
				    struct memory_group *group)
{
	return add_memory_block(block_id, MEM_OFFLINE, altmap, group);
}

static void remove_memory_block(struct memory_block *memory)
{
	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
		return;

	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);

	if (memory->group) {
		list_del(&memory->group_next);
		memory->group = NULL;
	}

	/* drop the ref. we got via find_memory_block() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}

/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 *
 * Called under device_hotplug_lock.
 */
int create_memory_block_devices(unsigned long start, unsigned long size,
				struct vmem_altmap *altmap,
				struct memory_group *group)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return -EINVAL;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		ret = add_hotplug_memory_block(block_id, altmap, group);
		if (ret)
			break;
	}
	if (ret) {
		end_block_id = block_id;
		for (block_id = start_block_id; block_id != end_block_id;
		     block_id++) {
			mem = find_memory_block_by_id(block_id);
			if (WARN_ON_ONCE(!mem))
				continue;
			remove_memory_block(mem);
		}
	}
	return ret;
}

/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 *
 * Called under device_hotplug_lock.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (WARN_ON_ONCE(!mem))
			continue;
		num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem));
		unregister_memory_block_under_nodes(mem);
		remove_memory_block(mem);
	}
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
#ifdef CONFIG_CRASH_HOTPLUG
	&dev_attr_crash_hotplug.attr,
#endif
	NULL
};

static const struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

/*
 * Initialize the sysfs support for memory devices. At the time this function
 * is called, we cannot have concurrent creation/deletion of memory block
 * devices, so the device_hotplug_lock is not needed.
 */
void __init memory_dev_init(void)
{
	int ret;
	unsigned long block_sz, nr;

	/* Validate the configured memory block size */
	block_sz = memory_block_size_bytes();
	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
		panic("Memory block size not suitable: 0x%lx\n", block_sz);
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		panic("%s() failed to register subsystem: %d\n", __func__, ret);

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized.
	 */
	for (nr = 0; nr <= __highest_present_section_nr;
	     nr += sections_per_block) {
		ret = add_boot_memory_block(nr);
		if (ret)
			panic("%s() failed to add memory block: %d\n", __func__,
			      ret);
	}
}

/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *			by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 *
 * Called under device_hotplug_lock.
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
		       void *arg, walk_memory_blocks_func_t func)
{
	const unsigned long start_block_id = phys_to_block_id(start);
	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (!size)
		return 0;

	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (!mem)
			continue;

		ret = func(mem, arg);
		put_device(&mem->dev);
		if (ret)
			break;
	}
	return ret;
}

struct for_each_memory_block_cb_data {
	walk_memory_blocks_func_t func;
	void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
	struct memory_block *mem = to_memory_block(dev);
	struct for_each_memory_block_cb_data *cb_data = data;

	return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
	struct for_each_memory_block_cb_data cb_data = {
		.func = func,
		.arg = arg,
	};

	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
				for_each_memory_block_cb);
}
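
/*
 * Illustrative walker sketch (hypothetical helper; not part of this file):
 * counting how many memory blocks in a range are currently online.
 *
 *	static int count_online_cb(struct memory_block *mem, void *arg)
 *	{
 *		unsigned long *count = arg;
 *
 *		if (mem->state == MEM_ONLINE)
 *			(*count)++;
 *		return 0;
 *	}
 *
 *	unsigned long nr_online = 0;
 *
 *	walk_memory_blocks(start, size, &nr_online, count_online_cb);
 */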

/*
 * This is an internal helper to unify allocation and initialization of
 * memory groups. Note that the passed memory group will be copied to a
 * dynamically allocated memory group. After this call, the passed
 * memory group should no longer be used.
 */
static int memory_group_register(struct memory_group group)
{
	struct memory_group *new_group;
	uint32_t mgid;
	int ret;

	if (!node_possible(group.nid))
		return -EINVAL;

	new_group = kzalloc(sizeof(group), GFP_KERNEL);
	if (!new_group)
		return -ENOMEM;
	*new_group = group;
	INIT_LIST_HEAD(&new_group->memory_blocks);

	ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
		       GFP_KERNEL);
	if (ret) {
		kfree(new_group);
		return ret;
	} else if (group.is_dynamic) {
		xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
	}
	return mgid;
}

/**
 * memory_group_register_static() - Register a static memory group.
 * @nid: The node id.
 * @max_pages: The maximum number of pages we'll have in this static memory
 *	       group.
 *
 * Register a new static memory group and return the memory group id.
 * All memory in the group belongs to a single unit, such as a DIMM. All
 * memory belonging to a static memory group is added in one go to be removed
 * in one go -- it's static.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
 * returns the new memory group id.
 */
int memory_group_register_static(int nid, unsigned long max_pages)
{
	struct memory_group group = {
		.nid = nid,
		.s = {
			.max_pages = max_pages,
		},
	};

	if (!max_pages)
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_static);

/**
 * memory_group_register_dynamic() - Register a dynamic memory group.
 * @nid: The node id.
 * @unit_pages: Unit in pages in which memory is added/removed in this dynamic
 *		memory group.
 *
 * Register a new dynamic memory group and return the memory group id.
 * Memory within a dynamic memory group is added/removed dynamically
 * in unit_pages.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if unit_pages is invalid (0, not a
 * power of two, smaller than a single memory block). Otherwise, returns the
 * new memory group id.
 */
int memory_group_register_dynamic(int nid, unsigned long unit_pages)
{
	struct memory_group group = {
		.nid = nid,
		.is_dynamic = true,
		.d = {
			.unit_pages = unit_pages,
		},
	};

	if (!unit_pages || !is_power_of_2(unit_pages) ||
	    unit_pages < PHYS_PFN(memory_block_size_bytes()))
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_dynamic);
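
/*
 * Illustrative registration sketch (hypothetical driver; not part of this
 * file): a paravirtualized memory device on node 0 that (un)plugs memory in
 * units of the memory block size could do:
 *
 *	int mgid;
 *
 *	mgid = memory_group_register_dynamic(0,
 *			PHYS_PFN(memory_block_size_bytes()));
 *	if (mgid < 0)
 *		return mgid;
 *	... hot(un)plug memory, associating it with this group id ...
 *	memory_group_unregister(mgid);
 */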

/**
 * memory_group_unregister() - Unregister a memory group.
 * @mgid: the memory group id
 *
 * Unregister a memory group. If any memory block still belongs to this
 * memory group, unregistering will fail.
 *
 * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
 * memory blocks still belong to this memory group and returns 0 if
 * unregistering succeeded.
 */
int memory_group_unregister(int mgid)
{
	struct memory_group *group;

	if (mgid < 0)
		return -EINVAL;

	group = xa_load(&memory_groups, mgid);
	if (!group)
		return -EINVAL;
	if (!list_empty(&group->memory_blocks))
		return -EBUSY;
	xa_erase(&memory_groups, mgid);
	kfree(group);
	return 0;
}
EXPORT_SYMBOL_GPL(memory_group_unregister);

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * lookup a memory group. We don't care about locking, as we don't expect a
 * memory group to get unregistered while adding memory to it -- because
 * the group and the memory are managed by the same driver.
 */
struct memory_group *memory_group_find_by_id(int mgid)
{
	return xa_load(&memory_groups, mgid);
}

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * walk all dynamic memory groups excluding a given memory group, either
 * belonging to a specific node, or belonging to any node.
 */
int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
			       struct memory_group *excluded, void *arg)
{
	struct memory_group *group;
	unsigned long index;
	int ret = 0;

	xa_for_each_marked(&memory_groups, index, group,
			   MEMORY_GROUP_MARK_DYNAMIC) {
		if (group == excluded)
			continue;
#ifdef CONFIG_NUMA
		if (nid != NUMA_NO_NODE && group->nid != nid)
			continue;
#endif /* CONFIG_NUMA */
		ret = func(group, arg);
		if (ret)
			break;
	}
	return ret;
}

#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
void memblk_nr_poison_inc(unsigned long pfn)
{
	const unsigned long block_id = pfn_to_block_id(pfn);
	struct memory_block *mem = find_memory_block_by_id(block_id);

	if (mem)
		atomic_long_inc(&mem->nr_hwpoison);
}

void memblk_nr_poison_sub(unsigned long pfn, long i)
{
	const unsigned long block_id = pfn_to_block_id(pfn);
	struct memory_block *mem = find_memory_block_by_id(block_id);

	if (mem)
		atomic_long_sub(i, &mem->nr_hwpoison);
}

static unsigned long memblk_nr_poison(struct memory_block *mem)
{
	return atomic_long_read(&mem->nr_hwpoison);
}
#endif