// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"

static const char *const online_type_to_str[] = {
	[MMOP_OFFLINE] = "offline",
	[MMOP_ONLINE] = "online",
	[MMOP_ONLINE_KERNEL] = "online_kernel",
	[MMOP_ONLINE_MOVABLE] = "online_movable",
};

int mhp_online_type_from_str(const char *str)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
		if (sysfs_streq(str, online_type_to_str[i]))
			return i;
	}
	return -EINVAL;
}

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline unsigned long memory_block_id(unsigned long section_nr)
{
	return section_nr / sections_per_block;
}

static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
	return memory_block_id(pfn_to_section_nr(pfn));
}

static inline unsigned long phys_to_block_id(unsigned long phys)
{
	return pfn_to_block_id(PFN_DOWN(phys));
}
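
/*
 * Worked example for the helpers above (illustrative only, assuming x86-64
 * defaults of 4 KiB pages and 128 MiB sections, with a 2 GiB memory block
 * size reported by memory_block_size_bytes()):
 *
 *	sections_per_block = 2 GiB / 128 MiB = 16
 *	PFN 0x180000 (physical address 6 GiB) lies in section
 *	0x180000 / PAGES_PER_SECTION = 1572864 / 32768 = 48,
 *	so pfn_to_block_id(0x180000) = 48 / 16 = 3.
 *
 * The actual numbers depend on the architecture's section size and on the
 * configured memory block size.
 */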

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};

/*
 * Memory blocks are cached in a local radix tree to avoid
 * a costly linear search for the corresponding device on
 * the subsystem bus.
 */
static DEFINE_XARRAY(memory_blocks);

/*
 * Memory groups, indexed by memory group id (mgid).
 */
static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
#define MEMORY_GROUP_MARK_DYNAMIC	XA_MARK_1

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	/* Verify that the altmap is freed */
	WARN_ON(mem->altmap);
	kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);

/* Show the memory block ID, relative to the memory block size */
static ssize_t phys_index_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);

	return sysfs_emit(buf, "%08lx\n", memory_block_id(mem->start_section_nr));
}

/*
 * Legacy interface that we cannot remove. Always indicate "removable"
 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	const char *output;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		output = "online";
		break;
	case MEM_OFFLINE:
		output = "offline";
		break;
	case MEM_GOING_OFFLINE:
		output = "going-offline";
		break;
	default:
		WARN_ON(1);
		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
	}

	return sysfs_emit(buf, "%s\n", output);
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
static unsigned long memblk_nr_poison(struct memory_block *mem);
#else
static inline unsigned long memblk_nr_poison(struct memory_block *mem)
{
	return 0;
}
#endif
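
/*
 * Illustrative sketch (not part of this file): other subsystems react to the
 * MEM_GOING_ONLINE/MEM_ONLINE/MEM_GOING_OFFLINE/MEM_OFFLINE/... events fired
 * through memory_notify() above by registering a notifier. The callback and
 * the hypothetical "foo" helpers below are examples only:
 *
 *	static int foo_memory_callback(struct notifier_block *nb,
 *				       unsigned long action, void *data)
 *	{
 *		struct memory_notify *arg = data;
 *
 *		switch (action) {
 *		case MEM_GOING_ONLINE:
 *			return notifier_from_errno(foo_prepare(arg->start_pfn,
 *							       arg->nr_pages));
 *		case MEM_OFFLINE:
 *			foo_release(arg->start_pfn, arg->nr_pages);
 *			break;
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block foo_memory_nb = {
 *		.notifier_call = foo_memory_callback,
 *	};
 *
 *	register_memory_notifier(&foo_memory_nb);
 */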

static int memory_block_online(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = 0;
	struct zone *zone;
	int ret;

	if (memblk_nr_poison(mem))
		return -EHWPOISON;

	zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
				  start_pfn, nr_pages);

	/*
	 * Although vmemmap pages have a different lifecycle than the pages
	 * they describe (they remain until the memory is unplugged), doing
	 * their initialization and accounting at memory onlining/offlining
	 * stage helps to keep accounting easier to follow - e.g. vmemmap
	 * pages belong to the same zone as the memory they describe.
	 */
	if (mem->altmap)
		nr_vmemmap_pages = mem->altmap->free;

	if (nr_vmemmap_pages) {
		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
		if (ret)
			return ret;
	}

	ret = online_pages(start_pfn + nr_vmemmap_pages,
			   nr_pages - nr_vmemmap_pages, zone, mem->group);
	if (ret) {
		if (nr_vmemmap_pages)
			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
		return ret;
	}

	/*
	 * Account once onlining succeeded. If the zone was unpopulated, it is
	 * now already properly populated.
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  nr_vmemmap_pages);

	mem->zone = zone;
	return ret;
}

static int memory_block_offline(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = 0;
	int ret;

	if (!mem->zone)
		return -EINVAL;

	/*
	 * Unaccount before offlining, such that unpopulated zone and kthreads
	 * can properly be torn down in offline_pages().
	 */
	if (mem->altmap)
		nr_vmemmap_pages = mem->altmap->free;

	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  -nr_vmemmap_pages);

	ret = offline_pages(start_pfn + nr_vmemmap_pages,
			    nr_pages - nr_vmemmap_pages, mem->zone, mem->group);
	if (ret) {
		/* offline_pages() failed. Account back. */
		if (nr_vmemmap_pages)
			adjust_present_page_count(pfn_to_page(start_pfn),
						  mem->group, nr_vmemmap_pages);
		return ret;
	}

	if (nr_vmemmap_pages)
		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

	mem->zone = NULL;
	return ret;
}
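
/*
 * Worked example (illustrative, assuming a 128 MiB block, 4 KiB pages and a
 * 64 byte struct page, i.e. the typical "memmap_on_memory" layout): the block
 * has 32768 pages, and its memmap needs 32768 * 64 bytes = 2 MiB = 512 pages.
 * With the altmap placed at the start of the block, nr_vmemmap_pages = 512,
 * so only [start_pfn + 512, start_pfn + 32768) is passed to online_pages() /
 * offline_pages(), while adjust_present_page_count() accounts the 512 vmemmap
 * pages separately.
 */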

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(struct memory_block *mem, unsigned long action)
{
	int ret;

	switch (action) {
	case MEM_ONLINE:
		ret = memory_block_online(mem);
		break;
	case MEM_OFFLINE:
		ret = memory_block_offline(mem);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, mem->start_section_nr, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem, to_state);
	mem->state = ret ? from_state_req : to_state;

	return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * When called via device_online() without configuring the online_type,
	 * we want to default to MMOP_ONLINE.
	 */
	if (mem->online_type == MMOP_OFFLINE)
		mem->online_type = MMOP_ONLINE;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	mem->online_type = MMOP_OFFLINE;

	return ret;
}

static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

static ssize_t state_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (online_type < 0)
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}
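
/*
 * Typical usage of the "state" attribute from userspace (illustrative shell
 * session; the block number is an example, see
 * Documentation/admin-guide/mm/memory-hotplug.rst):
 *
 *	# cat /sys/devices/system/memory/memory32/state
 *	offline
 *	# echo online_movable > /sys/devices/system/memory/memory32/state
 *	# echo offline > /sys/devices/system/memory/memory32/state
 *
 * The accepted strings are exactly those in online_type_to_str[].
 */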

/*
 * Legacy interface that we cannot remove: s390x exposes the storage increment
 * covered by a memory block, allowing for identifying which memory blocks
 * comprise a storage increment. Since a memory block spans complete
 * storage increments nowadays, this interface is basically unused. Other
 * archs never exposed != 0.
 */
static ssize_t phys_device_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);

	return sysfs_emit(buf, "%d\n",
			  arch_get_memory_phys_device(start_pfn));
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int print_allowed_zone(char *buf, int len, int nid,
			      struct memory_group *group,
			      unsigned long start_pfn, unsigned long nr_pages,
			      int online_type, struct zone *default_zone)
{
	struct zone *zone;

	zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
	if (zone == default_zone)
		return 0;

	return sysfs_emit_at(buf, len, " %s", zone->name);
}

static ssize_t valid_zones_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct memory_group *group = mem->group;
	struct zone *default_zone;
	int nid = mem->nid;
	int len = 0;

	/*
	 * Check the existing zone. Make sure that we do that only on the
	 * online nodes otherwise the page_zone is not reliable
	 */
	if (mem->state == MEM_ONLINE) {
		/*
		 * If !mem->zone, the memory block spans multiple zones and
		 * cannot get offlined.
		 */
		default_zone = mem->zone;
		if (!default_zone)
			return sysfs_emit(buf, "%s\n", "none");
		len += sysfs_emit_at(buf, len, "%s", default_zone->name);
		goto out;
	}

	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
					  start_pfn, nr_pages);

	len += sysfs_emit_at(buf, len, "%s", default_zone->name);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_KERNEL, default_zone);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_MOVABLE, default_zone);
out:
	len += sysfs_emit_at(buf, len, "\n");
	return len;
}
static DEVICE_ATTR_RO(valid_zones);
#endif

static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);

/*
 * Show the memory block size (shared by all memory blocks).
 */
static ssize_t block_size_bytes_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);

/*
 * Memory auto online policy.
 */

static ssize_t auto_online_blocks_show(struct device *dev,
				       struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  online_type_to_str[mhp_default_online_type]);
}

static ssize_t auto_online_blocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);

	if (online_type < 0)
		return -EINVAL;

	mhp_default_online_type = online_type;
	return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);

#ifdef CONFIG_CRASH_HOTPLUG
#include <linux/kexec.h>
static ssize_t crash_hotplug_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", crash_hotplug_memory_support());
}
static DEVICE_ATTR_RO(crash_hotplug);
#endif
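
/*
 * The root-level attributes above are used e.g. like this (illustrative
 * shell session; the values depend on the system):
 *
 *	# cat /sys/devices/system/memory/block_size_bytes
 *	8000000
 *	# echo online_movable > /sys/devices/system/memory/auto_online_blocks
 *
 * block_size_bytes is printed in hex without a "0x" prefix (128 MiB in this
 * example); auto_online_blocks accepts the same strings as the per-block
 * "state" attribute, as parsed by mhp_online_type_from_str().
 */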

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace. The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = __add_memory(nid, phys_addr,
			   MIN_MEMORY_BLOCK_SIZE * sections_per_block,
			   MHP_NONE);

	if (ret)
		goto out;

	ret = count;
out:
	unlock_device_hotplug();
	return ret;
}

static DEVICE_ATTR_WO(probe);
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = soft_offline_page(pfn, 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t hard_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, MF_SW_SIMULATED);
	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
#endif
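
/*
 * Note on usage (illustrative): despite the "pfn" variable name, both
 * attributes above take a physical address and shift it down to a PFN
 * themselves. For example, to error-inject the page backing physical
 * address 0x200000000:
 *
 *	# echo 0x200000000 > /sys/devices/system/memory/soft_offline_page
 *	# echo 0x200000000 > /sys/devices/system/memory/hard_offline_page
 *
 * Both require CAP_SYS_ADMIN; hard_offline_page may kill processes that map
 * the page.
 */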

/* See phys_device_show(). */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned memory block device is acquired.
 *
 * Called under device_hotplug_lock.
 */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
	struct memory_block *mem;

	mem = xa_load(&memory_blocks, block_id);
	if (mem)
		get_device(&mem->dev);
	return mem;
}

/*
 * Called under device_hotplug_lock.
 */
struct memory_block *find_memory_block(unsigned long section_nr)
{
	unsigned long block_id = memory_block_id(section_nr);

	return find_memory_block_by_id(block_id);
}

static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static const struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};

static int __add_memory_block(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	ret = device_register(&memory->dev);
	if (ret) {
		put_device(&memory->dev);
		return ret;
	}
	ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
			      GFP_KERNEL));
	if (ret)
		device_unregister(&memory->dev);

	return ret;
}

static struct zone *early_node_zone_for_memory_block(struct memory_block *mem,
						     int nid)
{
	const unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct zone *zone, *matching_zone = NULL;
	pg_data_t *pgdat = NODE_DATA(nid);
	int i;

	/*
	 * This logic only works for early memory, when the applicable zones
	 * already span the memory block. We don't expect overlapping zones on
	 * a single node for early memory. So if we're told that some PFNs
	 * of a node fall into this memory block, we can assume that all node
	 * zones that intersect with the memory block are actually applicable.
	 * No need to look at the memmap.
	 */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		zone = pgdat->node_zones + i;
		if (!populated_zone(zone))
			continue;
		if (!zone_intersects(zone, start_pfn, nr_pages))
			continue;
		if (!matching_zone) {
			matching_zone = zone;
			continue;
		}
		/* Spans multiple zones ... */
		matching_zone = NULL;
		break;
	}
	return matching_zone;
}

#ifdef CONFIG_NUMA
/**
 * memory_block_add_nid() - Indicate that system RAM falling into this memory
 *			    block device (partially) belongs to the given node.
 * @mem: The memory block device.
 * @nid: The node id.
 * @context: The memory initialization context.
 *
 * Indicate that system RAM falling into this memory block (partially) belongs
 * to the given node. If the context indicates ("early") that we are adding the
 * node during node device subsystem initialization, this will also properly
 * set/adjust mem->zone based on the zone ranges of the given node.
 */
void memory_block_add_nid(struct memory_block *mem, int nid,
			  enum meminit_context context)
{
	if (context == MEMINIT_EARLY && mem->nid != nid) {
		/*
		 * For early memory we have to determine the zone when setting
		 * the node id and handle multiple nodes spanning a single
		 * memory block by indicating via zone == NULL that we're not
		 * dealing with a single zone. So if we're setting the node id
		 * the first time, determine if there is a single zone. If we're
		 * setting the node id a second time to a different node,
		 * invalidate the single detected zone.
		 */
		if (mem->nid == NUMA_NO_NODE)
			mem->zone = early_node_zone_for_memory_block(mem, nid);
		else
			mem->zone = NULL;
	}

	/*
	 * If this memory block spans multiple nodes, we only indicate
	 * the last processed node. If we span multiple nodes (not applicable
	 * to hotplugged memory), zone == NULL will prohibit memory offlining
	 * and consequently unplug.
	 */
	mem->nid = nid;
}
#endif

static int add_memory_block(unsigned long block_id, unsigned long state,
			    struct vmem_altmap *altmap,
			    struct memory_group *group)
{
	struct memory_block *mem;
	int ret = 0;

	mem = find_memory_block_by_id(block_id);
	if (mem) {
		put_device(&mem->dev);
		return -EEXIST;
	}
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	mem->start_section_nr = block_id * sections_per_block;
	mem->state = state;
	mem->nid = NUMA_NO_NODE;
	mem->altmap = altmap;
	INIT_LIST_HEAD(&mem->group_next);

#ifndef CONFIG_NUMA
	if (state == MEM_ONLINE)
		/*
		 * MEM_ONLINE at this point implies early memory. With NUMA,
		 * we'll determine the zone when setting the node id via
		 * memory_block_add_nid(). Memory hotplug updates the zone
		 * manually when memory onlining/offlining succeeds.
		 */
		mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE);
#endif /* CONFIG_NUMA */

	ret = __add_memory_block(mem);
	if (ret)
		return ret;

	if (group) {
		mem->group = group;
		list_add(&mem->group_next, &group->memory_blocks);
	}

	return 0;
}

static int __init add_boot_memory_block(unsigned long base_section_nr)
{
	int section_count = 0;
	unsigned long nr;

	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
	     nr++)
		if (present_section_nr(nr))
			section_count++;

	if (section_count == 0)
		return 0;
	return add_memory_block(memory_block_id(base_section_nr),
				MEM_ONLINE, NULL, NULL);
}

static int add_hotplug_memory_block(unsigned long block_id,
				    struct vmem_altmap *altmap,
				    struct memory_group *group)
{
	return add_memory_block(block_id, MEM_OFFLINE, altmap, group);
}

static void remove_memory_block(struct memory_block *memory)
{
	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
		return;

	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);

	if (memory->group) {
		list_del(&memory->group_next);
		memory->group = NULL;
	}

	/* drop the ref. we got via find_memory_block() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}

/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 *
 * Called under device_hotplug_lock.
 */
int create_memory_block_devices(unsigned long start, unsigned long size,
				struct vmem_altmap *altmap,
				struct memory_group *group)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return -EINVAL;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		ret = add_hotplug_memory_block(block_id, altmap, group);
		if (ret)
			break;
	}
	if (ret) {
		end_block_id = block_id;
		for (block_id = start_block_id; block_id != end_block_id;
		     block_id++) {
			mem = find_memory_block_by_id(block_id);
			if (WARN_ON_ONCE(!mem))
				continue;
			remove_memory_block(mem);
		}
	}
	return ret;
}

/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 *
 * Called under device_hotplug_lock.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (WARN_ON_ONCE(!mem))
			continue;
		num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem));
		unregister_memory_block_under_nodes(mem);
		remove_memory_block(mem);
	}
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
#ifdef CONFIG_CRASH_HOTPLUG
	&dev_attr_crash_hotplug.attr,
#endif
	NULL
};

static const struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

/*
 * Initialize the sysfs support for memory devices. At the time this function
 * is called, we cannot have concurrent creation/deletion of memory block
 * devices, so the device_hotplug_lock is not needed.
 */
void __init memory_dev_init(void)
{
	int ret;
	unsigned long block_sz, nr;

	/* Validate the configured memory block size */
	block_sz = memory_block_size_bytes();
	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
		panic("Memory block size not suitable: 0x%lx\n", block_sz);
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		panic("%s() failed to register subsystem: %d\n", __func__, ret);

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (nr = 0; nr <= __highest_present_section_nr;
	     nr += sections_per_block) {
		ret = add_boot_memory_block(nr);
		if (ret)
			panic("%s() failed to add memory block: %d\n", __func__,
			      ret);
	}
}

/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *			by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 *
 * Called under device_hotplug_lock.
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
		       void *arg, walk_memory_blocks_func_t func)
{
	const unsigned long start_block_id = phys_to_block_id(start);
	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (!size)
		return 0;

	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (!mem)
			continue;

		ret = func(mem, arg);
		put_device(&mem->dev);
		if (ret)
			break;
	}
	return ret;
}

struct for_each_memory_block_cb_data {
	walk_memory_blocks_func_t func;
	void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
	struct memory_block *mem = to_memory_block(dev);
	struct for_each_memory_block_cb_data *cb_data = data;

	return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
	struct for_each_memory_block_cb_data cb_data = {
		.func = func,
		.arg = arg,
	};

	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
				for_each_memory_block_cb);
}
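
/*
 * Illustrative sketch of a walk_memory_blocks() user (the callback and the
 * "count_online" name are hypothetical, not part of this file):
 *
 *	static int count_online(struct memory_block *mem, void *arg)
 *	{
 *		unsigned long *nr_online = arg;
 *
 *		if (mem->state == MEM_ONLINE)
 *			(*nr_online)++;
 *		return 0;	(a non-zero return would abort the walk)
 *	}
 *
 *	unsigned long nr_online = 0;
 *
 *	walk_memory_blocks(start, size, &nr_online, count_online);
 *
 * Callers must hold the device_hotplug_lock, as noted above.
 */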

/*
 * This is an internal helper to unify allocation and initialization of
 * memory groups. Note that the passed memory group will be copied to a
 * dynamically allocated memory group. After this call, the passed
 * memory group should no longer be used.
 */
static int memory_group_register(struct memory_group group)
{
	struct memory_group *new_group;
	uint32_t mgid;
	int ret;

	if (!node_possible(group.nid))
		return -EINVAL;

	new_group = kzalloc(sizeof(group), GFP_KERNEL);
	if (!new_group)
		return -ENOMEM;
	*new_group = group;
	INIT_LIST_HEAD(&new_group->memory_blocks);

	ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
		       GFP_KERNEL);
	if (ret) {
		kfree(new_group);
		return ret;
	} else if (group.is_dynamic) {
		xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
	}
	return mgid;
}

/**
 * memory_group_register_static() - Register a static memory group.
 * @nid: The node id.
 * @max_pages: The maximum number of pages we'll have in this static memory
 *	       group.
 *
 * Register a new static memory group and return the memory group id.
 * All memory in the group belongs to a single unit, such as a DIMM. All
 * memory belonging to a static memory group is added in one go to be removed
 * in one go -- it's static.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
 * returns the new memory group id.
 */
int memory_group_register_static(int nid, unsigned long max_pages)
{
	struct memory_group group = {
		.nid = nid,
		.s = {
			.max_pages = max_pages,
		},
	};

	if (!max_pages)
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_static);

/**
 * memory_group_register_dynamic() - Register a dynamic memory group.
 * @nid: The node id.
 * @unit_pages: Unit in pages in which memory is added/removed in this dynamic
 *		memory group.
 *
 * Register a new dynamic memory group and return the memory group id.
 * Memory within a dynamic memory group is added/removed dynamically
 * in unit_pages.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if unit_pages is invalid (0, not a
 * power of two, smaller than a single memory block). Otherwise, returns the
 * new memory group id.
 */
int memory_group_register_dynamic(int nid, unsigned long unit_pages)
{
	struct memory_group group = {
		.nid = nid,
		.is_dynamic = true,
		.d = {
			.unit_pages = unit_pages,
		},
	};

	if (!unit_pages || !is_power_of_2(unit_pages) ||
	    unit_pages < PHYS_PFN(memory_block_size_bytes()))
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_dynamic);
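
/*
 * Illustrative sketch of how a hotplug driver might use a dynamic memory
 * group (the "foo" resource name is hypothetical; passing the group id in
 * place of a node id via MHP_NID_IS_MGID mirrors what drivers like
 * virtio-mem do):
 *
 *	int mgid = memory_group_register_dynamic(nid, unit_pages);
 *
 *	if (mgid < 0)
 *		return mgid;
 *	rc = add_memory_driver_managed(mgid, addr, size, "System RAM (foo)",
 *				       MHP_NID_IS_MGID);
 *	...
 *	memory_group_unregister(mgid);
 */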

/**
 * memory_group_unregister() - Unregister a memory group.
 * @mgid: the memory group id
 *
 * Unregister a memory group. If any memory block still belongs to this
 * memory group, unregistering will fail.
 *
 * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
 * memory blocks still belong to this memory group and returns 0 if
 * unregistering succeeded.
 */
int memory_group_unregister(int mgid)
{
	struct memory_group *group;

	if (mgid < 0)
		return -EINVAL;

	group = xa_load(&memory_groups, mgid);
	if (!group)
		return -EINVAL;
	if (!list_empty(&group->memory_blocks))
		return -EBUSY;
	xa_erase(&memory_groups, mgid);
	kfree(group);
	return 0;
}
EXPORT_SYMBOL_GPL(memory_group_unregister);

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * lookup a memory group. We don't care about locking, as we don't expect a
 * memory group to get unregistered while adding memory to it -- because
 * the group and the memory are managed by the same driver.
 */
struct memory_group *memory_group_find_by_id(int mgid)
{
	return xa_load(&memory_groups, mgid);
}

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * walk all dynamic memory groups excluding a given memory group, either
 * belonging to a specific node, or belonging to any node.
 */
int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
			       struct memory_group *excluded, void *arg)
{
	struct memory_group *group;
	unsigned long index;
	int ret = 0;

	xa_for_each_marked(&memory_groups, index, group,
			   MEMORY_GROUP_MARK_DYNAMIC) {
		if (group == excluded)
			continue;
#ifdef CONFIG_NUMA
		if (nid != NUMA_NO_NODE && group->nid != nid)
			continue;
#endif /* CONFIG_NUMA */
		ret = func(group, arg);
		if (ret)
			break;
	}
	return ret;
}

#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
void memblk_nr_poison_inc(unsigned long pfn)
{
	const unsigned long block_id = pfn_to_block_id(pfn);
	struct memory_block *mem = find_memory_block_by_id(block_id);

	if (mem)
		atomic_long_inc(&mem->nr_hwpoison);
}

void memblk_nr_poison_sub(unsigned long pfn, long i)
{
	const unsigned long block_id = pfn_to_block_id(pfn);
	struct memory_block *mem = find_memory_block_by_id(block_id);

	if (mem)
		atomic_long_sub(i, &mem->nr_hwpoison);
}

static unsigned long memblk_nr_poison(struct memory_block *mem)
{
	return atomic_long_read(&mem->nr_hwpoison);
}
#endif