// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"

static const char *const online_type_to_str[] = {
	[MMOP_OFFLINE] = "offline",
	[MMOP_ONLINE] = "online",
	[MMOP_ONLINE_KERNEL] = "online_kernel",
	[MMOP_ONLINE_MOVABLE] = "online_movable",
};

int mhp_online_type_from_str(const char *str)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
		if (sysfs_streq(str, online_type_to_str[i]))
			return i;
	}
	return -EINVAL;
}

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline unsigned long memory_block_id(unsigned long section_nr)
{
	return section_nr / sections_per_block;
}

static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
	return memory_block_id(pfn_to_section_nr(pfn));
}

static inline unsigned long phys_to_block_id(unsigned long phys)
{
	return pfn_to_block_id(PFN_DOWN(phys));
}

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};

/*
 * Memory blocks are cached in a local radix tree to avoid
 * a costly linear search for the corresponding device on
 * the subsystem bus.
 */
static DEFINE_XARRAY(memory_blocks);

/*
 * Memory groups, indexed by memory group id (mgid).
 */
static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
#define MEMORY_GROUP_MARK_DYNAMIC	XA_MARK_1

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);
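
/*
 * Illustrative sketch only (not compiled here): a driver that needs to react
 * to memory block state transitions can register a callback on the notifier
 * chain above. foo_memory_callback(), foo_release_pages() and foo_memory_nb
 * are hypothetical names; the notify argument is a struct memory_notify
 * describing the affected PFN range.
 *
 *	static int foo_memory_callback(struct notifier_block *nb,
 *				       unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mhp = arg;
 *
 *		switch (action) {
 *		case MEM_GOING_OFFLINE:
 *			foo_release_pages(mhp->start_pfn, mhp->nr_pages);
 *			break;
 *		case MEM_ONLINE:
 *		case MEM_CANCEL_OFFLINE:
 *			break;
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block foo_memory_nb = {
 *		.notifier_call = foo_memory_callback,
 *	};
 *
 *	register_memory_notifier(&foo_memory_nb);
 */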

static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);

/* Show the memory block ID, relative to the memory block size */
static ssize_t phys_index_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);

	return sysfs_emit(buf, "%08lx\n", memory_block_id(mem->start_section_nr));
}

/*
 * Legacy interface that we cannot remove. Always indicate "removable"
 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	const char *output;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		output = "online";
		break;
	case MEM_OFFLINE:
		output = "offline";
		break;
	case MEM_GOING_OFFLINE:
		output = "going-offline";
		break;
	default:
		WARN_ON(1);
		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
	}

	return sysfs_emit(buf, "%s\n", output);
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
static unsigned long memblk_nr_poison(struct memory_block *mem);
#else
static inline unsigned long memblk_nr_poison(struct memory_block *mem)
{
	return 0;
}
#endif

static int memory_block_online(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
	struct zone *zone;
	int ret;

	if (memblk_nr_poison(mem))
		return -EHWPOISON;

	zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
				  start_pfn, nr_pages);

	/*
	 * Although vmemmap pages have a different lifecycle than the pages
	 * they describe (they remain until the memory is unplugged), doing
	 * their initialization and accounting at memory onlining/offlining
	 * stage helps to keep accounting easier to follow - e.g. vmemmap
	 * pages belong to the same zone as the memory they back.
	 */
	if (nr_vmemmap_pages) {
		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
		if (ret)
			return ret;
	}

	ret = online_pages(start_pfn + nr_vmemmap_pages,
			   nr_pages - nr_vmemmap_pages, zone, mem->group);
	if (ret) {
		if (nr_vmemmap_pages)
			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
		return ret;
	}

	/*
	 * Account once onlining succeeded. If the zone was unpopulated, it is
	 * now already properly populated.
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  nr_vmemmap_pages);

	mem->zone = zone;
	return ret;
}

static int memory_block_offline(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
	int ret;

	if (!mem->zone)
		return -EINVAL;

	/*
	 * Unaccount before offlining, such that unpopulated zone and kthreads
	 * can properly be torn down in offline_pages().
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  -nr_vmemmap_pages);

	ret = offline_pages(start_pfn + nr_vmemmap_pages,
			    nr_pages - nr_vmemmap_pages, mem->zone, mem->group);
	if (ret) {
		/* offline_pages() failed. Account back. */
		if (nr_vmemmap_pages)
			adjust_present_page_count(pfn_to_page(start_pfn),
						  mem->group, nr_vmemmap_pages);
		return ret;
	}

	if (nr_vmemmap_pages)
		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

	mem->zone = NULL;
	return ret;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(struct memory_block *mem, unsigned long action)
{
	int ret;

	switch (action) {
	case MEM_ONLINE:
		ret = memory_block_online(mem);
		break;
	case MEM_OFFLINE:
		ret = memory_block_offline(mem);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, mem->start_section_nr, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem, to_state);
	mem->state = ret ? from_state_req : to_state;

	return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * When called via device_online() without configuring the online_type,
	 * we want to default to MMOP_ONLINE.
	 */
	if (mem->online_type == MMOP_OFFLINE)
		mem->online_type = MMOP_ONLINE;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	mem->online_type = MMOP_OFFLINE;

	return ret;
}

static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}

static ssize_t state_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (online_type < 0)
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}

/*
 * Legacy interface that we cannot remove: s390x exposes the storage increment
 * covered by a memory block, allowing for identifying which memory blocks
 * comprise a storage increment. Since a memory block spans complete
 * storage increments nowadays, this interface is basically unused. Other
 * archs never exposed != 0.
 */
static ssize_t phys_device_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);

	return sysfs_emit(buf, "%d\n",
			  arch_get_memory_phys_device(start_pfn));
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int print_allowed_zone(char *buf, int len, int nid,
			      struct memory_group *group,
			      unsigned long start_pfn, unsigned long nr_pages,
			      int online_type, struct zone *default_zone)
{
	struct zone *zone;

	zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
	if (zone == default_zone)
		return 0;

	return sysfs_emit_at(buf, len, " %s", zone->name);
}

static ssize_t valid_zones_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct memory_group *group = mem->group;
	struct zone *default_zone;
	int nid = mem->nid;
	int len = 0;

	/*
	 * Check the existing zone. Make sure that we do that only on the
	 * online nodes, otherwise the page_zone is not reliable.
	 */
	if (mem->state == MEM_ONLINE) {
		/*
		 * If !mem->zone, the memory block spans multiple zones and
		 * cannot get offlined.
		 */
		default_zone = mem->zone;
		if (!default_zone)
			return sysfs_emit(buf, "%s\n", "none");
		len += sysfs_emit_at(buf, len, "%s", default_zone->name);
		goto out;
	}

	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
					  start_pfn, nr_pages);

	len += sysfs_emit_at(buf, len, "%s", default_zone->name);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_KERNEL, default_zone);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_MOVABLE, default_zone);
out:
	len += sysfs_emit_at(buf, len, "\n");
	return len;
}
static DEVICE_ATTR_RO(valid_zones);
#endif
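
/*
 * For reference (illustrative output only): an offline block that could be
 * onlined into either the default zone or ZONE_MOVABLE typically reads as
 * something like "Normal Movable", with the default zone listed first. An
 * online block reports only the zone it was onlined to, or "none" if it
 * spans multiple zones and therefore cannot be offlined.
 */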

static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);

/*
 * Show the memory block size (shared by all memory blocks).
 */
static ssize_t block_size_bytes_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);

/*
 * Memory auto online policy.
 */

static ssize_t auto_online_blocks_show(struct device *dev,
				       struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  online_type_to_str[mhp_default_online_type]);
}

static ssize_t auto_online_blocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);

	if (online_type < 0)
		return -EINVAL;

	mhp_default_online_type = online_type;
	return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace. The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = __add_memory(nid, phys_addr,
			   MIN_MEMORY_BLOCK_SIZE * sections_per_block,
			   MHP_NONE);

	if (ret)
		goto out;

	ret = count;
out:
	unlock_device_hotplug();
	return ret;
}

static DEVICE_ATTR_WO(probe);
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = soft_offline_page(pfn, 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t hard_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, MF_SW_SIMULATED);
	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
#endif

/* See phys_device_show(). */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned memory block device is acquired.
 *
 * Called under device_hotplug_lock.
 */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
	struct memory_block *mem;

	mem = xa_load(&memory_blocks, block_id);
	if (mem)
		get_device(&mem->dev);
	return mem;
}

/*
 * Called under device_hotplug_lock.
 */
struct memory_block *find_memory_block(unsigned long section_nr)
{
	unsigned long block_id = memory_block_id(section_nr);

	return find_memory_block_by_id(block_id);
}

static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static const struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};

static int __add_memory_block(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	ret = device_register(&memory->dev);
	if (ret) {
		put_device(&memory->dev);
		return ret;
	}
	ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
			      GFP_KERNEL));
	if (ret)
		device_unregister(&memory->dev);

	return ret;
}

static struct zone *early_node_zone_for_memory_block(struct memory_block *mem,
						     int nid)
{
	const unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct zone *zone, *matching_zone = NULL;
	pg_data_t *pgdat = NODE_DATA(nid);
	int i;

	/*
	 * This logic only works for early memory, when the applicable zones
	 * already span the memory block. We don't expect overlapping zones on
	 * a single node for early memory. So if we're told that some PFNs
	 * of a node fall into this memory block, we can assume that all node
	 * zones that intersect with the memory block are actually applicable.
	 * No need to look at the memmap.
	 */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		zone = pgdat->node_zones + i;
		if (!populated_zone(zone))
			continue;
		if (!zone_intersects(zone, start_pfn, nr_pages))
			continue;
		if (!matching_zone) {
			matching_zone = zone;
			continue;
		}
		/* Spans multiple zones ... */
		matching_zone = NULL;
		break;
	}
	return matching_zone;
}

#ifdef CONFIG_NUMA
/**
 * memory_block_add_nid() - Indicate that system RAM falling into this memory
 *			    block device (partially) belongs to the given node.
 * @mem: The memory block device.
 * @nid: The node id.
 * @context: The memory initialization context.
 *
 * Indicate that system RAM falling into this memory block (partially) belongs
 * to the given node. If the context indicates ("early") that we are adding the
 * node during node device subsystem initialization, this will also properly
 * set/adjust mem->zone based on the zone ranges of the given node.
 */
void memory_block_add_nid(struct memory_block *mem, int nid,
			  enum meminit_context context)
{
	if (context == MEMINIT_EARLY && mem->nid != nid) {
		/*
		 * For early memory we have to determine the zone when setting
		 * the node id and handle multiple nodes spanning a single
		 * memory block by indicating via zone == NULL that we're not
		 * dealing with a single zone. So if we're setting the node id
		 * the first time, determine if there is a single zone. If we're
		 * setting the node id a second time to a different node,
		 * invalidate the single detected zone.
		 */
		if (mem->nid == NUMA_NO_NODE)
			mem->zone = early_node_zone_for_memory_block(mem, nid);
		else
			mem->zone = NULL;
	}

	/*
	 * If this memory block spans multiple nodes, we only indicate
	 * the last processed node. If we span multiple nodes (not applicable
	 * to hotplugged memory), zone == NULL will prohibit memory offlining
	 * and consequently unplug.
	 */
	mem->nid = nid;
}
#endif

static int add_memory_block(unsigned long block_id, unsigned long state,
			    unsigned long nr_vmemmap_pages,
			    struct memory_group *group)
{
	struct memory_block *mem;
	int ret = 0;

	mem = find_memory_block_by_id(block_id);
	if (mem) {
		put_device(&mem->dev);
		return -EEXIST;
	}
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	mem->start_section_nr = block_id * sections_per_block;
	mem->state = state;
	mem->nid = NUMA_NO_NODE;
	mem->nr_vmemmap_pages = nr_vmemmap_pages;
	INIT_LIST_HEAD(&mem->group_next);

#ifndef CONFIG_NUMA
	if (state == MEM_ONLINE)
		/*
		 * MEM_ONLINE at this point implies early memory. With NUMA,
		 * we'll determine the zone when setting the node id via
		 * memory_block_add_nid(). Memory hotplug updates the zone
		 * manually when memory onlining/offlining succeeds.
		 */
		mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE);
#endif /* CONFIG_NUMA */

	ret = __add_memory_block(mem);
	if (ret)
		return ret;

	if (group) {
		mem->group = group;
		list_add(&mem->group_next, &group->memory_blocks);
	}

	return 0;
}

static int __init add_boot_memory_block(unsigned long base_section_nr)
{
	int section_count = 0;
	unsigned long nr;

	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
	     nr++)
		if (present_section_nr(nr))
			section_count++;

	if (section_count == 0)
		return 0;
	return add_memory_block(memory_block_id(base_section_nr),
				MEM_ONLINE, 0, NULL);
}

static int add_hotplug_memory_block(unsigned long block_id,
				    unsigned long nr_vmemmap_pages,
				    struct memory_group *group)
{
	return add_memory_block(block_id, MEM_OFFLINE, nr_vmemmap_pages, group);
}

static void remove_memory_block(struct memory_block *memory)
{
	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
		return;

	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);

	if (memory->group) {
		list_del(&memory->group_next);
		memory->group = NULL;
	}

	/* drop the ref. we got via find_memory_block() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}

/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 *
 * Called under device_hotplug_lock.
 */
int create_memory_block_devices(unsigned long start, unsigned long size,
				unsigned long vmemmap_pages,
				struct memory_group *group)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return -EINVAL;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		ret = add_hotplug_memory_block(block_id, vmemmap_pages, group);
		if (ret)
			break;
	}
	if (ret) {
		end_block_id = block_id;
		for (block_id = start_block_id; block_id != end_block_id;
		     block_id++) {
			mem = find_memory_block_by_id(block_id);
			if (WARN_ON_ONCE(!mem))
				continue;
			remove_memory_block(mem);
		}
	}
	return ret;
}

/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 *
 * Called under device_hotplug_lock.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (WARN_ON_ONCE(!mem))
			continue;
		num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem));
		unregister_memory_block_under_nodes(mem);
		remove_memory_block(mem);
	}
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
	NULL
};

static const struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

/*
 * Initialize the sysfs support for memory devices. At the time this function
 * is called, we cannot have concurrent creation/deletion of memory block
 * devices, so the device_hotplug_lock is not needed.
 */
void __init memory_dev_init(void)
{
	int ret;
	unsigned long block_sz, nr;

	/* Validate the configured memory block size */
	block_sz = memory_block_size_bytes();
	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
		panic("Memory block size not suitable: 0x%lx\n", block_sz);
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		panic("%s() failed to register subsystem: %d\n", __func__, ret);

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (nr = 0; nr <= __highest_present_section_nr;
	     nr += sections_per_block) {
		ret = add_boot_memory_block(nr);
		if (ret)
			panic("%s() failed to add memory block: %d\n", __func__,
			      ret);
	}
}

/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *			by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 *
 * Called under device_hotplug_lock.
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
		       void *arg, walk_memory_blocks_func_t func)
{
	const unsigned long start_block_id = phys_to_block_id(start);
	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (!size)
		return 0;

	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (!mem)
			continue;

		ret = func(mem, arg);
		put_device(&mem->dev);
		if (ret)
			break;
	}
	return ret;
}
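
/*
 * Illustrative sketch only (not compiled here): counting the memory blocks
 * that overlap a physical range, while holding device_hotplug_lock as
 * documented above. count_blocks_cb() is a hypothetical callback matching
 * walk_memory_blocks_func_t.
 *
 *	static int count_blocks_cb(struct memory_block *mem, void *arg)
 *	{
 *		unsigned long *count = arg;
 *
 *		(*count)++;
 *		return 0;
 *	}
 *
 *	unsigned long count = 0;
 *
 *	walk_memory_blocks(start, size, &count, count_blocks_cb);
 */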

struct for_each_memory_block_cb_data {
	walk_memory_blocks_func_t func;
	void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
	struct memory_block *mem = to_memory_block(dev);
	struct for_each_memory_block_cb_data *cb_data = data;

	return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
	struct for_each_memory_block_cb_data cb_data = {
		.func = func,
		.arg = arg,
	};

	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
				for_each_memory_block_cb);
}

/*
 * This is an internal helper to unify allocation and initialization of
 * memory groups. Note that the passed memory group will be copied to a
 * dynamically allocated memory group. After this call, the passed
 * memory group should no longer be used.
 */
static int memory_group_register(struct memory_group group)
{
	struct memory_group *new_group;
	uint32_t mgid;
	int ret;

	if (!node_possible(group.nid))
		return -EINVAL;

	new_group = kzalloc(sizeof(group), GFP_KERNEL);
	if (!new_group)
		return -ENOMEM;
	*new_group = group;
	INIT_LIST_HEAD(&new_group->memory_blocks);

	ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
		       GFP_KERNEL);
	if (ret) {
		kfree(new_group);
		return ret;
	} else if (group.is_dynamic) {
		xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
	}
	return mgid;
}

/**
 * memory_group_register_static() - Register a static memory group.
 * @nid: The node id.
 * @max_pages: The maximum number of pages we'll have in this static memory
 *	       group.
 *
 * Register a new static memory group and return the memory group id.
 * All memory in the group belongs to a single unit, such as a DIMM. All
 * memory belonging to a static memory group is added in one go to be removed
 * in one go -- it's static.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
 * returns the new memory group id.
 */
int memory_group_register_static(int nid, unsigned long max_pages)
{
	struct memory_group group = {
		.nid = nid,
		.s = {
			.max_pages = max_pages,
		},
	};

	if (!max_pages)
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_static);
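
/*
 * Illustrative sketch only (not compiled here): a driver hot-adding a single
 * DIMM-like range could group its memory blocks by registering a static
 * memory group and passing the group id instead of a node id via
 * MHP_NID_IS_MGID. Error handling is omitted; "start", "size" and the
 * "System RAM (foo)" resource name are placeholders.
 *
 *	int mgid = memory_group_register_static(nid, PHYS_PFN(size));
 *
 *	if (mgid < 0)
 *		return mgid;
 *	add_memory_driver_managed(mgid, start, size, "System RAM (foo)",
 *				  MHP_NID_IS_MGID);
 */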

/**
 * memory_group_register_dynamic() - Register a dynamic memory group.
 * @nid: The node id.
 * @unit_pages: Unit in pages in which memory is added/removed in this dynamic
 *		memory group.
 *
 * Register a new dynamic memory group and return the memory group id.
 * Memory within a dynamic memory group is added/removed dynamically
 * in unit_pages.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if unit_pages is invalid (0, not a
 * power of two, smaller than a single memory block). Otherwise, returns the
 * new memory group id.
 */
int memory_group_register_dynamic(int nid, unsigned long unit_pages)
{
	struct memory_group group = {
		.nid = nid,
		.is_dynamic = true,
		.d = {
			.unit_pages = unit_pages,
		},
	};

	if (!unit_pages || !is_power_of_2(unit_pages) ||
	    unit_pages < PHYS_PFN(memory_block_size_bytes()))
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_dynamic);

/**
 * memory_group_unregister() - Unregister a memory group.
 * @mgid: the memory group id
 *
 * Unregister a memory group. If any memory block still belongs to this
 * memory group, unregistering will fail.
 *
 * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
 * memory blocks still belong to this memory group and returns 0 if
 * unregistering succeeded.
 */
int memory_group_unregister(int mgid)
{
	struct memory_group *group;

	if (mgid < 0)
		return -EINVAL;

	group = xa_load(&memory_groups, mgid);
	if (!group)
		return -EINVAL;
	if (!list_empty(&group->memory_blocks))
		return -EBUSY;
	xa_erase(&memory_groups, mgid);
	kfree(group);
	return 0;
}
EXPORT_SYMBOL_GPL(memory_group_unregister);

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * lookup a memory group. We don't care about locking, as we don't expect a
 * memory group to get unregistered while adding memory to it -- because
 * the group and the memory are managed by the same driver.
 */
struct memory_group *memory_group_find_by_id(int mgid)
{
	return xa_load(&memory_groups, mgid);
}

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * walk all dynamic memory groups excluding a given memory group, either
 * belonging to a specific node, or belonging to any node.
 */
int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
			       struct memory_group *excluded, void *arg)
{
	struct memory_group *group;
	unsigned long index;
	int ret = 0;

	xa_for_each_marked(&memory_groups, index, group,
			   MEMORY_GROUP_MARK_DYNAMIC) {
		if (group == excluded)
			continue;
#ifdef CONFIG_NUMA
		if (nid != NUMA_NO_NODE && group->nid != nid)
			continue;
#endif /* CONFIG_NUMA */
		ret = func(group, arg);
		if (ret)
			break;
	}
	return ret;
}

#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
void memblk_nr_poison_inc(unsigned long pfn)
{
	const unsigned long block_id = pfn_to_block_id(pfn);
	struct memory_block *mem = find_memory_block_by_id(block_id);

	if (mem)
		atomic_long_inc(&mem->nr_hwpoison);
}

void memblk_nr_poison_sub(unsigned long pfn, long i)
{
	const unsigned long block_id = pfn_to_block_id(pfn);
	struct memory_block *mem = find_memory_block_by_id(block_id);

	if (mem)
		atomic_long_sub(i, &mem->nr_hwpoison);
}

static unsigned long memblk_nr_poison(struct memory_block *mem)
{
	return atomic_long_read(&mem->nr_hwpoison);
}
#endif