// SPDX-License-Identifier: GPL-2.0
/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include <linux/atomic.h>
#include <linux/uaccess.h>

#define MEMORY_CLASS_NAME	"memory"

static const char *const online_type_to_str[] = {
	[MMOP_OFFLINE] = "offline",
	[MMOP_ONLINE] = "online",
	[MMOP_ONLINE_KERNEL] = "online_kernel",
	[MMOP_ONLINE_MOVABLE] = "online_movable",
};

int mhp_online_type_from_str(const char *str)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
		if (sysfs_streq(str, online_type_to_str[i]))
			return i;
	}
	return -EINVAL;
}

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline unsigned long memory_block_id(unsigned long section_nr)
{
	return section_nr / sections_per_block;
}

static inline unsigned long pfn_to_block_id(unsigned long pfn)
{
	return memory_block_id(pfn_to_section_nr(pfn));
}

static inline unsigned long phys_to_block_id(unsigned long phys)
{
	return pfn_to_block_id(PFN_DOWN(phys));
}
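/*
 * Worked example (illustrative only; assumes x86-64 defaults with 4 KiB pages
 * and SECTION_SIZE_BITS == 27, i.e. 128 MiB sections, and an architecture
 * reporting a 2 GiB memory block size): sections_per_block is then
 * 2 GiB / 128 MiB = 16, so physical address 0x100000000 (PFN 0x100000,
 * section 32) falls into memory block id 32 / 16 = 2.
 */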
static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};

/*
 * Memory blocks are cached in a local radix tree to avoid
 * a costly linear search for the corresponding device on
 * the subsystem bus.
 */
static DEFINE_XARRAY(memory_blocks);

/*
 * Memory groups, indexed by memory group id (mgid).
 */
static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
#define MEMORY_GROUP_MARK_DYNAMIC	XA_MARK_1

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);
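/*
 * Example (illustrative sketch, not part of this file; the "foo_" helpers are
 * hypothetical): a subscriber registers a notifier_block and receives a
 * struct memory_notify describing the affected PFN range. MEM_GOING_* events
 * can be vetoed by returning notifier_from_errno().
 *
 *	static int foo_memory_callback(struct notifier_block *nb,
 *				       unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		switch (action) {
 *		case MEM_GOING_OFFLINE:
 *			if (foo_range_busy(mn->start_pfn, mn->nr_pages))
 *				return notifier_from_errno(-EBUSY);
 *			break;
 *		case MEM_ONLINE:
 *			foo_track_range(mn->start_pfn, mn->nr_pages);
 *			break;
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block foo_memory_nb = {
 *		.notifier_call = foo_memory_callback,
 *	};
 *
 *	register_memory_notifier(&foo_memory_nb);
 */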
static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}
EXPORT_SYMBOL_GPL(memory_block_size_bytes);

/*
 * Show the first physical section index (number) of this memory block.
 */
static ssize_t phys_index_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;

	return sysfs_emit(buf, "%08lx\n", phys_index);
}

/*
 * Legacy interface that we cannot remove. Always indicate "removable"
 * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	const char *output;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		output = "online";
		break;
	case MEM_OFFLINE:
		output = "offline";
		break;
	case MEM_GOING_OFFLINE:
		output = "going-offline";
		break;
	default:
		WARN_ON(1);
		return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
	}

	return sysfs_emit(buf, "%s\n", output);
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
static unsigned long memblk_nr_poison(struct memory_block *mem);
#else
static inline unsigned long memblk_nr_poison(struct memory_block *mem)
{
	return 0;
}
#endif

static int memory_block_online(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
	struct zone *zone;
	int ret;

	if (memblk_nr_poison(mem))
		return -EHWPOISON;

	zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
				  start_pfn, nr_pages);

	/*
	 * Although vmemmap pages have a different lifecycle than the pages
	 * they describe (they remain until the memory is unplugged), doing
	 * their initialization and accounting at memory onlining/offlining
	 * stage helps to keep accounting easier to follow - e.g. vmemmap
	 * pages belong to the same zone as the memory they describe.
	 */
	if (nr_vmemmap_pages) {
		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
		if (ret)
			return ret;
	}

	ret = online_pages(start_pfn + nr_vmemmap_pages,
			   nr_pages - nr_vmemmap_pages, zone, mem->group);
	if (ret) {
		if (nr_vmemmap_pages)
			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
		return ret;
	}

	/*
	 * Account once onlining succeeded. If the zone was unpopulated, it is
	 * now already properly populated.
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  nr_vmemmap_pages);

	mem->zone = zone;
	return ret;
}
static int memory_block_offline(struct memory_block *mem)
{
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
	int ret;

	if (!mem->zone)
		return -EINVAL;

	/*
	 * Unaccount before offlining, such that unpopulated zone and kthreads
	 * can properly be torn down in offline_pages().
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
					  -nr_vmemmap_pages);

	ret = offline_pages(start_pfn + nr_vmemmap_pages,
			    nr_pages - nr_vmemmap_pages, mem->zone, mem->group);
	if (ret) {
		/* offline_pages() failed. Account back. */
		if (nr_vmemmap_pages)
			adjust_present_page_count(pfn_to_page(start_pfn),
						  mem->group, nr_vmemmap_pages);
		return ret;
	}

	if (nr_vmemmap_pages)
		mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);

	mem->zone = NULL;
	return ret;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(struct memory_block *mem, unsigned long action)
{
	int ret;

	switch (action) {
	case MEM_ONLINE:
		ret = memory_block_online(mem);
		break;
	case MEM_OFFLINE:
		ret = memory_block_offline(mem);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, mem->start_section_nr, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem, to_state);
	mem->state = ret ? from_state_req : to_state;

	return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * When called via device_online() without configuring the online_type,
	 * we want to default to MMOP_ONLINE.
	 */
	if (mem->online_type == MMOP_OFFLINE)
		mem->online_type = MMOP_ONLINE;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	mem->online_type = MMOP_OFFLINE;

	return ret;
}

static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}
static ssize_t state_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (online_type < 0)
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	switch (online_type) {
	case MMOP_ONLINE_KERNEL:
	case MMOP_ONLINE_MOVABLE:
	case MMOP_ONLINE:
		/* mem->online_type is protected by device_hotplug_lock */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case MMOP_OFFLINE:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

	unlock_device_hotplug();

	if (ret < 0)
		return ret;
	if (ret)
		return -EINVAL;

	return count;
}
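/*
 * Userspace usage note (illustrative; see
 * Documentation/admin-guide/mm/memory-hotplug.rst for the authoritative ABI):
 * the state of block N is changed by writing one of the strings from
 * online_type_to_str[] above, e.g.
 *
 *	# echo online_movable > /sys/devices/system/memory/memoryN/state
 *	# echo offline > /sys/devices/system/memory/memoryN/state
 */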
/*
 * Legacy interface that we cannot remove: s390x exposes the storage increment
 * covered by a memory block, allowing for identifying which memory blocks
 * comprise a storage increment. Since a memory block spans complete
 * storage increments nowadays, this interface is basically unused. Other
 * architectures never exposed anything other than 0.
 */
static ssize_t phys_device_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);

	return sysfs_emit(buf, "%d\n",
			  arch_get_memory_phys_device(start_pfn));
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int print_allowed_zone(char *buf, int len, int nid,
			      struct memory_group *group,
			      unsigned long start_pfn, unsigned long nr_pages,
			      int online_type, struct zone *default_zone)
{
	struct zone *zone;

	zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
	if (zone == default_zone)
		return 0;

	return sysfs_emit_at(buf, len, " %s", zone->name);
}

static ssize_t valid_zones_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct memory_group *group = mem->group;
	struct zone *default_zone;
	int nid = mem->nid;
	int len = 0;

	/*
	 * Check the existing zone. Make sure that we do that only on the
	 * online nodes, otherwise the page_zone is not reliable.
	 */
	if (mem->state == MEM_ONLINE) {
		/*
		 * If !mem->zone, the memory block spans multiple zones and
		 * cannot get offlined.
		 */
		default_zone = mem->zone;
		if (!default_zone)
			return sysfs_emit(buf, "%s\n", "none");
		len += sysfs_emit_at(buf, len, "%s", default_zone->name);
		goto out;
	}

	default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
					  start_pfn, nr_pages);

	len += sysfs_emit_at(buf, len, "%s", default_zone->name);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_KERNEL, default_zone);
	len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
				  MMOP_ONLINE_MOVABLE, default_zone);
out:
	len += sysfs_emit_at(buf, len, "\n");
	return len;
}
static DEVICE_ATTR_RO(valid_zones);
#endif

static DEVICE_ATTR_RO(phys_index);
static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RO(phys_device);
static DEVICE_ATTR_RO(removable);

/*
 * Show the memory block size (shared by all memory blocks).
 */
static ssize_t block_size_bytes_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
}

static DEVICE_ATTR_RO(block_size_bytes);

/*
 * Memory auto online policy.
 */

static ssize_t auto_online_blocks_show(struct device *dev,
				       struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%s\n",
			  online_type_to_str[mhp_default_online_type]);
}

static ssize_t auto_online_blocks_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	const int online_type = mhp_online_type_from_str(buf);

	if (online_type < 0)
		return -EINVAL;

	mhp_default_online_type = online_type;
	return count;
}

static DEVICE_ATTR_RW(auto_online_blocks);

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace. The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	ret = kstrtoull(buf, 0, &phys_addr);
	if (ret)
		return ret;

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	nid = memory_add_physaddr_to_nid(phys_addr);
	ret = __add_memory(nid, phys_addr,
			   MIN_MEMORY_BLOCK_SIZE * sections_per_block,
			   MHP_NONE);

	if (ret)
		goto out;

	ret = count;
out:
	unlock_device_hotplug();
	return ret;
}

static DEVICE_ATTR_WO(probe);
#endif
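/*
 * Usage sketch (CONFIG_ARCH_MEMORY_PROBE only; the address is illustrative):
 * writing the start address of a not-yet-added range, aligned to
 * memory_block_size_bytes(), requests that one memory block be added, e.g.
 *
 *	# echo 0x100000000 > /sys/devices/system/memory/probe
 */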
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t soft_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = soft_offline_page(pfn, 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t hard_offline_page_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t count)
{
	int ret;
	u64 pfn;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, MF_SW_SIMULATED);
	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret ? ret : count;
}

static DEVICE_ATTR_WO(soft_offline_page);
static DEVICE_ATTR_WO(hard_offline_page);
#endif
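/*
 * Usage note (addresses are illustrative): both attributes take a physical
 * address, not a PFN - the handlers above shift the written value right by
 * PAGE_SHIFT. Soft offlining tries to migrate the page contents first; hard
 * offlining injects a simulated memory failure and may kill processes
 * mapping the page.
 *
 *	# echo 0x1fa000000 > /sys/devices/system/memory/soft_offline_page
 *	# echo 0x1fa000000 > /sys/devices/system/memory/hard_offline_page
 */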
/* See phys_device_show(). */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned memory block device is acquired.
 *
 * Called under device_hotplug_lock.
 */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
	struct memory_block *mem;

	mem = xa_load(&memory_blocks, block_id);
	if (mem)
		get_device(&mem->dev);
	return mem;
}

/*
 * Called under device_hotplug_lock.
 */
struct memory_block *find_memory_block(unsigned long section_nr)
{
	unsigned long block_id = memory_block_id(section_nr);

	return find_memory_block_by_id(block_id);
}

static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
#ifdef CONFIG_MEMORY_HOTREMOVE
	&dev_attr_valid_zones.attr,
#endif
	NULL
};

static const struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};

static int __add_memory_block(struct memory_block *memory)
{
	int ret;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	ret = device_register(&memory->dev);
	if (ret) {
		put_device(&memory->dev);
		return ret;
	}
	ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
			      GFP_KERNEL));
	if (ret)
		device_unregister(&memory->dev);

	return ret;
}

static struct zone *early_node_zone_for_memory_block(struct memory_block *mem,
						     int nid)
{
	const unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
	const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct zone *zone, *matching_zone = NULL;
	pg_data_t *pgdat = NODE_DATA(nid);
	int i;

	/*
	 * This logic only works for early memory, when the applicable zones
	 * already span the memory block. We don't expect overlapping zones on
	 * a single node for early memory. So if we're told that some PFNs
	 * of a node fall into this memory block, we can assume that all node
	 * zones that intersect with the memory block are actually applicable.
	 * No need to look at the memmap.
	 */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		zone = pgdat->node_zones + i;
		if (!populated_zone(zone))
			continue;
		if (!zone_intersects(zone, start_pfn, nr_pages))
			continue;
		if (!matching_zone) {
			matching_zone = zone;
			continue;
		}
		/* Spans multiple zones ... */
		matching_zone = NULL;
		break;
	}
	return matching_zone;
}

#ifdef CONFIG_NUMA
/**
 * memory_block_add_nid() - Indicate that system RAM falling into this memory
 *			    block device (partially) belongs to the given node.
 * @mem: The memory block device.
 * @nid: The node id.
 * @context: The memory initialization context.
 *
 * Indicate that system RAM falling into this memory block (partially) belongs
 * to the given node. If the context indicates ("early") that we are adding the
 * node during node device subsystem initialization, this will also properly
 * set/adjust mem->zone based on the zone ranges of the given node.
 */
void memory_block_add_nid(struct memory_block *mem, int nid,
			  enum meminit_context context)
{
	if (context == MEMINIT_EARLY && mem->nid != nid) {
		/*
		 * For early memory we have to determine the zone when setting
		 * the node id and handle multiple nodes spanning a single
		 * memory block by indicating via zone == NULL that we're not
		 * dealing with a single zone. So if we're setting the node id
		 * the first time, determine if there is a single zone. If we're
		 * setting the node id a second time to a different node,
		 * invalidate the single detected zone.
		 */
		if (mem->nid == NUMA_NO_NODE)
			mem->zone = early_node_zone_for_memory_block(mem, nid);
		else
			mem->zone = NULL;
	}

	/*
	 * If this memory block spans multiple nodes, we only indicate
	 * the last processed node. If we span multiple nodes (not applicable
	 * to hotplugged memory), zone == NULL will prohibit memory offlining
	 * and consequently unplug.
	 */
	mem->nid = nid;
}
#endif
static int add_memory_block(unsigned long block_id, unsigned long state,
			    unsigned long nr_vmemmap_pages,
			    struct memory_group *group)
{
	struct memory_block *mem;
	int ret = 0;

	mem = find_memory_block_by_id(block_id);
	if (mem) {
		put_device(&mem->dev);
		return -EEXIST;
	}
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	mem->start_section_nr = block_id * sections_per_block;
	mem->state = state;
	mem->nid = NUMA_NO_NODE;
	mem->nr_vmemmap_pages = nr_vmemmap_pages;
	INIT_LIST_HEAD(&mem->group_next);

#ifndef CONFIG_NUMA
	if (state == MEM_ONLINE)
		/*
		 * MEM_ONLINE at this point implies early memory. With NUMA,
		 * we'll determine the zone when setting the node id via
		 * memory_block_add_nid(). Memory hotplug updates the zone
		 * manually when memory onlining/offlining succeeds.
		 */
		mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE);
#endif /* CONFIG_NUMA */

	ret = __add_memory_block(mem);
	if (ret)
		return ret;

	if (group) {
		mem->group = group;
		list_add(&mem->group_next, &group->memory_blocks);
	}

	return 0;
}

static int __init add_boot_memory_block(unsigned long base_section_nr)
{
	int section_count = 0;
	unsigned long nr;

	for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
	     nr++)
		if (present_section_nr(nr))
			section_count++;

	if (section_count == 0)
		return 0;
	return add_memory_block(memory_block_id(base_section_nr),
				MEM_ONLINE, 0, NULL);
}

static int add_hotplug_memory_block(unsigned long block_id,
				    unsigned long nr_vmemmap_pages,
				    struct memory_group *group)
{
	return add_memory_block(block_id, MEM_OFFLINE, nr_vmemmap_pages, group);
}

static void remove_memory_block(struct memory_block *memory)
{
	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
		return;

	WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);

	if (memory->group) {
		list_del(&memory->group_next);
		memory->group = NULL;
	}

	/* drop the ref. we got via find_memory_block() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}

/*
 * Create memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * will be initialized as offline.
 *
 * Called under device_hotplug_lock.
 */
int create_memory_block_devices(unsigned long start, unsigned long size,
				unsigned long vmemmap_pages,
				struct memory_group *group)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return -EINVAL;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		ret = add_hotplug_memory_block(block_id, vmemmap_pages, group);
		if (ret)
			break;
	}
	if (ret) {
		end_block_id = block_id;
		for (block_id = start_block_id; block_id != end_block_id;
		     block_id++) {
			mem = find_memory_block_by_id(block_id);
			if (WARN_ON_ONCE(!mem))
				continue;
			remove_memory_block(mem);
		}
	}
	return ret;
}

/*
 * Remove memory block devices for the given memory area. Start and size
 * have to be aligned to memory block granularity. Memory block devices
 * have to be offline.
 *
 * Called under device_hotplug_lock.
 */
void remove_memory_block_devices(unsigned long start, unsigned long size)
{
	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
	const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
	struct memory_block *mem;
	unsigned long block_id;

	if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
			 !IS_ALIGNED(size, memory_block_size_bytes())))
		return;

	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (WARN_ON_ONCE(!mem))
			continue;
		num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem));
		unregister_memory_block_under_nodes(mem);
		remove_memory_block(mem);
	}
}
static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
	NULL
};

static const struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

/*
 * Initialize the sysfs support for memory devices. At the time this function
 * is called, we cannot have concurrent creation/deletion of memory block
 * devices, so the device_hotplug_lock is not needed.
 */
void __init memory_dev_init(void)
{
	int ret;
	unsigned long block_sz, nr;

	/* Validate the configured memory block size */
	block_sz = memory_block_size_bytes();
	if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
		panic("Memory block size not suitable: 0x%lx\n", block_sz);
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		panic("%s() failed to register subsystem: %d\n", __func__, ret);

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (nr = 0; nr <= __highest_present_section_nr;
	     nr += sections_per_block) {
		ret = add_boot_memory_block(nr);
		if (ret)
			panic("%s() failed to add memory block: %d\n", __func__,
			      ret);
	}
}

/**
 * walk_memory_blocks - walk through all present memory blocks overlapped
 *			by the range [start, start + size)
 *
 * @start: start address of the memory range
 * @size: size of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks overlapped by the
 * range [start, start + size), calling func on each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 *
 * Called under device_hotplug_lock.
 */
int walk_memory_blocks(unsigned long start, unsigned long size,
		       void *arg, walk_memory_blocks_func_t func)
{
	const unsigned long start_block_id = phys_to_block_id(start);
	const unsigned long end_block_id = phys_to_block_id(start + size - 1);
	struct memory_block *mem;
	unsigned long block_id;
	int ret = 0;

	if (!size)
		return 0;

	for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
		mem = find_memory_block_by_id(block_id);
		if (!mem)
			continue;

		ret = func(mem, arg);
		put_device(&mem->dev);
		if (ret)
			break;
	}
	return ret;
}
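/*
 * Example (illustrative sketch; the "foo_" callback is hypothetical): a
 * caller that wants to verify that every block overlapping a range is
 * offline could do, while holding the device_hotplug_lock:
 *
 *	static int foo_check_offline(struct memory_block *mem, void *arg)
 *	{
 *		return mem->state == MEM_OFFLINE ? 0 : -EBUSY;
 *	}
 *
 *	lock_device_hotplug();
 *	rc = walk_memory_blocks(start, size, NULL, foo_check_offline);
 *	unlock_device_hotplug();
 */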
struct for_each_memory_block_cb_data {
	walk_memory_blocks_func_t func;
	void *arg;
};

static int for_each_memory_block_cb(struct device *dev, void *data)
{
	struct memory_block *mem = to_memory_block(dev);
	struct for_each_memory_block_cb_data *cb_data = data;

	return cb_data->func(mem, cb_data->arg);
}

/**
 * for_each_memory_block - walk through all present memory blocks
 *
 * @arg: argument passed to func
 * @func: callback for each memory block walked
 *
 * This function walks through all present memory blocks, calling func on
 * each memory block.
 *
 * In case func() returns an error, walking is aborted and the error is
 * returned.
 */
int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
{
	struct for_each_memory_block_cb_data cb_data = {
		.func = func,
		.arg = arg,
	};

	return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
				for_each_memory_block_cb);
}

/*
 * This is an internal helper to unify allocation and initialization of
 * memory groups. Note that the passed memory group will be copied to a
 * dynamically allocated memory group. After this call, the passed
 * memory group should no longer be used.
 */
static int memory_group_register(struct memory_group group)
{
	struct memory_group *new_group;
	uint32_t mgid;
	int ret;

	if (!node_possible(group.nid))
		return -EINVAL;

	new_group = kzalloc(sizeof(group), GFP_KERNEL);
	if (!new_group)
		return -ENOMEM;
	*new_group = group;
	INIT_LIST_HEAD(&new_group->memory_blocks);

	ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
		       GFP_KERNEL);
	if (ret) {
		kfree(new_group);
		return ret;
	} else if (group.is_dynamic) {
		xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
	}
	return mgid;
}

/**
 * memory_group_register_static() - Register a static memory group.
 * @nid: The node id.
 * @max_pages: The maximum number of pages we'll have in this static memory
 *	       group.
 *
 * Register a new static memory group and return the memory group id.
 * All memory in the group belongs to a single unit, such as a DIMM. All
 * memory belonging to a static memory group is added in one go to be removed
 * in one go -- it's static.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
 * returns the new memory group id.
 */
int memory_group_register_static(int nid, unsigned long max_pages)
{
	struct memory_group group = {
		.nid = nid,
		.s = {
			.max_pages = max_pages,
		},
	};

	if (!max_pages)
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_static);

/**
 * memory_group_register_dynamic() - Register a dynamic memory group.
 * @nid: The node id.
 * @unit_pages: Unit in pages in which memory is added/removed in this dynamic
 *		memory group.
 *
 * Register a new dynamic memory group and return the memory group id.
 * Memory within a dynamic memory group is added/removed dynamically
 * in unit_pages.
 *
 * Returns an error if out of memory, if the node id is invalid, if no new
 * memory groups can be registered, or if unit_pages is invalid (0, not a
 * power of two, smaller than a single memory block). Otherwise, returns the
 * new memory group id.
 */
int memory_group_register_dynamic(int nid, unsigned long unit_pages)
{
	struct memory_group group = {
		.nid = nid,
		.is_dynamic = true,
		.d = {
			.unit_pages = unit_pages,
		},
	};

	if (!unit_pages || !is_power_of_2(unit_pages) ||
	    unit_pages < PHYS_PFN(memory_block_size_bytes()))
		return -EINVAL;
	return memory_group_register(group);
}
EXPORT_SYMBOL_GPL(memory_group_register_dynamic);
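/*
 * Usage note (a sketch based on how in-tree hotplug drivers are commonly
 * wired up, not something this file mandates): a driver registers a group
 * once, e.g. mgid = memory_group_register_dynamic(nid, unit_pages), and then
 * passes the returned mgid in place of a node id when adding memory, e.g.
 * add_memory_driver_managed(mgid, addr, size, name,
 * MHP_NID_IS_MGID | MHP_MERGE_RESOURCE), so that all hot(un)plugged ranges
 * are accounted to that group.
 */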
/**
 * memory_group_unregister() - Unregister a memory group.
 * @mgid: the memory group id
 *
 * Unregister a memory group. If any memory block still belongs to this
 * memory group, unregistering will fail.
 *
 * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
 * memory blocks still belong to this memory group and returns 0 if
 * unregistering succeeded.
 */
int memory_group_unregister(int mgid)
{
	struct memory_group *group;

	if (mgid < 0)
		return -EINVAL;

	group = xa_load(&memory_groups, mgid);
	if (!group)
		return -EINVAL;
	if (!list_empty(&group->memory_blocks))
		return -EBUSY;
	xa_erase(&memory_groups, mgid);
	kfree(group);
	return 0;
}
EXPORT_SYMBOL_GPL(memory_group_unregister);

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * lookup a memory group. We don't care about locking, as we don't expect a
 * memory group to get unregistered while adding memory to it -- because
 * the group and the memory are managed by the same driver.
 */
struct memory_group *memory_group_find_by_id(int mgid)
{
	return xa_load(&memory_groups, mgid);
}

/*
 * This is an internal helper only to be used in core memory hotplug code to
 * walk all dynamic memory groups excluding a given memory group, either
 * belonging to a specific node, or belonging to any node.
 */
int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
			       struct memory_group *excluded, void *arg)
{
	struct memory_group *group;
	unsigned long index;
	int ret = 0;

	xa_for_each_marked(&memory_groups, index, group,
			   MEMORY_GROUP_MARK_DYNAMIC) {
		if (group == excluded)
			continue;
#ifdef CONFIG_NUMA
		if (nid != NUMA_NO_NODE && group->nid != nid)
			continue;
#endif /* CONFIG_NUMA */
		ret = func(group, arg);
		if (ret)
			break;
	}
	return ret;
}

#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
void memblk_nr_poison_inc(unsigned long pfn)
{
	const unsigned long block_id = pfn_to_block_id(pfn);
	struct memory_block *mem = find_memory_block_by_id(block_id);

	if (mem)
		atomic_long_inc(&mem->nr_hwpoison);
}

void memblk_nr_poison_sub(unsigned long pfn, long i)
{
	const unsigned long block_id = pfn_to_block_id(pfn);
	struct memory_block *mem = find_memory_block_by_id(block_id);

	if (mem)
		atomic_long_sub(i, &mem->nr_hwpoison);
}

static unsigned long memblk_nr_poison(struct memory_block *mem)
{
	return atomic_long_read(&mem->nr_hwpoison);
}
#endif