/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/stat.h>
#include <linux/slab.h>

#include <linux/atomic.h>
#include <asm/uaccess.h>

static DEFINE_MUTEX(mem_sysfs_mutex);

#define MEMORY_CLASS_NAME	"memory"

#define to_memory_block(dev) container_of(dev, struct memory_block, dev)

static int sections_per_block;

static inline int base_memory_block_id(int section_nr)
{
	return section_nr / sections_per_block;
}

static int memory_subsys_online(struct device *dev);
static int memory_subsys_offline(struct device *dev);

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
	.online = memory_subsys_online,
	.offline = memory_subsys_offline,
};

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

int register_memory_isolate_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(register_memory_isolate_notifier);

void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);

static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	kfree(mem);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}

static unsigned long get_memory_block_size(void)
{
	unsigned long block_sz;

	block_sz = memory_block_size_bytes();

	/* Validate block_sz is a power of 2 and not less than section size */
	if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
		WARN_ON(1);
		block_sz = MIN_MEMORY_BLOCK_SIZE;
	}

	return block_sz;
}
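/*
 * Example (illustrative only, not part of this file): a client of the
 * notifier chains above registers a callback that is invoked around
 * online/offline transitions.  The callback receives one of the MEM_*
 * action codes and a struct memory_notify describing the affected
 * range.  A hypothetical driver might do:
 *
 *	static int foo_memory_callback(struct notifier_block *nb,
 *				       unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		switch (action) {
 *		case MEM_GOING_OFFLINE:
 *			// veto the offline if pages in the range are busy
 *			if (foo_range_busy(mn->start_pfn, mn->nr_pages))
 *				return notifier_from_errno(-EBUSY);
 *			break;
 *		case MEM_ONLINE:
 *		case MEM_OFFLINE:
 *			foo_rescan();
 *			break;
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block foo_memory_nb = {
 *		.notifier_call = foo_memory_callback,
 *	};
 *
 *	register_memory_notifier(&foo_memory_nb);
 *
 * foo_range_busy() and foo_rescan() are made-up names; the callback
 * shape, the MEM_* actions and notifier_from_errno() are the real
 * interface.
 */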
/*
 * Show the first physical section index (number) of this memory block.
 */
static ssize_t show_mem_start_phys_index(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

static ssize_t show_mem_end_phys_index(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	unsigned long phys_index;

	phys_index = mem->end_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

/*
 * Show whether the memory block is likely to be hot-removable
 */
static ssize_t show_mem_removable(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	unsigned long i, pfn;
	int ret = 1;
	struct memory_block *mem = to_memory_block(dev);

	for (i = 0; i < sections_per_block; i++) {
		if (!present_section_nr(mem->start_section_nr + i))
			continue;
		pfn = section_nr_to_pfn(mem->start_section_nr + i);
		ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
	}

	return sprintf(buf, "%d\n", ret);
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t show_mem_state(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	ssize_t len = 0;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		len = sprintf(buf, "online\n");
		break;
	case MEM_OFFLINE:
		len = sprintf(buf, "offline\n");
		break;
	case MEM_GOING_OFFLINE:
		len = sprintf(buf, "going-offline\n");
		break;
	default:
		len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
				mem->state);
		WARN_ON(1);
		break;
	}

	return len;
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

int memory_isolate_notify(unsigned long val, void *v)
{
	return atomic_notifier_call_chain(&memory_isolate_chain, val, v);
}

/*
 * The probe routines leave the pages reserved, just as the bootmem code does.
 * Make sure they're still that way.
 */
static bool pages_correctly_reserved(unsigned long start_pfn)
{
	int i, j;
	struct page *page;
	unsigned long pfn = start_pfn;

	/*
	 * memmap between sections is not contiguous except with
	 * SPARSEMEM_VMEMMAP. We lookup the page once per section
	 * and assume memmap is contiguous within each section
	 */
	for (i = 0; i < sections_per_block; i++, pfn += PAGES_PER_SECTION) {
		if (WARN_ON_ONCE(!pfn_valid(pfn)))
			return false;
		page = pfn_to_page(pfn);

		for (j = 0; j < PAGES_PER_SECTION; j++) {
			if (PageReserved(page + j))
				continue;

			printk(KERN_WARNING "section number %ld page number %d "
				"not reserved, was it already online?\n",
				pfn_to_section_nr(pfn), j);

			return false;
		}
	}

	return true;
}
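/*
 * A worked example of the section/block arithmetic used throughout this
 * file (the exact sizes are configuration-dependent; these numbers are
 * just for illustration).  With 128 MiB sections (MIN_MEMORY_BLOCK_SIZE)
 * and a 2 GiB memory_block_size_bytes():
 *
 *	sections_per_block = 2 GiB / 128 MiB = 16
 *
 * The block containing section 42 is base_memory_block_id(42) = 42/16 = 2,
 * which appears in sysfs as /sys/devices/system/memory/memory2 and spans
 * sections 32..47, i.e. physical addresses [4 GiB, 6 GiB).  Its first pfn
 * is section_nr_to_pfn(32), and each block action below operates on
 * PAGES_PER_SECTION * sections_per_block pages.
 */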
/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(unsigned long phys_index, unsigned long action, int online_type)
{
	unsigned long start_pfn;
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct page *first_page;
	int ret;

	first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);
	start_pfn = page_to_pfn(first_page);

	switch (action) {
	case MEM_ONLINE:
		if (!pages_correctly_reserved(start_pfn))
			return -EBUSY;

		ret = online_pages(start_pfn, nr_pages, online_type);
		break;
	case MEM_OFFLINE:
		ret = offline_pages(start_pfn, nr_pages);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, phys_index, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
	int ret = 0;

	if (mem->state != from_state_req)
		return -EINVAL;

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem->start_section_nr, to_state,
				mem->online_type);

	mem->state = ret ? from_state_req : to_state;

	return ret;
}

/* The device lock serializes operations on memory_subsys_[online|offline] */
static int memory_subsys_online(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret;

	if (mem->state == MEM_ONLINE)
		return 0;

	/*
	 * If we are called from store_mem_state(), online_type will be
	 * set to a value >= 0.  Otherwise we were called from the device
	 * online attribute and need to set the online_type.
	 */
	if (mem->online_type < 0)
		mem->online_type = ONLINE_KEEP;

	ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);

	/* clear online_type */
	mem->online_type = -1;

	return ret;
}

static int memory_subsys_offline(struct device *dev)
{
	struct memory_block *mem = to_memory_block(dev);

	if (mem->state == MEM_OFFLINE)
		return 0;

	return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
}
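/*
 * Illustration (not part of this file): other kernel code changes a
 * block's state through the driver core rather than by calling
 * memory_block_change_state() directly, so that dev->offline and the
 * bus ->online/->offline callbacks above stay in sync.  A minimal
 * sketch, assuming the caller already holds a reference on the block:
 *
 *	int online_one_block(struct memory_block *mem)
 *	{
 *		int ret;
 *
 *		lock_device_hotplug();
 *		ret = device_online(&mem->dev);	// lands in memory_subsys_online()
 *		unlock_device_hotplug();
 *		return ret;
 *	}
 *
 * online_one_block() is a made-up wrapper; device_online(),
 * device_offline() and lock_device_hotplug() are the real entry points,
 * as used by store_mem_state() below.
 */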
static ssize_t
store_mem_state(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct memory_block *mem = to_memory_block(dev);
	int ret, online_type;

	ret = lock_device_hotplug_sysfs();
	if (ret)
		return ret;

	if (!strncmp(buf, "online_kernel", min_t(int, count, 13)))
		online_type = ONLINE_KERNEL;
	else if (!strncmp(buf, "online_movable", min_t(int, count, 14)))
		online_type = ONLINE_MOVABLE;
	else if (!strncmp(buf, "online", min_t(int, count, 6)))
		online_type = ONLINE_KEEP;
	else if (!strncmp(buf, "offline", min_t(int, count, 7)))
		online_type = -1;
	else {
		ret = -EINVAL;
		goto err;
	}

	switch (online_type) {
	case ONLINE_KERNEL:
	case ONLINE_MOVABLE:
	case ONLINE_KEEP:
		/*
		 * mem->online_type is not protected so there can be a
		 * race here.  However, when racing online, the first
		 * will succeed and the second will just return as the
		 * block will already be online.  The online type
		 * could be either one, but that is expected.
		 */
		mem->online_type = online_type;
		ret = device_online(&mem->dev);
		break;
	case -1:
		ret = device_offline(&mem->dev);
		break;
	default:
		ret = -EINVAL; /* should never happen */
	}

err:
	unlock_device_hotplug();

	if (ret)
		return ret;
	return count;
}

/*
 * phys_device is a bad name for this.  What I really want
 * is a way to differentiate between memory ranges that
 * are part of physical devices that constitute
 * a complete removable unit or fru.
 * i.e. do these ranges belong to the same physical device,
 * s.t. if I offline all of these sections I can then
 * remove the physical device?
 */
static ssize_t show_phys_device(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem = to_memory_block(dev);
	return sprintf(buf, "%d\n", mem->phys_device);
}

static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
static DEVICE_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state);
static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL);
static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL);

/*
 * Block size attribute stuff
 */
static ssize_t
print_block_size(struct device *dev, struct device_attribute *attr,
		 char *buf)
{
	return sprintf(buf, "%lx\n", get_memory_block_size());
}

static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL);

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.  The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t
memory_probe_store(struct device *dev, struct device_attribute *attr,
		   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid;
	int i, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	phys_addr = simple_strtoull(buf, NULL, 0);

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	for (i = 0; i < sections_per_block; i++) {
		nid = memory_add_physaddr_to_nid(phys_addr);
		ret = add_memory(nid, phys_addr,
				 PAGES_PER_SECTION << PAGE_SHIFT);
		if (ret)
			goto out;

		phys_addr += MIN_MEMORY_BLOCK_SIZE;
	}

	ret = count;
out:
	return ret;
}

static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store);
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t
store_soft_offline_page(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	if (!pfn_valid(pfn))
		return -ENXIO;
	ret = soft_offline_page(pfn_to_page(pfn), 0);
	return ret == 0 ? count : ret;
}
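/*
 * Usage note (illustrative): both offline-page attributes take a
 * *physical address*, not a pfn; the value is shifted right by
 * PAGE_SHIFT above.  From userspace, assuming 0x200000000 is a valid
 * address on the running system, one would write:
 *
 *	echo 0x200000000 > /sys/devices/system/memory/soft_offline_page
 *	echo 0x200000000 > /sys/devices/system/memory/hard_offline_page
 *
 * Soft offline migrates the page's contents away first and never kills
 * anything; hard offline (below) treats the page as hardware-poisoned
 * and may kill processes mapping it.
 */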
/*
 * Forcibly offline a page, including killing processes.
 */
static ssize_t
store_hard_offline_page(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (kstrtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, 0, 0);
	return ret ? ret : count;
}

static DEVICE_ATTR(soft_offline_page, S_IWUSR, NULL, store_soft_offline_page);
static DEVICE_ATTR(hard_offline_page, S_IWUSR, NULL, store_hard_offline_page);
#endif

/*
 * Note that phys_device is optional.  It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned object is held and the reference for the
 * hinted object is released.
 */
struct memory_block *find_memory_block_hinted(struct mem_section *section,
					      struct memory_block *hint)
{
	int block_id = base_memory_block_id(__section_nr(section));
	struct device *hintdev = hint ? &hint->dev : NULL;
	struct device *dev;

	dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev);
	if (hint)
		put_device(&hint->dev);
	if (!dev)
		return NULL;
	return to_memory_block(dev);
}

/*
 * For now, we have a linear search to go find the appropriate
 * memory_block corresponding to a particular phys_index. If
 * this gets to be a real problem, we can always use a radix
 * tree or something here.
 *
 * This could be made generic for all device subsystems.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	return find_memory_block_hinted(section, NULL);
}

static struct attribute *memory_memblk_attrs[] = {
	&dev_attr_phys_index.attr,
	&dev_attr_end_phys_index.attr,
	&dev_attr_state.attr,
	&dev_attr_phys_device.attr,
	&dev_attr_removable.attr,
	NULL
};

static struct attribute_group memory_memblk_attr_group = {
	.attrs = memory_memblk_attrs,
};

static const struct attribute_group *memory_memblk_attr_groups[] = {
	&memory_memblk_attr_group,
	NULL,
};

/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;
	memory->dev.groups = memory_memblk_attr_groups;
	memory->dev.offline = memory->state == MEM_OFFLINE;

	return device_register(&memory->dev);
}

static int init_memory_block(struct memory_block **memory,
			     struct mem_section *section, unsigned long state)
{
	struct memory_block *mem;
	unsigned long start_pfn;
	int scn_nr;
	int ret = 0;

	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	scn_nr = __section_nr(section);
	mem->start_section_nr =
			base_memory_block_id(scn_nr) * sections_per_block;
	mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
	mem->state = state;
	mem->section_count++;
	start_pfn = section_nr_to_pfn(mem->start_section_nr);
	mem->phys_device = arch_get_memory_phys_device(start_pfn);

	ret = register_memory(mem);

	*memory = mem;
	return ret;
}
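/*
 * Illustration (not part of this file): find_memory_block() returns its
 * result with a device reference held, so the caller must drop it with
 * put_device() when done.  A hypothetical lookup might be:
 *
 *	struct mem_section *section = __nr_to_section(section_nr);
 *	struct memory_block *mem = find_memory_block(section);
 *
 *	if (mem) {
 *		pr_info("section %lu is in block %u, state %lu\n",
 *			section_nr, mem->dev.id, mem->state);
 *		put_device(&mem->dev);
 *	}
 *
 * section_nr is an assumed local here; __nr_to_section() and the
 * reference rule are the real interface.
 */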
static int add_memory_block(int base_section_nr)
{
	struct memory_block *mem;
	int i, ret, section_count = 0, section_nr;

	for (i = base_section_nr;
	     (i < base_section_nr + sections_per_block) && i < NR_MEM_SECTIONS;
	     i++) {
		if (!present_section_nr(i))
			continue;
		if (section_count == 0)
			section_nr = i;
		section_count++;
	}

	if (section_count == 0)
		return 0;
	ret = init_memory_block(&mem, __nr_to_section(section_nr), MEM_ONLINE);
	if (ret)
		return ret;
	mem->section_count = section_count;
	return 0;
}


/*
 * We need an interface for the VM to add new memory regions,
 * but without onlining them.
 */
int register_new_memory(int nid, struct mem_section *section)
{
	int ret = 0;
	struct memory_block *mem;

	mutex_lock(&mem_sysfs_mutex);

	mem = find_memory_block(section);
	if (mem) {
		mem->section_count++;
		put_device(&mem->dev);
	} else {
		ret = init_memory_block(&mem, section, MEM_OFFLINE);
		if (ret)
			goto out;
	}

	if (mem->section_count == sections_per_block)
		ret = register_mem_sect_under_node(mem, nid);
out:
	mutex_unlock(&mem_sysfs_mutex);
	return ret;
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static void
unregister_memory(struct memory_block *memory)
{
	BUG_ON(memory->dev.bus != &memory_subsys);

	/* drop the ref. we got in remove_memory_block() */
	put_device(&memory->dev);
	device_unregister(&memory->dev);
}

static int remove_memory_block(unsigned long node_id,
			       struct mem_section *section, int phys_device)
{
	struct memory_block *mem;

	mutex_lock(&mem_sysfs_mutex);
	mem = find_memory_block(section);
	unregister_mem_sect_under_nodes(mem, __section_nr(section));

	mem->section_count--;
	if (mem->section_count == 0)
		unregister_memory(mem);
	else
		put_device(&mem->dev);

	mutex_unlock(&mem_sysfs_mutex);
	return 0;
}

int unregister_memory_section(struct mem_section *section)
{
	if (!present_section(section))
		return -EINVAL;

	return remove_memory_block(0, section, 0);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

/* return true if the memory block is offlined, otherwise, return false */
bool is_memblock_offlined(struct memory_block *mem)
{
	return mem->state == MEM_OFFLINE;
}

static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	NULL
};

static struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};
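/*
 * Illustration (not part of this file): a memory block appears in sysfs
 * only once, but its section_count tracks how many present sections it
 * currently represents.  Conceptually, hot-adding a fully populated
 * block makes one register_new_memory() call per section:
 *
 *	for (i = 0; i < sections_per_block; i++)
 *		register_new_memory(nid, __nr_to_section(base_section_nr + i));
 *
 * The first call creates the (offline) block device, the rest only bump
 * section_count, and the node link is created once the count reaches
 * sections_per_block.  remove_memory_block() walks this back down and
 * unregisters the device when the count hits zero.  base_section_nr,
 * nid and i are assumed locals for the sketch.
 */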
/*
 * Initialize the sysfs support for memory devices...
 */
int __init memory_dev_init(void)
{
	unsigned int i;
	int ret;
	int err;
	unsigned long block_sz;

	ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
	if (ret)
		goto out;

	block_sz = get_memory_block_size();
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	mutex_lock(&mem_sysfs_mutex);
	for (i = 0; i < NR_MEM_SECTIONS; i += sections_per_block) {
		err = add_memory_block(i);
		if (!ret)
			ret = err;
	}
	mutex_unlock(&mem_sysfs_mutex);

out:
	if (ret)
		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
	return ret;
}