1 /* 2 * Memory subsystem support 3 * 4 * Written by Matt Tolentino <matthew.e.tolentino@intel.com> 5 * Dave Hansen <haveblue@us.ibm.com> 6 * 7 * This file provides the necessary infrastructure to represent 8 * a SPARSEMEM-memory-model system's physical memory in /sysfs. 9 * All arch-independent code that assumes MEMORY_HOTPLUG requires 10 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c. 11 */ 12 13 #include <linux/module.h> 14 #include <linux/init.h> 15 #include <linux/topology.h> 16 #include <linux/capability.h> 17 #include <linux/device.h> 18 #include <linux/memory.h> 19 #include <linux/memory_hotplug.h> 20 #include <linux/mm.h> 21 #include <linux/mutex.h> 22 #include <linux/stat.h> 23 #include <linux/slab.h> 24 25 #include <linux/atomic.h> 26 #include <asm/uaccess.h> 27 28 static DEFINE_MUTEX(mem_sysfs_mutex); 29 30 #define MEMORY_CLASS_NAME "memory" 31 32 #define to_memory_block(dev) container_of(dev, struct memory_block, dev) 33 34 static int sections_per_block; 35 36 static inline int base_memory_block_id(int section_nr) 37 { 38 return section_nr / sections_per_block; 39 } 40 41 static int memory_subsys_online(struct device *dev); 42 static int memory_subsys_offline(struct device *dev); 43 44 static struct bus_type memory_subsys = { 45 .name = MEMORY_CLASS_NAME, 46 .dev_name = MEMORY_CLASS_NAME, 47 .online = memory_subsys_online, 48 .offline = memory_subsys_offline, 49 }; 50 51 static BLOCKING_NOTIFIER_HEAD(memory_chain); 52 53 int register_memory_notifier(struct notifier_block *nb) 54 { 55 return blocking_notifier_chain_register(&memory_chain, nb); 56 } 57 EXPORT_SYMBOL(register_memory_notifier); 58 59 void unregister_memory_notifier(struct notifier_block *nb) 60 { 61 blocking_notifier_chain_unregister(&memory_chain, nb); 62 } 63 EXPORT_SYMBOL(unregister_memory_notifier); 64 65 static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain); 66 67 int register_memory_isolate_notifier(struct notifier_block *nb) 68 { 69 return atomic_notifier_chain_register(&memory_isolate_chain, nb); 70 } 71 EXPORT_SYMBOL(register_memory_isolate_notifier); 72 73 void unregister_memory_isolate_notifier(struct notifier_block *nb) 74 { 75 atomic_notifier_chain_unregister(&memory_isolate_chain, nb); 76 } 77 EXPORT_SYMBOL(unregister_memory_isolate_notifier); 78 79 static void memory_block_release(struct device *dev) 80 { 81 struct memory_block *mem = to_memory_block(dev); 82 83 kfree(mem); 84 } 85 86 unsigned long __weak memory_block_size_bytes(void) 87 { 88 return MIN_MEMORY_BLOCK_SIZE; 89 } 90 91 static unsigned long get_memory_block_size(void) 92 { 93 unsigned long block_sz; 94 95 block_sz = memory_block_size_bytes(); 96 97 /* Validate blk_sz is a power of 2 and not less than section size */ 98 if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) { 99 WARN_ON(1); 100 block_sz = MIN_MEMORY_BLOCK_SIZE; 101 } 102 103 return block_sz; 104 } 105 106 /* 107 * use this as the physical section index that this memsection 108 * uses. 109 */ 110 111 static ssize_t show_mem_start_phys_index(struct device *dev, 112 struct device_attribute *attr, char *buf) 113 { 114 struct memory_block *mem = to_memory_block(dev); 115 unsigned long phys_index; 116 117 phys_index = mem->start_section_nr / sections_per_block; 118 return sprintf(buf, "%08lx\n", phys_index); 119 } 120 121 static ssize_t show_mem_end_phys_index(struct device *dev, 122 struct device_attribute *attr, char *buf) 123 { 124 struct memory_block *mem = to_memory_block(dev); 125 unsigned long phys_index; 126 127 phys_index = mem->end_section_nr / sections_per_block; 128 return sprintf(buf, "%08lx\n", phys_index); 129 } 130 131 /* 132 * Show whether the section of memory is likely to be hot-removable 133 */ 134 static ssize_t show_mem_removable(struct device *dev, 135 struct device_attribute *attr, char *buf) 136 { 137 unsigned long i, pfn; 138 int ret = 1; 139 struct memory_block *mem = to_memory_block(dev); 140 141 for (i = 0; i < sections_per_block; i++) { 142 if (!present_section_nr(mem->start_section_nr + i)) 143 continue; 144 pfn = section_nr_to_pfn(mem->start_section_nr + i); 145 ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION); 146 } 147 148 return sprintf(buf, "%d\n", ret); 149 } 150 151 /* 152 * online, offline, going offline, etc. 153 */ 154 static ssize_t show_mem_state(struct device *dev, 155 struct device_attribute *attr, char *buf) 156 { 157 struct memory_block *mem = to_memory_block(dev); 158 ssize_t len = 0; 159 160 /* 161 * We can probably put these states in a nice little array 162 * so that they're not open-coded 163 */ 164 switch (mem->state) { 165 case MEM_ONLINE: 166 len = sprintf(buf, "online\n"); 167 break; 168 case MEM_OFFLINE: 169 len = sprintf(buf, "offline\n"); 170 break; 171 case MEM_GOING_OFFLINE: 172 len = sprintf(buf, "going-offline\n"); 173 break; 174 default: 175 len = sprintf(buf, "ERROR-UNKNOWN-%ld\n", 176 mem->state); 177 WARN_ON(1); 178 break; 179 } 180 181 return len; 182 } 183 184 int memory_notify(unsigned long val, void *v) 185 { 186 return blocking_notifier_call_chain(&memory_chain, val, v); 187 } 188 189 int memory_isolate_notify(unsigned long val, void *v) 190 { 191 return atomic_notifier_call_chain(&memory_isolate_chain, val, v); 192 } 193 194 /* 195 * The probe routines leave the pages reserved, just as the bootmem code does. 196 * Make sure they're still that way. 197 */ 198 static bool pages_correctly_reserved(unsigned long start_pfn) 199 { 200 int i, j; 201 struct page *page; 202 unsigned long pfn = start_pfn; 203 204 /* 205 * memmap between sections is not contiguous except with 206 * SPARSEMEM_VMEMMAP. We lookup the page once per section 207 * and assume memmap is contiguous within each section 208 */ 209 for (i = 0; i < sections_per_block; i++, pfn += PAGES_PER_SECTION) { 210 if (WARN_ON_ONCE(!pfn_valid(pfn))) 211 return false; 212 page = pfn_to_page(pfn); 213 214 for (j = 0; j < PAGES_PER_SECTION; j++) { 215 if (PageReserved(page + j)) 216 continue; 217 218 printk(KERN_WARNING "section number %ld page number %d " 219 "not reserved, was it already online?\n", 220 pfn_to_section_nr(pfn), j); 221 222 return false; 223 } 224 } 225 226 return true; 227 } 228 229 /* 230 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is 231 * OK to have direct references to sparsemem variables in here. 232 */ 233 static int 234 memory_block_action(unsigned long phys_index, unsigned long action, int online_type) 235 { 236 unsigned long start_pfn; 237 unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; 238 struct page *first_page; 239 int ret; 240 241 first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT); 242 start_pfn = page_to_pfn(first_page); 243 244 switch (action) { 245 case MEM_ONLINE: 246 if (!pages_correctly_reserved(start_pfn)) 247 return -EBUSY; 248 249 ret = online_pages(start_pfn, nr_pages, online_type); 250 break; 251 case MEM_OFFLINE: 252 ret = offline_pages(start_pfn, nr_pages); 253 break; 254 default: 255 WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: " 256 "%ld\n", __func__, phys_index, action, action); 257 ret = -EINVAL; 258 } 259 260 return ret; 261 } 262 263 static int memory_block_change_state(struct memory_block *mem, 264 unsigned long to_state, unsigned long from_state_req) 265 { 266 int ret = 0; 267 268 if (mem->state != from_state_req) 269 return -EINVAL; 270 271 if (to_state == MEM_OFFLINE) 272 mem->state = MEM_GOING_OFFLINE; 273 274 ret = memory_block_action(mem->start_section_nr, to_state, 275 mem->online_type); 276 277 mem->state = ret ? from_state_req : to_state; 278 279 return ret; 280 } 281 282 /* The device lock serializes operations on memory_subsys_[online|offline] */ 283 static int memory_subsys_online(struct device *dev) 284 { 285 struct memory_block *mem = to_memory_block(dev); 286 int ret; 287 288 if (mem->state == MEM_ONLINE) 289 return 0; 290 291 /* 292 * If we are called from store_mem_state(), online_type will be 293 * set >= 0 Otherwise we were called from the device online 294 * attribute and need to set the online_type. 295 */ 296 if (mem->online_type < 0) 297 mem->online_type = ONLINE_KEEP; 298 299 ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); 300 301 /* clear online_type */ 302 mem->online_type = -1; 303 304 return ret; 305 } 306 307 static int memory_subsys_offline(struct device *dev) 308 { 309 struct memory_block *mem = to_memory_block(dev); 310 311 if (mem->state == MEM_OFFLINE) 312 return 0; 313 314 return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); 315 } 316 317 static ssize_t 318 store_mem_state(struct device *dev, 319 struct device_attribute *attr, const char *buf, size_t count) 320 { 321 struct memory_block *mem = to_memory_block(dev); 322 int ret, online_type; 323 324 ret = lock_device_hotplug_sysfs(); 325 if (ret) 326 return ret; 327 328 if (!strncmp(buf, "online_kernel", min_t(int, count, 13))) 329 online_type = ONLINE_KERNEL; 330 else if (!strncmp(buf, "online_movable", min_t(int, count, 14))) 331 online_type = ONLINE_MOVABLE; 332 else if (!strncmp(buf, "online", min_t(int, count, 6))) 333 online_type = ONLINE_KEEP; 334 else if (!strncmp(buf, "offline", min_t(int, count, 7))) 335 online_type = -1; 336 else 337 return -EINVAL; 338 339 switch (online_type) { 340 case ONLINE_KERNEL: 341 case ONLINE_MOVABLE: 342 case ONLINE_KEEP: 343 /* 344 * mem->online_type is not protected so there can be a 345 * race here. However, when racing online, the first 346 * will succeed and the second will just return as the 347 * block will already be online. The online type 348 * could be either one, but that is expected. 349 */ 350 mem->online_type = online_type; 351 ret = device_online(&mem->dev); 352 break; 353 case -1: 354 ret = device_offline(&mem->dev); 355 break; 356 default: 357 ret = -EINVAL; /* should never happen */ 358 } 359 360 unlock_device_hotplug(); 361 362 if (ret) 363 return ret; 364 return count; 365 } 366 367 /* 368 * phys_device is a bad name for this. What I really want 369 * is a way to differentiate between memory ranges that 370 * are part of physical devices that constitute 371 * a complete removable unit or fru. 372 * i.e. do these ranges belong to the same physical device, 373 * s.t. if I offline all of these sections I can then 374 * remove the physical device? 375 */ 376 static ssize_t show_phys_device(struct device *dev, 377 struct device_attribute *attr, char *buf) 378 { 379 struct memory_block *mem = to_memory_block(dev); 380 return sprintf(buf, "%d\n", mem->phys_device); 381 } 382 383 static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL); 384 static DEVICE_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL); 385 static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state); 386 static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL); 387 static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL); 388 389 /* 390 * Block size attribute stuff 391 */ 392 static ssize_t 393 print_block_size(struct device *dev, struct device_attribute *attr, 394 char *buf) 395 { 396 return sprintf(buf, "%lx\n", get_memory_block_size()); 397 } 398 399 static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL); 400 401 /* 402 * Some architectures will have custom drivers to do this, and 403 * will not need to do it from userspace. The fake hot-add code 404 * as well as ppc64 will do all of their discovery in userspace 405 * and will require this interface. 406 */ 407 #ifdef CONFIG_ARCH_MEMORY_PROBE 408 static ssize_t 409 memory_probe_store(struct device *dev, struct device_attribute *attr, 410 const char *buf, size_t count) 411 { 412 u64 phys_addr; 413 int nid; 414 int i, ret; 415 unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block; 416 417 phys_addr = simple_strtoull(buf, NULL, 0); 418 419 if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1)) 420 return -EINVAL; 421 422 for (i = 0; i < sections_per_block; i++) { 423 nid = memory_add_physaddr_to_nid(phys_addr); 424 ret = add_memory(nid, phys_addr, 425 PAGES_PER_SECTION << PAGE_SHIFT); 426 if (ret) 427 goto out; 428 429 phys_addr += MIN_MEMORY_BLOCK_SIZE; 430 } 431 432 ret = count; 433 out: 434 return ret; 435 } 436 437 static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store); 438 #endif 439 440 #ifdef CONFIG_MEMORY_FAILURE 441 /* 442 * Support for offlining pages of memory 443 */ 444 445 /* Soft offline a page */ 446 static ssize_t 447 store_soft_offline_page(struct device *dev, 448 struct device_attribute *attr, 449 const char *buf, size_t count) 450 { 451 int ret; 452 u64 pfn; 453 if (!capable(CAP_SYS_ADMIN)) 454 return -EPERM; 455 if (kstrtoull(buf, 0, &pfn) < 0) 456 return -EINVAL; 457 pfn >>= PAGE_SHIFT; 458 if (!pfn_valid(pfn)) 459 return -ENXIO; 460 ret = soft_offline_page(pfn_to_page(pfn), 0); 461 return ret == 0 ? count : ret; 462 } 463 464 /* Forcibly offline a page, including killing processes. */ 465 static ssize_t 466 store_hard_offline_page(struct device *dev, 467 struct device_attribute *attr, 468 const char *buf, size_t count) 469 { 470 int ret; 471 u64 pfn; 472 if (!capable(CAP_SYS_ADMIN)) 473 return -EPERM; 474 if (kstrtoull(buf, 0, &pfn) < 0) 475 return -EINVAL; 476 pfn >>= PAGE_SHIFT; 477 ret = memory_failure(pfn, 0, 0); 478 return ret ? ret : count; 479 } 480 481 static DEVICE_ATTR(soft_offline_page, S_IWUSR, NULL, store_soft_offline_page); 482 static DEVICE_ATTR(hard_offline_page, S_IWUSR, NULL, store_hard_offline_page); 483 #endif 484 485 /* 486 * Note that phys_device is optional. It is here to allow for 487 * differentiation between which *physical* devices each 488 * section belongs to... 489 */ 490 int __weak arch_get_memory_phys_device(unsigned long start_pfn) 491 { 492 return 0; 493 } 494 495 /* 496 * A reference for the returned object is held and the reference for the 497 * hinted object is released. 498 */ 499 struct memory_block *find_memory_block_hinted(struct mem_section *section, 500 struct memory_block *hint) 501 { 502 int block_id = base_memory_block_id(__section_nr(section)); 503 struct device *hintdev = hint ? &hint->dev : NULL; 504 struct device *dev; 505 506 dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev); 507 if (hint) 508 put_device(&hint->dev); 509 if (!dev) 510 return NULL; 511 return to_memory_block(dev); 512 } 513 514 /* 515 * For now, we have a linear search to go find the appropriate 516 * memory_block corresponding to a particular phys_index. If 517 * this gets to be a real problem, we can always use a radix 518 * tree or something here. 519 * 520 * This could be made generic for all device subsystems. 521 */ 522 struct memory_block *find_memory_block(struct mem_section *section) 523 { 524 return find_memory_block_hinted(section, NULL); 525 } 526 527 static struct attribute *memory_memblk_attrs[] = { 528 &dev_attr_phys_index.attr, 529 &dev_attr_end_phys_index.attr, 530 &dev_attr_state.attr, 531 &dev_attr_phys_device.attr, 532 &dev_attr_removable.attr, 533 NULL 534 }; 535 536 static struct attribute_group memory_memblk_attr_group = { 537 .attrs = memory_memblk_attrs, 538 }; 539 540 static const struct attribute_group *memory_memblk_attr_groups[] = { 541 &memory_memblk_attr_group, 542 NULL, 543 }; 544 545 /* 546 * register_memory - Setup a sysfs device for a memory block 547 */ 548 static 549 int register_memory(struct memory_block *memory) 550 { 551 memory->dev.bus = &memory_subsys; 552 memory->dev.id = memory->start_section_nr / sections_per_block; 553 memory->dev.release = memory_block_release; 554 memory->dev.groups = memory_memblk_attr_groups; 555 memory->dev.offline = memory->state == MEM_OFFLINE; 556 557 return device_register(&memory->dev); 558 } 559 560 static int init_memory_block(struct memory_block **memory, 561 struct mem_section *section, unsigned long state) 562 { 563 struct memory_block *mem; 564 unsigned long start_pfn; 565 int scn_nr; 566 int ret = 0; 567 568 mem = kzalloc(sizeof(*mem), GFP_KERNEL); 569 if (!mem) 570 return -ENOMEM; 571 572 scn_nr = __section_nr(section); 573 mem->start_section_nr = 574 base_memory_block_id(scn_nr) * sections_per_block; 575 mem->end_section_nr = mem->start_section_nr + sections_per_block - 1; 576 mem->state = state; 577 mem->section_count++; 578 start_pfn = section_nr_to_pfn(mem->start_section_nr); 579 mem->phys_device = arch_get_memory_phys_device(start_pfn); 580 581 ret = register_memory(mem); 582 583 *memory = mem; 584 return ret; 585 } 586 587 static int add_memory_block(int base_section_nr) 588 { 589 struct memory_block *mem; 590 int i, ret, section_count = 0, section_nr; 591 592 for (i = base_section_nr; 593 (i < base_section_nr + sections_per_block) && i < NR_MEM_SECTIONS; 594 i++) { 595 if (!present_section_nr(i)) 596 continue; 597 if (section_count == 0) 598 section_nr = i; 599 section_count++; 600 } 601 602 if (section_count == 0) 603 return 0; 604 ret = init_memory_block(&mem, __nr_to_section(section_nr), MEM_ONLINE); 605 if (ret) 606 return ret; 607 mem->section_count = section_count; 608 return 0; 609 } 610 611 612 /* 613 * need an interface for the VM to add new memory regions, 614 * but without onlining it. 615 */ 616 int register_new_memory(int nid, struct mem_section *section) 617 { 618 int ret = 0; 619 struct memory_block *mem; 620 621 mutex_lock(&mem_sysfs_mutex); 622 623 mem = find_memory_block(section); 624 if (mem) { 625 mem->section_count++; 626 put_device(&mem->dev); 627 } else { 628 ret = init_memory_block(&mem, section, MEM_OFFLINE); 629 if (ret) 630 goto out; 631 } 632 633 if (mem->section_count == sections_per_block) 634 ret = register_mem_sect_under_node(mem, nid); 635 out: 636 mutex_unlock(&mem_sysfs_mutex); 637 return ret; 638 } 639 640 #ifdef CONFIG_MEMORY_HOTREMOVE 641 static void 642 unregister_memory(struct memory_block *memory) 643 { 644 BUG_ON(memory->dev.bus != &memory_subsys); 645 646 /* drop the ref. we got in remove_memory_block() */ 647 put_device(&memory->dev); 648 device_unregister(&memory->dev); 649 } 650 651 static int remove_memory_block(unsigned long node_id, 652 struct mem_section *section, int phys_device) 653 { 654 struct memory_block *mem; 655 656 mutex_lock(&mem_sysfs_mutex); 657 mem = find_memory_block(section); 658 unregister_mem_sect_under_nodes(mem, __section_nr(section)); 659 660 mem->section_count--; 661 if (mem->section_count == 0) 662 unregister_memory(mem); 663 else 664 put_device(&mem->dev); 665 666 mutex_unlock(&mem_sysfs_mutex); 667 return 0; 668 } 669 670 int unregister_memory_section(struct mem_section *section) 671 { 672 if (!present_section(section)) 673 return -EINVAL; 674 675 return remove_memory_block(0, section, 0); 676 } 677 #endif /* CONFIG_MEMORY_HOTREMOVE */ 678 679 /* return true if the memory block is offlined, otherwise, return false */ 680 bool is_memblock_offlined(struct memory_block *mem) 681 { 682 return mem->state == MEM_OFFLINE; 683 } 684 685 static struct attribute *memory_root_attrs[] = { 686 #ifdef CONFIG_ARCH_MEMORY_PROBE 687 &dev_attr_probe.attr, 688 #endif 689 690 #ifdef CONFIG_MEMORY_FAILURE 691 &dev_attr_soft_offline_page.attr, 692 &dev_attr_hard_offline_page.attr, 693 #endif 694 695 &dev_attr_block_size_bytes.attr, 696 NULL 697 }; 698 699 static struct attribute_group memory_root_attr_group = { 700 .attrs = memory_root_attrs, 701 }; 702 703 static const struct attribute_group *memory_root_attr_groups[] = { 704 &memory_root_attr_group, 705 NULL, 706 }; 707 708 /* 709 * Initialize the sysfs support for memory devices... 710 */ 711 int __init memory_dev_init(void) 712 { 713 unsigned int i; 714 int ret; 715 int err; 716 unsigned long block_sz; 717 718 ret = subsys_system_register(&memory_subsys, memory_root_attr_groups); 719 if (ret) 720 goto out; 721 722 block_sz = get_memory_block_size(); 723 sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; 724 725 /* 726 * Create entries for memory sections that were found 727 * during boot and have been initialized 728 */ 729 mutex_lock(&mem_sysfs_mutex); 730 for (i = 0; i < NR_MEM_SECTIONS; i += sections_per_block) { 731 err = add_memory_block(i); 732 if (!ret) 733 ret = err; 734 } 735 mutex_unlock(&mem_sysfs_mutex); 736 737 out: 738 if (ret) 739 printk(KERN_ERR "%s() failed: %d\n", __func__, ret); 740 return ret; 741 } 742