/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/kobject.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/stat.h>
#include <linux/slab.h>

#include <linux/atomic.h>
#include <asm/uaccess.h>

static DEFINE_MUTEX(mem_sysfs_mutex);

#define MEMORY_CLASS_NAME	"memory"

static int sections_per_block;

static inline int base_memory_block_id(int section_nr)
{
	return section_nr / sections_per_block;
}

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
};

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

int register_memory_isolate_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(register_memory_isolate_notifier);

void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);

static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = container_of(dev, struct memory_block, dev);

	kfree(mem);
}

/*
 * register_memory - Setup a sysfs device for a memory block
 */
static int register_memory(struct memory_block *memory)
{
	int error;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;

	error = device_register(&memory->dev);
	return error;
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}

static unsigned long get_memory_block_size(void)
{
	unsigned long block_sz;

	block_sz = memory_block_size_bytes();

	/* Validate block_sz is a power of 2 and not less than section size */
	if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
		WARN_ON(1);
		block_sz = MIN_MEMORY_BLOCK_SIZE;
	}

	return block_sz;
}
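/*
 * Worked example (illustrative numbers): with 128MB sections, as on
 * x86-64, and an architecture override of the weak
 * memory_block_size_bytes() above that reports a 2GB block size,
 * sections_per_block is 2GB / 128MB = 16, so section 35 belongs to
 * memory block id 35 / 16 = 2. With the default weak implementation,
 * every block covers exactly one section.
 */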
/*
 * Show the first physical section index covered by this memory
 * block, scaled down to memory block granularity.
 */
static ssize_t show_mem_start_phys_index(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

static ssize_t show_mem_end_phys_index(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	unsigned long phys_index;

	phys_index = mem->end_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

/*
 * Show whether the memory block is likely to be hot-removable
 */
static ssize_t show_mem_removable(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	unsigned long i, pfn;
	int ret = 1;
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);

	for (i = 0; i < sections_per_block; i++) {
		pfn = section_nr_to_pfn(mem->start_section_nr + i);
		ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
	}

	return sprintf(buf, "%d\n", ret);
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t show_mem_state(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	ssize_t len = 0;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		len = sprintf(buf, "online\n");
		break;
	case MEM_OFFLINE:
		len = sprintf(buf, "offline\n");
		break;
	case MEM_GOING_OFFLINE:
		len = sprintf(buf, "going-offline\n");
		break;
	default:
		len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
				mem->state);
		WARN_ON(1);
		break;
	}

	return len;
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

int memory_isolate_notify(unsigned long val, void *v)
{
	return atomic_notifier_call_chain(&memory_isolate_chain, val, v);
}
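/*
 * Sketch of a memory_chain consumer (illustrative; the callback and
 * notifier_block names are hypothetical). Callbacks receive a
 * struct memory_notify describing the affected range, and may veto
 * MEM_GOING_OFFLINE by returning notifier_from_errno(-EBUSY):
 *
 *	static int my_mem_callback(struct notifier_block *nb,
 *				   unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		pr_debug("action %lu, start_pfn %lx, nr_pages %lu\n",
 *			 action, mn->start_pfn, mn->nr_pages);
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_mem_nb = {
 *		.notifier_call = my_mem_callback,
 *	};
 *
 *	register_memory_notifier(&my_mem_nb);
 */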
/*
 * The probe routines leave the pages reserved, just as the bootmem code does.
 * Make sure they're still that way.
 */
static bool pages_correctly_reserved(unsigned long start_pfn)
{
	int i, j;
	struct page *page;
	unsigned long pfn = start_pfn;

	/*
	 * memmap between sections is not contiguous except with
	 * SPARSEMEM_VMEMMAP. We lookup the page once per section
	 * and assume memmap is contiguous within each section
	 */
	for (i = 0; i < sections_per_block; i++, pfn += PAGES_PER_SECTION) {
		if (WARN_ON_ONCE(!pfn_valid(pfn)))
			return false;
		page = pfn_to_page(pfn);

		for (j = 0; j < PAGES_PER_SECTION; j++) {
			if (PageReserved(page + j))
				continue;

			printk(KERN_WARNING "section number %ld page number %d "
				"not reserved, was it already online?\n",
				pfn_to_section_nr(pfn), j);

			return false;
		}
	}

	return true;
}

/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(unsigned long phys_index, unsigned long action, int online_type)
{
	unsigned long start_pfn;
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct page *first_page;
	int ret;

	first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);
	start_pfn = page_to_pfn(first_page);

	switch (action) {
	case MEM_ONLINE:
		if (!pages_correctly_reserved(start_pfn))
			return -EBUSY;

		ret = online_pages(start_pfn, nr_pages, online_type);
		break;
	case MEM_OFFLINE:
		ret = offline_pages(start_pfn, nr_pages);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, phys_index, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int __memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req,
		int online_type)
{
	int ret = 0;

	if (mem->state != from_state_req) {
		ret = -EINVAL;
		goto out;
	}

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem->start_section_nr, to_state, online_type);

	if (ret) {
		mem->state = from_state_req;
		goto out;
	}

	mem->state = to_state;
	switch (mem->state) {
	case MEM_OFFLINE:
		kobject_uevent(&mem->dev.kobj, KOBJ_OFFLINE);
		break;
	case MEM_ONLINE:
		kobject_uevent(&mem->dev.kobj, KOBJ_ONLINE);
		break;
	default:
		break;
	}
out:
	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req,
		int online_type)
{
	int ret;

	mutex_lock(&mem->state_mutex);
	ret = __memory_block_change_state(mem, to_state, from_state_req,
					  online_type);
	mutex_unlock(&mem->state_mutex);

	return ret;
}

static ssize_t
store_mem_state(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct memory_block *mem;
	int ret = -EINVAL;

	mem = container_of(dev, struct memory_block, dev);

	if (!strncmp(buf, "online_kernel", min_t(int, count, 13)))
		ret = memory_block_change_state(mem, MEM_ONLINE,
						MEM_OFFLINE, ONLINE_KERNEL);
	else if (!strncmp(buf, "online_movable", min_t(int, count, 14)))
		ret = memory_block_change_state(mem, MEM_ONLINE,
						MEM_OFFLINE, ONLINE_MOVABLE);
	else if (!strncmp(buf, "online", min_t(int, count, 6)))
		ret = memory_block_change_state(mem, MEM_ONLINE,
						MEM_OFFLINE, ONLINE_KEEP);
	else if (!strncmp(buf, "offline", min_t(int, count, 7)))
		ret = memory_block_change_state(mem, MEM_OFFLINE,
						MEM_ONLINE, -1);

	if (ret)
		return ret;
	return count;
}
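/*
 * Illustrative usage from userspace (the block number is hypothetical):
 *
 *	# echo offline        > /sys/devices/system/memory/memory32/state
 *	# echo online_movable > /sys/devices/system/memory/memory32/state
 *
 * Each transition is only accepted from the matching current state:
 * the "online*" variants require MEM_OFFLINE and "offline" requires
 * MEM_ONLINE, otherwise store_mem_state() returns -EINVAL.
 */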
/*
 * phys_device is a bad name for this. What I really want
 * is a way to differentiate between memory ranges that
 * are part of physical devices that constitute
 * a complete removable unit or FRU (field-replaceable unit),
 * i.e. do these ranges belong to the same physical device,
 * such that if I offline all of these sections I can then
 * remove the physical device?
 */
static ssize_t show_phys_device(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	return sprintf(buf, "%d\n", mem->phys_device);
}

static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
static DEVICE_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state);
static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL);
static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL);

#define mem_create_simple_file(mem, attr_name) \
	device_create_file(&mem->dev, &dev_attr_##attr_name)
#define mem_remove_simple_file(mem, attr_name) \
	device_remove_file(&mem->dev, &dev_attr_##attr_name)

/*
 * Block size attribute stuff
 */
static ssize_t
print_block_size(struct device *dev, struct device_attribute *attr,
		 char *buf)
{
	return sprintf(buf, "%lx\n", get_memory_block_size());
}

static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL);

static int block_size_init(void)
{
	return device_create_file(memory_subsys.dev_root,
				  &dev_attr_block_size_bytes);
}

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace. The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t
memory_probe_store(struct device *dev, struct device_attribute *attr,
		   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid;
	int i, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	phys_addr = simple_strtoull(buf, NULL, 0);

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	for (i = 0; i < sections_per_block; i++) {
		nid = memory_add_physaddr_to_nid(phys_addr);
		ret = add_memory(nid, phys_addr,
				 PAGES_PER_SECTION << PAGE_SHIFT);
		if (ret)
			goto out;

		phys_addr += MIN_MEMORY_BLOCK_SIZE;
	}

	ret = count;
out:
	return ret;
}
static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store);

static int memory_probe_init(void)
{
	return device_create_file(memory_subsys.dev_root, &dev_attr_probe);
}
#else
static inline int memory_probe_init(void)
{
	return 0;
}
#endif
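/*
 * Illustrative probe usage (the address is hypothetical and must be
 * aligned to the memory block size, or memory_probe_store() returns
 * -EINVAL):
 *
 *	# echo 0x200000000 > /sys/devices/system/memory/probe
 */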
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t
store_soft_offline_page(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	if (!pfn_valid(pfn))
		return -ENXIO;
	ret = soft_offline_page(pfn_to_page(pfn), 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t
store_hard_offline_page(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, 0, 0);
	return ret ? ret : count;
}

static DEVICE_ATTR(soft_offline_page, S_IWUSR, NULL, store_soft_offline_page);
static DEVICE_ATTR(hard_offline_page, S_IWUSR, NULL, store_hard_offline_page);

static __init int memory_fail_init(void)
{
	int err;

	err = device_create_file(memory_subsys.dev_root,
				 &dev_attr_soft_offline_page);
	if (!err)
		err = device_create_file(memory_subsys.dev_root,
					 &dev_attr_hard_offline_page);
	return err;
}
#else
static inline int memory_fail_init(void)
{
	return 0;
}
#endif
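/*
 * Illustrative usage (the address is hypothetical). Note that both
 * files take a physical address rather than a pfn; the store routines
 * above convert it themselves with "pfn >>= PAGE_SHIFT":
 *
 *	# echo 0x7f0d2000 > /sys/devices/system/memory/soft_offline_page
 */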
/*
 * Note that phys_device is optional. It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned object is held and the reference for the
 * hinted object is released.
 */
struct memory_block *find_memory_block_hinted(struct mem_section *section,
					      struct memory_block *hint)
{
	int block_id = base_memory_block_id(__section_nr(section));
	struct device *hintdev = hint ? &hint->dev : NULL;
	struct device *dev;

	dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev);
	if (hint)
		put_device(&hint->dev);
	if (!dev)
		return NULL;
	return container_of(dev, struct memory_block, dev);
}

/*
 * For now, we have a linear search to go find the appropriate
 * memory_block corresponding to a particular phys_index. If
 * this gets to be a real problem, we can always use a radix
 * tree or something here.
 *
 * This could be made generic for all device subsystems.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	return find_memory_block_hinted(section, NULL);
}
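/*
 * Sketch of the hint pattern (illustrative; the loop is pseudocode):
 * feeding each result back in as the hint lets a caller look up many
 * sections without rescanning the subsystem's device list from the
 * start each time, and without leaking references:
 *
 *	struct memory_block *mem = NULL;
 *
 *	for each present section {
 *		mem = find_memory_block_hinted(section, mem);
 *		...
 *	}
 *	if (mem)
 *		put_device(&mem->dev);
 */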
static int init_memory_block(struct memory_block **memory,
			     struct mem_section *section, unsigned long state)
{
	struct memory_block *mem;
	unsigned long start_pfn;
	int scn_nr;
	int ret = 0;

	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	scn_nr = __section_nr(section);
	mem->start_section_nr =
			base_memory_block_id(scn_nr) * sections_per_block;
	mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
	mem->state = state;
	mem->section_count++;
	mutex_init(&mem->state_mutex);
	start_pfn = section_nr_to_pfn(mem->start_section_nr);
	mem->phys_device = arch_get_memory_phys_device(start_pfn);

	ret = register_memory(mem);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_index);
	if (!ret)
		ret = mem_create_simple_file(mem, end_phys_index);
	if (!ret)
		ret = mem_create_simple_file(mem, state);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_device);
	if (!ret)
		ret = mem_create_simple_file(mem, removable);

	*memory = mem;
	return ret;
}

static int add_memory_section(int nid, struct mem_section *section,
			      struct memory_block **mem_p,
			      unsigned long state, enum mem_add_context context)
{
	struct memory_block *mem = NULL;
	int scn_nr = __section_nr(section);
	int ret = 0;

	mutex_lock(&mem_sysfs_mutex);

	if (context == BOOT) {
		/* same memory block ? */
		if (mem_p && *mem_p)
			if (scn_nr >= (*mem_p)->start_section_nr &&
			    scn_nr <= (*mem_p)->end_section_nr) {
				mem = *mem_p;
				kobject_get(&mem->dev.kobj);
			}
	} else
		mem = find_memory_block(section);

	if (mem) {
		mem->section_count++;
		kobject_put(&mem->dev.kobj);
	} else {
		ret = init_memory_block(&mem, section, state);
		/* store memory_block pointer for next loop */
		if (!ret && context == BOOT)
			if (mem_p)
				*mem_p = mem;
	}

	if (!ret) {
		if (context == HOTPLUG &&
		    mem->section_count == sections_per_block)
			ret = register_mem_sect_under_node(mem, nid);
	}

	mutex_unlock(&mem_sysfs_mutex);
	return ret;
}

/*
 * We need an interface for the VM to add new memory regions,
 * but without onlining them.
 */
int register_new_memory(int nid, struct mem_section *section)
{
	return add_memory_section(nid, section, NULL, MEM_OFFLINE, HOTPLUG);
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static void unregister_memory(struct memory_block *memory)
{
	BUG_ON(memory->dev.bus != &memory_subsys);

	/* drop the ref. we got in remove_memory_block() */
	kobject_put(&memory->dev.kobj);
	device_unregister(&memory->dev);
}

static int remove_memory_block(unsigned long node_id,
			       struct mem_section *section, int phys_device)
{
	struct memory_block *mem;

	mutex_lock(&mem_sysfs_mutex);
	mem = find_memory_block(section);
	unregister_mem_sect_under_nodes(mem, __section_nr(section));

	mem->section_count--;
	if (mem->section_count == 0) {
		mem_remove_simple_file(mem, phys_index);
		mem_remove_simple_file(mem, end_phys_index);
		mem_remove_simple_file(mem, state);
		mem_remove_simple_file(mem, phys_device);
		mem_remove_simple_file(mem, removable);
		unregister_memory(mem);
	} else
		kobject_put(&mem->dev.kobj);

	mutex_unlock(&mem_sysfs_mutex);
	return 0;
}

int unregister_memory_section(struct mem_section *section)
{
	if (!present_section(section))
		return -EINVAL;

	return remove_memory_block(0, section, 0);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

/*
 * Offline one memory block. If the memory block has already been
 * offlined, do nothing.
 */
int offline_memory_block(struct memory_block *mem)
{
	int ret = 0;

	mutex_lock(&mem->state_mutex);
	if (mem->state != MEM_OFFLINE)
		ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE, -1);
	mutex_unlock(&mem->state_mutex);

	return ret;
}

/* return true if the memory block is offlined, otherwise, return false */
bool is_memblock_offlined(struct memory_block *mem)
{
	return mem->state == MEM_OFFLINE;
}

/*
 * Initialize the sysfs support for memory devices...
 */
int __init memory_dev_init(void)
{
	unsigned int i;
	int ret;
	int err;
	unsigned long block_sz;
	struct memory_block *mem = NULL;

	ret = subsys_system_register(&memory_subsys, NULL);
	if (ret)
		goto out;

	block_sz = get_memory_block_size();
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (i = 0; i < NR_MEM_SECTIONS; i++) {
		if (!present_section_nr(i))
			continue;
		/* don't need to reuse memory_block if only one per block */
		err = add_memory_section(0, __nr_to_section(i),
					 (sections_per_block == 1) ? NULL : &mem,
					 MEM_ONLINE,
					 BOOT);
		if (!ret)
			ret = err;
	}

	err = memory_probe_init();
	if (!ret)
		ret = err;
	err = memory_fail_init();
	if (!ret)
		ret = err;
	err = block_size_init();
	if (!ret)
		ret = err;
out:
	if (ret)
		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
	return ret;
}