/*
 * Memory subsystem support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/kobject.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/stat.h>
#include <linux/slab.h>

#include <linux/atomic.h>
#include <asm/uaccess.h>

static DEFINE_MUTEX(mem_sysfs_mutex);

#define MEMORY_CLASS_NAME	"memory"

static int sections_per_block;

static inline int base_memory_block_id(int section_nr)
{
	return section_nr / sections_per_block;
}

static struct bus_type memory_subsys = {
	.name = MEMORY_CLASS_NAME,
	.dev_name = MEMORY_CLASS_NAME,
};

static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&memory_chain, nb);
}
EXPORT_SYMBOL(register_memory_notifier);

void unregister_memory_notifier(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&memory_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_notifier);

static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

int register_memory_isolate_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_register(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(register_memory_isolate_notifier);

void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
	atomic_notifier_chain_unregister(&memory_isolate_chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);

static void memory_block_release(struct device *dev)
{
	struct memory_block *mem = container_of(dev, struct memory_block, dev);

	kfree(mem);
}

/*
 * register_memory - Setup a sysfs device for a memory block
 */
static
int register_memory(struct memory_block *memory)
{
	int error;

	memory->dev.bus = &memory_subsys;
	memory->dev.id = memory->start_section_nr / sections_per_block;
	memory->dev.release = memory_block_release;

	error = device_register(&memory->dev);
	return error;
}

static void
unregister_memory(struct memory_block *memory)
{
	BUG_ON(memory->dev.bus != &memory_subsys);

	/* drop the ref. we got in remove_memory_block() */
	kobject_put(&memory->dev.kobj);
	device_unregister(&memory->dev);
}

unsigned long __weak memory_block_size_bytes(void)
{
	return MIN_MEMORY_BLOCK_SIZE;
}

static unsigned long get_memory_block_size(void)
{
	unsigned long block_sz;

	block_sz = memory_block_size_bytes();

	/* Validate block_sz is a power of 2 and not less than section size */
	if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
		WARN_ON(1);
		block_sz = MIN_MEMORY_BLOCK_SIZE;
	}

	return block_sz;
}
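/*
 * Illustrative arithmetic only; the concrete sizes below are assumptions,
 * not taken from this file.  MIN_MEMORY_BLOCK_SIZE is one section
 * (1 << SECTION_SIZE_BITS), and memory_dev_init() below sets
 *
 *	sections_per_block = get_memory_block_size() / MIN_MEMORY_BLOCK_SIZE;
 *
 * e.g. with 128MB sections and a 256MB block size:
 *
 *	sections_per_block      = 0x10000000 / 0x8000000 = 2
 *	base_memory_block_id(5) = 5 / 2 = 2	(sections 4 and 5 share block 2)
 */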
/*
 * use this as the physical section index that this memsection
 * uses.
 */

static ssize_t show_mem_start_phys_index(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

static ssize_t show_mem_end_phys_index(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	unsigned long phys_index;

	phys_index = mem->end_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

/*
 * Show whether the section of memory is likely to be hot-removable
 */
static ssize_t show_mem_removable(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	unsigned long i, pfn;
	int ret = 1;
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);

	for (i = 0; i < sections_per_block; i++) {
		pfn = section_nr_to_pfn(mem->start_section_nr + i);
		ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
	}

	return sprintf(buf, "%d\n", ret);
}

/*
 * online, offline, going offline, etc.
 */
static ssize_t show_mem_state(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	ssize_t len = 0;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
	case MEM_ONLINE:
		len = sprintf(buf, "online\n");
		break;
	case MEM_OFFLINE:
		len = sprintf(buf, "offline\n");
		break;
	case MEM_GOING_OFFLINE:
		len = sprintf(buf, "going-offline\n");
		break;
	default:
		len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
				mem->state);
		WARN_ON(1);
		break;
	}

	return len;
}

int memory_notify(unsigned long val, void *v)
{
	return blocking_notifier_call_chain(&memory_chain, val, v);
}

int memory_isolate_notify(unsigned long val, void *v)
{
	return atomic_notifier_call_chain(&memory_isolate_chain, val, v);
}

/*
 * The probe routines leave the pages reserved, just as the bootmem code does.
 * Make sure they're still that way.
 */
static bool pages_correctly_reserved(unsigned long start_pfn,
					unsigned long nr_pages)
{
	int i, j;
	struct page *page;
	unsigned long pfn = start_pfn;

	/*
	 * memmap between sections is not contiguous except with
	 * SPARSEMEM_VMEMMAP. We lookup the page once per section
	 * and assume memmap is contiguous within each section
	 */
	for (i = 0; i < sections_per_block; i++, pfn += PAGES_PER_SECTION) {
		if (WARN_ON_ONCE(!pfn_valid(pfn)))
			return false;
		page = pfn_to_page(pfn);

		for (j = 0; j < PAGES_PER_SECTION; j++) {
			if (PageReserved(page + j))
				continue;

			printk(KERN_WARNING "section number %ld page number %d "
				"not reserved, was it already online?\n",
				pfn_to_section_nr(pfn), j);

			return false;
		}
	}

	return true;
}
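/*
 * Illustrative only, not part of this file: a minimal sketch of a client
 * of the notifier chain flushed by memory_notify() above.  The callback
 * and notifier_block names are hypothetical; register_memory_notifier(),
 * the MEM_* actions and the struct memory_notify argument (whose
 * start_pfn and nr_pages fields describe the affected range) are the
 * real interface, invoked from the online/offline paths in
 * mm/memory_hotplug.c.
 *
 *	static int example_memory_callback(struct notifier_block *self,
 *					   unsigned long action, void *arg)
 *	{
 *		struct memory_notify *mn = arg;
 *
 *		switch (action) {
 *		case MEM_GOING_OFFLINE:
 *		case MEM_ONLINE:
 *		case MEM_OFFLINE:
 *			pr_debug("memory event %lu at pfn %lx (%lu pages)\n",
 *				 action, mn->start_pfn, mn->nr_pages);
 *			break;
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block example_memory_nb = {
 *		.notifier_call = example_memory_callback,
 *	};
 *
 *	register_memory_notifier(&example_memory_nb);
 */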
/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
memory_block_action(unsigned long phys_index, unsigned long action, int online_type)
{
	unsigned long start_pfn;
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
	struct page *first_page;
	int ret;

	first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);
	start_pfn = page_to_pfn(first_page);

	switch (action) {
	case MEM_ONLINE:
		if (!pages_correctly_reserved(start_pfn, nr_pages))
			return -EBUSY;

		ret = online_pages(start_pfn, nr_pages, online_type);
		break;
	case MEM_OFFLINE:
		ret = offline_pages(start_pfn, nr_pages);
		break;
	default:
		WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
		     "%ld\n", __func__, phys_index, action, action);
		ret = -EINVAL;
	}

	return ret;
}

static int __memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req,
		int online_type)
{
	int ret = 0;

	if (mem->state != from_state_req) {
		ret = -EINVAL;
		goto out;
	}

	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

	ret = memory_block_action(mem->start_section_nr, to_state, online_type);

	if (ret) {
		mem->state = from_state_req;
		goto out;
	}

	mem->state = to_state;
	switch (mem->state) {
	case MEM_OFFLINE:
		kobject_uevent(&mem->dev.kobj, KOBJ_OFFLINE);
		break;
	case MEM_ONLINE:
		kobject_uevent(&mem->dev.kobj, KOBJ_ONLINE);
		break;
	default:
		break;
	}
out:
	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req,
		int online_type)
{
	int ret;

	mutex_lock(&mem->state_mutex);
	ret = __memory_block_change_state(mem, to_state, from_state_req,
					  online_type);
	mutex_unlock(&mem->state_mutex);

	return ret;
}

static ssize_t
store_mem_state(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct memory_block *mem;
	int ret = -EINVAL;

	mem = container_of(dev, struct memory_block, dev);

	if (!strncmp(buf, "online_kernel", min_t(int, count, 13)))
		ret = memory_block_change_state(mem, MEM_ONLINE,
						MEM_OFFLINE, ONLINE_KERNEL);
	else if (!strncmp(buf, "online_movable", min_t(int, count, 14)))
		ret = memory_block_change_state(mem, MEM_ONLINE,
						MEM_OFFLINE, ONLINE_MOVABLE);
	else if (!strncmp(buf, "online", min_t(int, count, 6)))
		ret = memory_block_change_state(mem, MEM_ONLINE,
						MEM_OFFLINE, ONLINE_KEEP);
	else if (!strncmp(buf, "offline", min_t(int, count, 7)))
		ret = memory_block_change_state(mem, MEM_OFFLINE,
						MEM_ONLINE, -1);

	if (ret)
		return ret;
	return count;
}
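/*
 * Example of driving the "state" attribute above from userspace (the
 * block number is illustrative):
 *
 *	# cat /sys/devices/system/memory/memory32/state
 *	offline
 *	# echo online_movable > /sys/devices/system/memory/memory32/state
 *	# echo offline > /sys/devices/system/memory/memory32/state
 *
 * Note that store_mem_state() must match "online_kernel" and
 * "online_movable" before plain "online": the shorter strncmp() would
 * otherwise accept either of them as "online".
 */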
/*
 * phys_device is a bad name for this.  What I really want
 * is a way to differentiate between memory ranges that
 * are part of physical devices that constitute
 * a complete removable unit or fru.
 * i.e. do these ranges belong to the same physical device,
 * s.t. if I offline all of these sections I can then
 * remove the physical device?
 */
static ssize_t show_phys_device(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, dev);
	return sprintf(buf, "%d\n", mem->phys_device);
}

static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
static DEVICE_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state);
static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL);
static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL);

#define mem_create_simple_file(mem, attr_name)	\
	device_create_file(&mem->dev, &dev_attr_##attr_name)
#define mem_remove_simple_file(mem, attr_name)	\
	device_remove_file(&mem->dev, &dev_attr_##attr_name)

/*
 * Block size attribute stuff
 */
static ssize_t
print_block_size(struct device *dev, struct device_attribute *attr,
		 char *buf)
{
	return sprintf(buf, "%lx\n", get_memory_block_size());
}

static DEVICE_ATTR(block_size_bytes, 0444, print_block_size, NULL);

static int block_size_init(void)
{
	return device_create_file(memory_subsys.dev_root,
				  &dev_attr_block_size_bytes);
}

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.  The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t
memory_probe_store(struct device *dev, struct device_attribute *attr,
		   const char *buf, size_t count)
{
	u64 phys_addr;
	int nid;
	int i, ret;
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;

	phys_addr = simple_strtoull(buf, NULL, 0);

	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

	for (i = 0; i < sections_per_block; i++) {
		nid = memory_add_physaddr_to_nid(phys_addr);
		ret = add_memory(nid, phys_addr,
				 PAGES_PER_SECTION << PAGE_SHIFT);
		if (ret)
			goto out;

		phys_addr += MIN_MEMORY_BLOCK_SIZE;
	}

	ret = count;
out:
	return ret;
}
static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store);

static int memory_probe_init(void)
{
	return device_create_file(memory_subsys.dev_root, &dev_attr_probe);
}
#else
static inline int memory_probe_init(void)
{
	return 0;
}
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t
store_soft_offline_page(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	if (!pfn_valid(pfn))
		return -ENXIO;
	ret = soft_offline_page(pfn_to_page(pfn), 0);
	return ret == 0 ? count : ret;
}
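/*
 * Illustrative usage of the attribute above (the address is made up):
 * the value written is a physical address in bytes, not a pfn; it is
 * converted with the PAGE_SHIFT right-shift above before the page is
 * soft-offlined.
 *
 *	# echo 0x200000000 > /sys/devices/system/memory/soft_offline_page
 */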
/* Forcibly offline a page, including killing processes. */
static ssize_t
store_hard_offline_page(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = memory_failure(pfn, 0, 0);
	return ret ? ret : count;
}

static DEVICE_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page);
static DEVICE_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page);

static __init int memory_fail_init(void)
{
	int err;

	err = device_create_file(memory_subsys.dev_root,
				&dev_attr_soft_offline_page);
	if (!err)
		err = device_create_file(memory_subsys.dev_root,
				&dev_attr_hard_offline_page);
	return err;
}
#else
static inline int memory_fail_init(void)
{
	return 0;
}
#endif

/*
 * Note that phys_device is optional.  It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}

/*
 * A reference for the returned object is held and the reference for the
 * hinted object is released.
 */
struct memory_block *find_memory_block_hinted(struct mem_section *section,
					      struct memory_block *hint)
{
	int block_id = base_memory_block_id(__section_nr(section));
	struct device *hintdev = hint ? &hint->dev : NULL;
	struct device *dev;

	dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev);
	if (hint)
		put_device(&hint->dev);
	if (!dev)
		return NULL;
	return container_of(dev, struct memory_block, dev);
}
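/*
 * Illustrative only: one way a caller can walk consecutive sections and
 * reuse the hint, so that the previous block's reference is dropped by
 * the lookup itself and only the last block needs an explicit put.  The
 * loop bounds are assumptions; a similar hinted-walk pattern is used
 * when linking memory blocks under NUMA nodes.
 *
 *	struct memory_block *mem = NULL;
 *	unsigned long sec;
 *
 *	for (sec = start; sec <= end; sec++) {
 *		if (!present_section_nr(sec))
 *			continue;
 *		mem = find_memory_block_hinted(__nr_to_section(sec), mem);
 *		if (!mem)
 *			continue;
 *		... use mem ...
 *	}
 *	if (mem)
 *		kobject_put(&mem->dev.kobj);
 */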
/*
 * For now, we have a linear search to go find the appropriate
 * memory_block corresponding to a particular phys_index. If
 * this gets to be a real problem, we can always use a radix
 * tree or something here.
 *
 * This could be made generic for all device subsystems.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	return find_memory_block_hinted(section, NULL);
}

static int init_memory_block(struct memory_block **memory,
			     struct mem_section *section, unsigned long state)
{
	struct memory_block *mem;
	unsigned long start_pfn;
	int scn_nr;
	int ret = 0;

	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	scn_nr = __section_nr(section);
	mem->start_section_nr =
			base_memory_block_id(scn_nr) * sections_per_block;
	mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
	mem->state = state;
	mem->section_count++;
	mutex_init(&mem->state_mutex);
	start_pfn = section_nr_to_pfn(mem->start_section_nr);
	mem->phys_device = arch_get_memory_phys_device(start_pfn);

	ret = register_memory(mem);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_index);
	if (!ret)
		ret = mem_create_simple_file(mem, end_phys_index);
	if (!ret)
		ret = mem_create_simple_file(mem, state);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_device);
	if (!ret)
		ret = mem_create_simple_file(mem, removable);

	*memory = mem;
	return ret;
}

static int add_memory_section(int nid, struct mem_section *section,
			struct memory_block **mem_p,
			unsigned long state, enum mem_add_context context)
{
	struct memory_block *mem = NULL;
	int scn_nr = __section_nr(section);
	int ret = 0;

	mutex_lock(&mem_sysfs_mutex);

	if (context == BOOT) {
		/* same memory block ? */
		if (mem_p && *mem_p)
			if (scn_nr >= (*mem_p)->start_section_nr &&
			    scn_nr <= (*mem_p)->end_section_nr) {
				mem = *mem_p;
				kobject_get(&mem->dev.kobj);
			}
	} else
		mem = find_memory_block(section);

	if (mem) {
		mem->section_count++;
		kobject_put(&mem->dev.kobj);
	} else {
		ret = init_memory_block(&mem, section, state);
		/* store memory_block pointer for next loop */
		if (!ret && context == BOOT)
			if (mem_p)
				*mem_p = mem;
	}

	if (!ret) {
		if (context == HOTPLUG &&
		    mem->section_count == sections_per_block)
			ret = register_mem_sect_under_node(mem, nid);
	}

	mutex_unlock(&mem_sysfs_mutex);
	return ret;
}

int remove_memory_block(unsigned long node_id, struct mem_section *section,
		int phys_device)
{
	struct memory_block *mem;

	mutex_lock(&mem_sysfs_mutex);
	mem = find_memory_block(section);
	unregister_mem_sect_under_nodes(mem, __section_nr(section));

	mem->section_count--;
	if (mem->section_count == 0) {
		mem_remove_simple_file(mem, phys_index);
		mem_remove_simple_file(mem, end_phys_index);
		mem_remove_simple_file(mem, state);
		mem_remove_simple_file(mem, phys_device);
		mem_remove_simple_file(mem, removable);
		unregister_memory(mem);
	} else
		kobject_put(&mem->dev.kobj);

	mutex_unlock(&mem_sysfs_mutex);
	return 0;
}

/*
 * need an interface for the VM to add new memory regions,
 * but without onlining it.
 */
int register_new_memory(int nid, struct mem_section *section)
{
	return add_memory_section(nid, section, NULL, MEM_OFFLINE, HOTPLUG);
}

int unregister_memory_section(struct mem_section *section)
{
	if (!present_section(section))
		return -EINVAL;

	return remove_memory_block(0, section, 0);
}
/*
 * offline one memory block. If the memory block has been offlined, do nothing.
 */
int offline_memory_block(struct memory_block *mem)
{
	int ret = 0;

	mutex_lock(&mem->state_mutex);
	if (mem->state != MEM_OFFLINE)
		ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE, -1);
	mutex_unlock(&mem->state_mutex);

	return ret;
}

/*
 * Initialize the sysfs support for memory devices...
 */
int __init memory_dev_init(void)
{
	unsigned int i;
	int ret;
	int err;
	unsigned long block_sz;
	struct memory_block *mem = NULL;

	ret = subsys_system_register(&memory_subsys, NULL);
	if (ret)
		goto out;

	block_sz = get_memory_block_size();
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (i = 0; i < NR_MEM_SECTIONS; i++) {
		if (!present_section_nr(i))
			continue;
		/* don't need to reuse memory_block if only one per block */
		err = add_memory_section(0, __nr_to_section(i),
					 (sections_per_block == 1) ? NULL : &mem,
					 MEM_ONLINE,
					 BOOT);
		if (!ret)
			ret = err;
	}

	err = memory_probe_init();
	if (!ret)
		ret = err;
	err = memory_fail_init();
	if (!ret)
		ret = err;
	err = block_size_init();
	if (!ret)
		ret = err;
out:
	if (ret)
		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
	return ret;
}
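/*
 * Resulting sysfs layout, for illustration (block numbers and the set of
 * present blocks depend on the machine; probe, soft_offline_page and
 * hard_offline_page only appear with the corresponding config options):
 *
 *	/sys/devices/system/memory/block_size_bytes
 *	/sys/devices/system/memory/memory0/phys_index
 *	/sys/devices/system/memory/memory0/end_phys_index
 *	/sys/devices/system/memory/memory0/state
 *	/sys/devices/system/memory/memory0/phys_device
 *	/sys/devices/system/memory/memory0/removable
 *	/sys/devices/system/memory/memory1/...
 */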