1 /* 2 * drivers/base/memory.c - basic Memory class support 3 * 4 * Written by Matt Tolentino <matthew.e.tolentino@intel.com> 5 * Dave Hansen <haveblue@us.ibm.com> 6 * 7 * This file provides the necessary infrastructure to represent 8 * a SPARSEMEM-memory-model system's physical memory in /sysfs. 9 * All arch-independent code that assumes MEMORY_HOTPLUG requires 10 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c. 11 */ 12 13 #include <linux/sysdev.h> 14 #include <linux/module.h> 15 #include <linux/init.h> 16 #include <linux/topology.h> 17 #include <linux/capability.h> 18 #include <linux/device.h> 19 #include <linux/memory.h> 20 #include <linux/kobject.h> 21 #include <linux/memory_hotplug.h> 22 #include <linux/mm.h> 23 #include <linux/mutex.h> 24 #include <linux/stat.h> 25 #include <linux/slab.h> 26 27 #include <linux/atomic.h> 28 #include <asm/uaccess.h> 29 30 static DEFINE_MUTEX(mem_sysfs_mutex); 31 32 #define MEMORY_CLASS_NAME "memory" 33 34 static int sections_per_block; 35 36 static inline int base_memory_block_id(int section_nr) 37 { 38 return section_nr / sections_per_block; 39 } 40 41 static struct sysdev_class memory_sysdev_class = { 42 .name = MEMORY_CLASS_NAME, 43 }; 44 45 static const char *memory_uevent_name(struct kset *kset, struct kobject *kobj) 46 { 47 return MEMORY_CLASS_NAME; 48 } 49 50 static int memory_uevent(struct kset *kset, struct kobject *obj, 51 struct kobj_uevent_env *env) 52 { 53 int retval = 0; 54 55 return retval; 56 } 57 58 static const struct kset_uevent_ops memory_uevent_ops = { 59 .name = memory_uevent_name, 60 .uevent = memory_uevent, 61 }; 62 63 static BLOCKING_NOTIFIER_HEAD(memory_chain); 64 65 int register_memory_notifier(struct notifier_block *nb) 66 { 67 return blocking_notifier_chain_register(&memory_chain, nb); 68 } 69 EXPORT_SYMBOL(register_memory_notifier); 70 71 void unregister_memory_notifier(struct notifier_block *nb) 72 { 73 blocking_notifier_chain_unregister(&memory_chain, nb); 74 } 75 EXPORT_SYMBOL(unregister_memory_notifier); 76 77 static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain); 78 79 int register_memory_isolate_notifier(struct notifier_block *nb) 80 { 81 return atomic_notifier_chain_register(&memory_isolate_chain, nb); 82 } 83 EXPORT_SYMBOL(register_memory_isolate_notifier); 84 85 void unregister_memory_isolate_notifier(struct notifier_block *nb) 86 { 87 atomic_notifier_chain_unregister(&memory_isolate_chain, nb); 88 } 89 EXPORT_SYMBOL(unregister_memory_isolate_notifier); 90 91 /* 92 * register_memory - Setup a sysfs device for a memory block 93 */ 94 static 95 int register_memory(struct memory_block *memory) 96 { 97 int error; 98 99 memory->sysdev.cls = &memory_sysdev_class; 100 memory->sysdev.id = memory->start_section_nr / sections_per_block; 101 102 error = sysdev_register(&memory->sysdev); 103 return error; 104 } 105 106 static void 107 unregister_memory(struct memory_block *memory) 108 { 109 BUG_ON(memory->sysdev.cls != &memory_sysdev_class); 110 111 /* drop the ref. we got in remove_memory_block() */ 112 kobject_put(&memory->sysdev.kobj); 113 sysdev_unregister(&memory->sysdev); 114 } 115 116 unsigned long __weak memory_block_size_bytes(void) 117 { 118 return MIN_MEMORY_BLOCK_SIZE; 119 } 120 121 static unsigned long get_memory_block_size(void) 122 { 123 unsigned long block_sz; 124 125 block_sz = memory_block_size_bytes(); 126 127 /* Validate blk_sz is a power of 2 and not less than section size */ 128 if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) { 129 WARN_ON(1); 130 block_sz = MIN_MEMORY_BLOCK_SIZE; 131 } 132 133 return block_sz; 134 } 135 136 /* 137 * use this as the physical section index that this memsection 138 * uses. 139 */ 140 141 static ssize_t show_mem_start_phys_index(struct sys_device *dev, 142 struct sysdev_attribute *attr, char *buf) 143 { 144 struct memory_block *mem = 145 container_of(dev, struct memory_block, sysdev); 146 unsigned long phys_index; 147 148 phys_index = mem->start_section_nr / sections_per_block; 149 return sprintf(buf, "%08lx\n", phys_index); 150 } 151 152 static ssize_t show_mem_end_phys_index(struct sys_device *dev, 153 struct sysdev_attribute *attr, char *buf) 154 { 155 struct memory_block *mem = 156 container_of(dev, struct memory_block, sysdev); 157 unsigned long phys_index; 158 159 phys_index = mem->end_section_nr / sections_per_block; 160 return sprintf(buf, "%08lx\n", phys_index); 161 } 162 163 /* 164 * Show whether the section of memory is likely to be hot-removable 165 */ 166 static ssize_t show_mem_removable(struct sys_device *dev, 167 struct sysdev_attribute *attr, char *buf) 168 { 169 unsigned long i, pfn; 170 int ret = 1; 171 struct memory_block *mem = 172 container_of(dev, struct memory_block, sysdev); 173 174 for (i = 0; i < sections_per_block; i++) { 175 pfn = section_nr_to_pfn(mem->start_section_nr + i); 176 ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION); 177 } 178 179 return sprintf(buf, "%d\n", ret); 180 } 181 182 /* 183 * online, offline, going offline, etc. 184 */ 185 static ssize_t show_mem_state(struct sys_device *dev, 186 struct sysdev_attribute *attr, char *buf) 187 { 188 struct memory_block *mem = 189 container_of(dev, struct memory_block, sysdev); 190 ssize_t len = 0; 191 192 /* 193 * We can probably put these states in a nice little array 194 * so that they're not open-coded 195 */ 196 switch (mem->state) { 197 case MEM_ONLINE: 198 len = sprintf(buf, "online\n"); 199 break; 200 case MEM_OFFLINE: 201 len = sprintf(buf, "offline\n"); 202 break; 203 case MEM_GOING_OFFLINE: 204 len = sprintf(buf, "going-offline\n"); 205 break; 206 default: 207 len = sprintf(buf, "ERROR-UNKNOWN-%ld\n", 208 mem->state); 209 WARN_ON(1); 210 break; 211 } 212 213 return len; 214 } 215 216 int memory_notify(unsigned long val, void *v) 217 { 218 return blocking_notifier_call_chain(&memory_chain, val, v); 219 } 220 221 int memory_isolate_notify(unsigned long val, void *v) 222 { 223 return atomic_notifier_call_chain(&memory_isolate_chain, val, v); 224 } 225 226 /* 227 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is 228 * OK to have direct references to sparsemem variables in here. 229 */ 230 static int 231 memory_block_action(unsigned long phys_index, unsigned long action) 232 { 233 int i; 234 unsigned long start_pfn, start_paddr; 235 unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; 236 struct page *first_page; 237 int ret; 238 239 first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT); 240 241 /* 242 * The probe routines leave the pages reserved, just 243 * as the bootmem code does. Make sure they're still 244 * that way. 245 */ 246 if (action == MEM_ONLINE) { 247 for (i = 0; i < nr_pages; i++) { 248 if (PageReserved(first_page+i)) 249 continue; 250 251 printk(KERN_WARNING "section number %ld page number %d " 252 "not reserved, was it already online?\n", 253 phys_index, i); 254 return -EBUSY; 255 } 256 } 257 258 switch (action) { 259 case MEM_ONLINE: 260 start_pfn = page_to_pfn(first_page); 261 ret = online_pages(start_pfn, nr_pages); 262 break; 263 case MEM_OFFLINE: 264 start_paddr = page_to_pfn(first_page) << PAGE_SHIFT; 265 ret = remove_memory(start_paddr, 266 nr_pages << PAGE_SHIFT); 267 break; 268 default: 269 WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: " 270 "%ld\n", __func__, phys_index, action, action); 271 ret = -EINVAL; 272 } 273 274 return ret; 275 } 276 277 static int memory_block_change_state(struct memory_block *mem, 278 unsigned long to_state, unsigned long from_state_req) 279 { 280 int ret = 0; 281 282 mutex_lock(&mem->state_mutex); 283 284 if (mem->state != from_state_req) { 285 ret = -EINVAL; 286 goto out; 287 } 288 289 if (to_state == MEM_OFFLINE) 290 mem->state = MEM_GOING_OFFLINE; 291 292 ret = memory_block_action(mem->start_section_nr, to_state); 293 294 if (ret) 295 mem->state = from_state_req; 296 else 297 mem->state = to_state; 298 299 out: 300 mutex_unlock(&mem->state_mutex); 301 return ret; 302 } 303 304 static ssize_t 305 store_mem_state(struct sys_device *dev, 306 struct sysdev_attribute *attr, const char *buf, size_t count) 307 { 308 struct memory_block *mem; 309 int ret = -EINVAL; 310 311 mem = container_of(dev, struct memory_block, sysdev); 312 313 if (!strncmp(buf, "online", min((int)count, 6))) 314 ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); 315 else if(!strncmp(buf, "offline", min((int)count, 7))) 316 ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); 317 318 if (ret) 319 return ret; 320 return count; 321 } 322 323 /* 324 * phys_device is a bad name for this. What I really want 325 * is a way to differentiate between memory ranges that 326 * are part of physical devices that constitute 327 * a complete removable unit or fru. 328 * i.e. do these ranges belong to the same physical device, 329 * s.t. if I offline all of these sections I can then 330 * remove the physical device? 331 */ 332 static ssize_t show_phys_device(struct sys_device *dev, 333 struct sysdev_attribute *attr, char *buf) 334 { 335 struct memory_block *mem = 336 container_of(dev, struct memory_block, sysdev); 337 return sprintf(buf, "%d\n", mem->phys_device); 338 } 339 340 static SYSDEV_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL); 341 static SYSDEV_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL); 342 static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state); 343 static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL); 344 static SYSDEV_ATTR(removable, 0444, show_mem_removable, NULL); 345 346 #define mem_create_simple_file(mem, attr_name) \ 347 sysdev_create_file(&mem->sysdev, &attr_##attr_name) 348 #define mem_remove_simple_file(mem, attr_name) \ 349 sysdev_remove_file(&mem->sysdev, &attr_##attr_name) 350 351 /* 352 * Block size attribute stuff 353 */ 354 static ssize_t 355 print_block_size(struct sysdev_class *class, struct sysdev_class_attribute *attr, 356 char *buf) 357 { 358 return sprintf(buf, "%lx\n", get_memory_block_size()); 359 } 360 361 static SYSDEV_CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL); 362 363 static int block_size_init(void) 364 { 365 return sysfs_create_file(&memory_sysdev_class.kset.kobj, 366 &attr_block_size_bytes.attr); 367 } 368 369 /* 370 * Some architectures will have custom drivers to do this, and 371 * will not need to do it from userspace. The fake hot-add code 372 * as well as ppc64 will do all of their discovery in userspace 373 * and will require this interface. 374 */ 375 #ifdef CONFIG_ARCH_MEMORY_PROBE 376 static ssize_t 377 memory_probe_store(struct class *class, struct class_attribute *attr, 378 const char *buf, size_t count) 379 { 380 u64 phys_addr; 381 int nid; 382 int i, ret; 383 384 phys_addr = simple_strtoull(buf, NULL, 0); 385 386 for (i = 0; i < sections_per_block; i++) { 387 nid = memory_add_physaddr_to_nid(phys_addr); 388 ret = add_memory(nid, phys_addr, 389 PAGES_PER_SECTION << PAGE_SHIFT); 390 if (ret) 391 goto out; 392 393 phys_addr += MIN_MEMORY_BLOCK_SIZE; 394 } 395 396 ret = count; 397 out: 398 return ret; 399 } 400 static CLASS_ATTR(probe, S_IWUSR, NULL, memory_probe_store); 401 402 static int memory_probe_init(void) 403 { 404 return sysfs_create_file(&memory_sysdev_class.kset.kobj, 405 &class_attr_probe.attr); 406 } 407 #else 408 static inline int memory_probe_init(void) 409 { 410 return 0; 411 } 412 #endif 413 414 #ifdef CONFIG_MEMORY_FAILURE 415 /* 416 * Support for offlining pages of memory 417 */ 418 419 /* Soft offline a page */ 420 static ssize_t 421 store_soft_offline_page(struct class *class, 422 struct class_attribute *attr, 423 const char *buf, size_t count) 424 { 425 int ret; 426 u64 pfn; 427 if (!capable(CAP_SYS_ADMIN)) 428 return -EPERM; 429 if (strict_strtoull(buf, 0, &pfn) < 0) 430 return -EINVAL; 431 pfn >>= PAGE_SHIFT; 432 if (!pfn_valid(pfn)) 433 return -ENXIO; 434 ret = soft_offline_page(pfn_to_page(pfn), 0); 435 return ret == 0 ? count : ret; 436 } 437 438 /* Forcibly offline a page, including killing processes. */ 439 static ssize_t 440 store_hard_offline_page(struct class *class, 441 struct class_attribute *attr, 442 const char *buf, size_t count) 443 { 444 int ret; 445 u64 pfn; 446 if (!capable(CAP_SYS_ADMIN)) 447 return -EPERM; 448 if (strict_strtoull(buf, 0, &pfn) < 0) 449 return -EINVAL; 450 pfn >>= PAGE_SHIFT; 451 ret = __memory_failure(pfn, 0, 0); 452 return ret ? ret : count; 453 } 454 455 static CLASS_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page); 456 static CLASS_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page); 457 458 static __init int memory_fail_init(void) 459 { 460 int err; 461 462 err = sysfs_create_file(&memory_sysdev_class.kset.kobj, 463 &class_attr_soft_offline_page.attr); 464 if (!err) 465 err = sysfs_create_file(&memory_sysdev_class.kset.kobj, 466 &class_attr_hard_offline_page.attr); 467 return err; 468 } 469 #else 470 static inline int memory_fail_init(void) 471 { 472 return 0; 473 } 474 #endif 475 476 /* 477 * Note that phys_device is optional. It is here to allow for 478 * differentiation between which *physical* devices each 479 * section belongs to... 480 */ 481 int __weak arch_get_memory_phys_device(unsigned long start_pfn) 482 { 483 return 0; 484 } 485 486 struct memory_block *find_memory_block_hinted(struct mem_section *section, 487 struct memory_block *hint) 488 { 489 struct kobject *kobj; 490 struct sys_device *sysdev; 491 struct memory_block *mem; 492 char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1]; 493 int block_id = base_memory_block_id(__section_nr(section)); 494 495 kobj = hint ? &hint->sysdev.kobj : NULL; 496 497 /* 498 * This only works because we know that section == sysdev->id 499 * slightly redundant with sysdev_register() 500 */ 501 sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, block_id); 502 503 kobj = kset_find_obj_hinted(&memory_sysdev_class.kset, name, kobj); 504 if (!kobj) 505 return NULL; 506 507 sysdev = container_of(kobj, struct sys_device, kobj); 508 mem = container_of(sysdev, struct memory_block, sysdev); 509 510 return mem; 511 } 512 513 /* 514 * For now, we have a linear search to go find the appropriate 515 * memory_block corresponding to a particular phys_index. If 516 * this gets to be a real problem, we can always use a radix 517 * tree or something here. 518 * 519 * This could be made generic for all sysdev classes. 520 */ 521 struct memory_block *find_memory_block(struct mem_section *section) 522 { 523 return find_memory_block_hinted(section, NULL); 524 } 525 526 static int init_memory_block(struct memory_block **memory, 527 struct mem_section *section, unsigned long state) 528 { 529 struct memory_block *mem; 530 unsigned long start_pfn; 531 int scn_nr; 532 int ret = 0; 533 534 mem = kzalloc(sizeof(*mem), GFP_KERNEL); 535 if (!mem) 536 return -ENOMEM; 537 538 scn_nr = __section_nr(section); 539 mem->start_section_nr = 540 base_memory_block_id(scn_nr) * sections_per_block; 541 mem->end_section_nr = mem->start_section_nr + sections_per_block - 1; 542 mem->state = state; 543 mem->section_count++; 544 mutex_init(&mem->state_mutex); 545 start_pfn = section_nr_to_pfn(mem->start_section_nr); 546 mem->phys_device = arch_get_memory_phys_device(start_pfn); 547 548 ret = register_memory(mem); 549 if (!ret) 550 ret = mem_create_simple_file(mem, phys_index); 551 if (!ret) 552 ret = mem_create_simple_file(mem, end_phys_index); 553 if (!ret) 554 ret = mem_create_simple_file(mem, state); 555 if (!ret) 556 ret = mem_create_simple_file(mem, phys_device); 557 if (!ret) 558 ret = mem_create_simple_file(mem, removable); 559 560 *memory = mem; 561 return ret; 562 } 563 564 static int add_memory_section(int nid, struct mem_section *section, 565 unsigned long state, enum mem_add_context context) 566 { 567 struct memory_block *mem; 568 int ret = 0; 569 570 mutex_lock(&mem_sysfs_mutex); 571 572 mem = find_memory_block(section); 573 if (mem) { 574 mem->section_count++; 575 kobject_put(&mem->sysdev.kobj); 576 } else 577 ret = init_memory_block(&mem, section, state); 578 579 if (!ret) { 580 if (context == HOTPLUG && 581 mem->section_count == sections_per_block) 582 ret = register_mem_sect_under_node(mem, nid); 583 } 584 585 mutex_unlock(&mem_sysfs_mutex); 586 return ret; 587 } 588 589 int remove_memory_block(unsigned long node_id, struct mem_section *section, 590 int phys_device) 591 { 592 struct memory_block *mem; 593 594 mutex_lock(&mem_sysfs_mutex); 595 mem = find_memory_block(section); 596 unregister_mem_sect_under_nodes(mem, __section_nr(section)); 597 598 mem->section_count--; 599 if (mem->section_count == 0) { 600 mem_remove_simple_file(mem, phys_index); 601 mem_remove_simple_file(mem, end_phys_index); 602 mem_remove_simple_file(mem, state); 603 mem_remove_simple_file(mem, phys_device); 604 mem_remove_simple_file(mem, removable); 605 unregister_memory(mem); 606 kfree(mem); 607 } else 608 kobject_put(&mem->sysdev.kobj); 609 610 mutex_unlock(&mem_sysfs_mutex); 611 return 0; 612 } 613 614 /* 615 * need an interface for the VM to add new memory regions, 616 * but without onlining it. 617 */ 618 int register_new_memory(int nid, struct mem_section *section) 619 { 620 return add_memory_section(nid, section, MEM_OFFLINE, HOTPLUG); 621 } 622 623 int unregister_memory_section(struct mem_section *section) 624 { 625 if (!present_section(section)) 626 return -EINVAL; 627 628 return remove_memory_block(0, section, 0); 629 } 630 631 /* 632 * Initialize the sysfs support for memory devices... 633 */ 634 int __init memory_dev_init(void) 635 { 636 unsigned int i; 637 int ret; 638 int err; 639 unsigned long block_sz; 640 641 memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops; 642 ret = sysdev_class_register(&memory_sysdev_class); 643 if (ret) 644 goto out; 645 646 block_sz = get_memory_block_size(); 647 sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; 648 649 /* 650 * Create entries for memory sections that were found 651 * during boot and have been initialized 652 */ 653 for (i = 0; i < NR_MEM_SECTIONS; i++) { 654 if (!present_section_nr(i)) 655 continue; 656 err = add_memory_section(0, __nr_to_section(i), MEM_ONLINE, 657 BOOT); 658 if (!ret) 659 ret = err; 660 } 661 662 err = memory_probe_init(); 663 if (!ret) 664 ret = err; 665 err = memory_fail_init(); 666 if (!ret) 667 ret = err; 668 err = block_size_init(); 669 if (!ret) 670 ret = err; 671 out: 672 if (ret) 673 printk(KERN_ERR "%s() failed: %d\n", __func__, ret); 674 return ret; 675 } 676