1 /* 2 * drivers/base/memory.c - basic Memory class support 3 * 4 * Written by Matt Tolentino <matthew.e.tolentino@intel.com> 5 * Dave Hansen <haveblue@us.ibm.com> 6 * 7 * This file provides the necessary infrastructure to represent 8 * a SPARSEMEM-memory-model system's physical memory in /sysfs. 9 * All arch-independent code that assumes MEMORY_HOTPLUG requires 10 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c. 11 */ 12 13 #include <linux/sysdev.h> 14 #include <linux/module.h> 15 #include <linux/init.h> 16 #include <linux/topology.h> 17 #include <linux/capability.h> 18 #include <linux/device.h> 19 #include <linux/memory.h> 20 #include <linux/kobject.h> 21 #include <linux/memory_hotplug.h> 22 #include <linux/mm.h> 23 #include <linux/mutex.h> 24 #include <linux/stat.h> 25 #include <linux/slab.h> 26 27 #include <asm/atomic.h> 28 #include <asm/uaccess.h> 29 30 static DEFINE_MUTEX(mem_sysfs_mutex); 31 32 #define MEMORY_CLASS_NAME "memory" 33 #define MIN_MEMORY_BLOCK_SIZE (1 << SECTION_SIZE_BITS) 34 35 static int sections_per_block; 36 37 static inline int base_memory_block_id(int section_nr) 38 { 39 return section_nr / sections_per_block; 40 } 41 42 static struct sysdev_class memory_sysdev_class = { 43 .name = MEMORY_CLASS_NAME, 44 }; 45 46 static const char *memory_uevent_name(struct kset *kset, struct kobject *kobj) 47 { 48 return MEMORY_CLASS_NAME; 49 } 50 51 static int memory_uevent(struct kset *kset, struct kobject *obj, struct kobj_uevent_env *env) 52 { 53 int retval = 0; 54 55 return retval; 56 } 57 58 static const struct kset_uevent_ops memory_uevent_ops = { 59 .name = memory_uevent_name, 60 .uevent = memory_uevent, 61 }; 62 63 static BLOCKING_NOTIFIER_HEAD(memory_chain); 64 65 int register_memory_notifier(struct notifier_block *nb) 66 { 67 return blocking_notifier_chain_register(&memory_chain, nb); 68 } 69 EXPORT_SYMBOL(register_memory_notifier); 70 71 void unregister_memory_notifier(struct notifier_block *nb) 72 { 73 blocking_notifier_chain_unregister(&memory_chain, nb); 74 } 75 EXPORT_SYMBOL(unregister_memory_notifier); 76 77 static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain); 78 79 int register_memory_isolate_notifier(struct notifier_block *nb) 80 { 81 return atomic_notifier_chain_register(&memory_isolate_chain, nb); 82 } 83 EXPORT_SYMBOL(register_memory_isolate_notifier); 84 85 void unregister_memory_isolate_notifier(struct notifier_block *nb) 86 { 87 atomic_notifier_chain_unregister(&memory_isolate_chain, nb); 88 } 89 EXPORT_SYMBOL(unregister_memory_isolate_notifier); 90 91 /* 92 * register_memory - Setup a sysfs device for a memory block 93 */ 94 static 95 int register_memory(struct memory_block *memory) 96 { 97 int error; 98 99 memory->sysdev.cls = &memory_sysdev_class; 100 memory->sysdev.id = memory->start_section_nr / sections_per_block; 101 102 error = sysdev_register(&memory->sysdev); 103 return error; 104 } 105 106 static void 107 unregister_memory(struct memory_block *memory) 108 { 109 BUG_ON(memory->sysdev.cls != &memory_sysdev_class); 110 111 /* drop the ref. we got in remove_memory_block() */ 112 kobject_put(&memory->sysdev.kobj); 113 sysdev_unregister(&memory->sysdev); 114 } 115 116 unsigned long __weak memory_block_size_bytes(void) 117 { 118 return MIN_MEMORY_BLOCK_SIZE; 119 } 120 121 static unsigned long get_memory_block_size(void) 122 { 123 unsigned long block_sz; 124 125 block_sz = memory_block_size_bytes(); 126 127 /* Validate blk_sz is a power of 2 and not less than section size */ 128 if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) { 129 WARN_ON(1); 130 block_sz = MIN_MEMORY_BLOCK_SIZE; 131 } 132 133 return block_sz; 134 } 135 136 /* 137 * use this as the physical section index that this memsection 138 * uses. 139 */ 140 141 static ssize_t show_mem_start_phys_index(struct sys_device *dev, 142 struct sysdev_attribute *attr, char *buf) 143 { 144 struct memory_block *mem = 145 container_of(dev, struct memory_block, sysdev); 146 unsigned long phys_index; 147 148 phys_index = mem->start_section_nr / sections_per_block; 149 return sprintf(buf, "%08lx\n", phys_index); 150 } 151 152 static ssize_t show_mem_end_phys_index(struct sys_device *dev, 153 struct sysdev_attribute *attr, char *buf) 154 { 155 struct memory_block *mem = 156 container_of(dev, struct memory_block, sysdev); 157 unsigned long phys_index; 158 159 phys_index = mem->end_section_nr / sections_per_block; 160 return sprintf(buf, "%08lx\n", phys_index); 161 } 162 163 /* 164 * Show whether the section of memory is likely to be hot-removable 165 */ 166 static ssize_t show_mem_removable(struct sys_device *dev, 167 struct sysdev_attribute *attr, char *buf) 168 { 169 unsigned long i, pfn; 170 int ret = 1; 171 struct memory_block *mem = 172 container_of(dev, struct memory_block, sysdev); 173 174 for (i = 0; i < sections_per_block; i++) { 175 pfn = section_nr_to_pfn(mem->start_section_nr + i); 176 ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION); 177 } 178 179 return sprintf(buf, "%d\n", ret); 180 } 181 182 /* 183 * online, offline, going offline, etc. 184 */ 185 static ssize_t show_mem_state(struct sys_device *dev, 186 struct sysdev_attribute *attr, char *buf) 187 { 188 struct memory_block *mem = 189 container_of(dev, struct memory_block, sysdev); 190 ssize_t len = 0; 191 192 /* 193 * We can probably put these states in a nice little array 194 * so that they're not open-coded 195 */ 196 switch (mem->state) { 197 case MEM_ONLINE: 198 len = sprintf(buf, "online\n"); 199 break; 200 case MEM_OFFLINE: 201 len = sprintf(buf, "offline\n"); 202 break; 203 case MEM_GOING_OFFLINE: 204 len = sprintf(buf, "going-offline\n"); 205 break; 206 default: 207 len = sprintf(buf, "ERROR-UNKNOWN-%ld\n", 208 mem->state); 209 WARN_ON(1); 210 break; 211 } 212 213 return len; 214 } 215 216 int memory_notify(unsigned long val, void *v) 217 { 218 return blocking_notifier_call_chain(&memory_chain, val, v); 219 } 220 221 int memory_isolate_notify(unsigned long val, void *v) 222 { 223 return atomic_notifier_call_chain(&memory_isolate_chain, val, v); 224 } 225 226 /* 227 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is 228 * OK to have direct references to sparsemem variables in here. 229 */ 230 static int 231 memory_section_action(unsigned long phys_index, unsigned long action) 232 { 233 int i; 234 unsigned long start_pfn, start_paddr; 235 struct page *first_page; 236 int ret; 237 238 first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT); 239 240 /* 241 * The probe routines leave the pages reserved, just 242 * as the bootmem code does. Make sure they're still 243 * that way. 244 */ 245 if (action == MEM_ONLINE) { 246 for (i = 0; i < PAGES_PER_SECTION; i++) { 247 if (PageReserved(first_page+i)) 248 continue; 249 250 printk(KERN_WARNING "section number %ld page number %d " 251 "not reserved, was it already online?\n", 252 phys_index, i); 253 return -EBUSY; 254 } 255 } 256 257 switch (action) { 258 case MEM_ONLINE: 259 start_pfn = page_to_pfn(first_page); 260 ret = online_pages(start_pfn, PAGES_PER_SECTION); 261 break; 262 case MEM_OFFLINE: 263 start_paddr = page_to_pfn(first_page) << PAGE_SHIFT; 264 ret = remove_memory(start_paddr, 265 PAGES_PER_SECTION << PAGE_SHIFT); 266 break; 267 default: 268 WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: " 269 "%ld\n", __func__, phys_index, action, action); 270 ret = -EINVAL; 271 } 272 273 return ret; 274 } 275 276 static int memory_block_change_state(struct memory_block *mem, 277 unsigned long to_state, unsigned long from_state_req) 278 { 279 int i, ret = 0; 280 281 mutex_lock(&mem->state_mutex); 282 283 if (mem->state != from_state_req) { 284 ret = -EINVAL; 285 goto out; 286 } 287 288 if (to_state == MEM_OFFLINE) 289 mem->state = MEM_GOING_OFFLINE; 290 291 for (i = 0; i < sections_per_block; i++) { 292 ret = memory_section_action(mem->start_section_nr + i, 293 to_state); 294 if (ret) 295 break; 296 } 297 298 if (ret) { 299 for (i = 0; i < sections_per_block; i++) 300 memory_section_action(mem->start_section_nr + i, 301 from_state_req); 302 303 mem->state = from_state_req; 304 } else 305 mem->state = to_state; 306 307 out: 308 mutex_unlock(&mem->state_mutex); 309 return ret; 310 } 311 312 static ssize_t 313 store_mem_state(struct sys_device *dev, 314 struct sysdev_attribute *attr, const char *buf, size_t count) 315 { 316 struct memory_block *mem; 317 int ret = -EINVAL; 318 319 mem = container_of(dev, struct memory_block, sysdev); 320 321 if (!strncmp(buf, "online", min((int)count, 6))) 322 ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); 323 else if(!strncmp(buf, "offline", min((int)count, 7))) 324 ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); 325 326 if (ret) 327 return ret; 328 return count; 329 } 330 331 /* 332 * phys_device is a bad name for this. What I really want 333 * is a way to differentiate between memory ranges that 334 * are part of physical devices that constitute 335 * a complete removable unit or fru. 336 * i.e. do these ranges belong to the same physical device, 337 * s.t. if I offline all of these sections I can then 338 * remove the physical device? 339 */ 340 static ssize_t show_phys_device(struct sys_device *dev, 341 struct sysdev_attribute *attr, char *buf) 342 { 343 struct memory_block *mem = 344 container_of(dev, struct memory_block, sysdev); 345 return sprintf(buf, "%d\n", mem->phys_device); 346 } 347 348 static SYSDEV_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL); 349 static SYSDEV_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL); 350 static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state); 351 static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL); 352 static SYSDEV_ATTR(removable, 0444, show_mem_removable, NULL); 353 354 #define mem_create_simple_file(mem, attr_name) \ 355 sysdev_create_file(&mem->sysdev, &attr_##attr_name) 356 #define mem_remove_simple_file(mem, attr_name) \ 357 sysdev_remove_file(&mem->sysdev, &attr_##attr_name) 358 359 /* 360 * Block size attribute stuff 361 */ 362 static ssize_t 363 print_block_size(struct sysdev_class *class, struct sysdev_class_attribute *attr, 364 char *buf) 365 { 366 return sprintf(buf, "%lx\n", get_memory_block_size()); 367 } 368 369 static SYSDEV_CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL); 370 371 static int block_size_init(void) 372 { 373 return sysfs_create_file(&memory_sysdev_class.kset.kobj, 374 &attr_block_size_bytes.attr); 375 } 376 377 /* 378 * Some architectures will have custom drivers to do this, and 379 * will not need to do it from userspace. The fake hot-add code 380 * as well as ppc64 will do all of their discovery in userspace 381 * and will require this interface. 382 */ 383 #ifdef CONFIG_ARCH_MEMORY_PROBE 384 static ssize_t 385 memory_probe_store(struct class *class, struct class_attribute *attr, 386 const char *buf, size_t count) 387 { 388 u64 phys_addr; 389 int nid; 390 int i, ret; 391 392 phys_addr = simple_strtoull(buf, NULL, 0); 393 394 for (i = 0; i < sections_per_block; i++) { 395 nid = memory_add_physaddr_to_nid(phys_addr); 396 ret = add_memory(nid, phys_addr, 397 PAGES_PER_SECTION << PAGE_SHIFT); 398 if (ret) 399 break; 400 401 phys_addr += MIN_MEMORY_BLOCK_SIZE; 402 } 403 404 if (ret) 405 count = ret; 406 407 return count; 408 } 409 static CLASS_ATTR(probe, S_IWUSR, NULL, memory_probe_store); 410 411 static int memory_probe_init(void) 412 { 413 return sysfs_create_file(&memory_sysdev_class.kset.kobj, 414 &class_attr_probe.attr); 415 } 416 #else 417 static inline int memory_probe_init(void) 418 { 419 return 0; 420 } 421 #endif 422 423 #ifdef CONFIG_MEMORY_FAILURE 424 /* 425 * Support for offlining pages of memory 426 */ 427 428 /* Soft offline a page */ 429 static ssize_t 430 store_soft_offline_page(struct class *class, 431 struct class_attribute *attr, 432 const char *buf, size_t count) 433 { 434 int ret; 435 u64 pfn; 436 if (!capable(CAP_SYS_ADMIN)) 437 return -EPERM; 438 if (strict_strtoull(buf, 0, &pfn) < 0) 439 return -EINVAL; 440 pfn >>= PAGE_SHIFT; 441 if (!pfn_valid(pfn)) 442 return -ENXIO; 443 ret = soft_offline_page(pfn_to_page(pfn), 0); 444 return ret == 0 ? count : ret; 445 } 446 447 /* Forcibly offline a page, including killing processes. */ 448 static ssize_t 449 store_hard_offline_page(struct class *class, 450 struct class_attribute *attr, 451 const char *buf, size_t count) 452 { 453 int ret; 454 u64 pfn; 455 if (!capable(CAP_SYS_ADMIN)) 456 return -EPERM; 457 if (strict_strtoull(buf, 0, &pfn) < 0) 458 return -EINVAL; 459 pfn >>= PAGE_SHIFT; 460 ret = __memory_failure(pfn, 0, 0); 461 return ret ? ret : count; 462 } 463 464 static CLASS_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page); 465 static CLASS_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page); 466 467 static __init int memory_fail_init(void) 468 { 469 int err; 470 471 err = sysfs_create_file(&memory_sysdev_class.kset.kobj, 472 &class_attr_soft_offline_page.attr); 473 if (!err) 474 err = sysfs_create_file(&memory_sysdev_class.kset.kobj, 475 &class_attr_hard_offline_page.attr); 476 return err; 477 } 478 #else 479 static inline int memory_fail_init(void) 480 { 481 return 0; 482 } 483 #endif 484 485 /* 486 * Note that phys_device is optional. It is here to allow for 487 * differentiation between which *physical* devices each 488 * section belongs to... 489 */ 490 int __weak arch_get_memory_phys_device(unsigned long start_pfn) 491 { 492 return 0; 493 } 494 495 struct memory_block *find_memory_block_hinted(struct mem_section *section, 496 struct memory_block *hint) 497 { 498 struct kobject *kobj; 499 struct sys_device *sysdev; 500 struct memory_block *mem; 501 char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1]; 502 int block_id = base_memory_block_id(__section_nr(section)); 503 504 kobj = hint ? &hint->sysdev.kobj : NULL; 505 506 /* 507 * This only works because we know that section == sysdev->id 508 * slightly redundant with sysdev_register() 509 */ 510 sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, block_id); 511 512 kobj = kset_find_obj_hinted(&memory_sysdev_class.kset, name, kobj); 513 if (!kobj) 514 return NULL; 515 516 sysdev = container_of(kobj, struct sys_device, kobj); 517 mem = container_of(sysdev, struct memory_block, sysdev); 518 519 return mem; 520 } 521 522 /* 523 * For now, we have a linear search to go find the appropriate 524 * memory_block corresponding to a particular phys_index. If 525 * this gets to be a real problem, we can always use a radix 526 * tree or something here. 527 * 528 * This could be made generic for all sysdev classes. 529 */ 530 struct memory_block *find_memory_block(struct mem_section *section) 531 { 532 return find_memory_block_hinted(section, NULL); 533 } 534 535 static int init_memory_block(struct memory_block **memory, 536 struct mem_section *section, unsigned long state) 537 { 538 struct memory_block *mem; 539 unsigned long start_pfn; 540 int scn_nr; 541 int ret = 0; 542 543 mem = kzalloc(sizeof(*mem), GFP_KERNEL); 544 if (!mem) 545 return -ENOMEM; 546 547 scn_nr = __section_nr(section); 548 mem->start_section_nr = 549 base_memory_block_id(scn_nr) * sections_per_block; 550 mem->end_section_nr = mem->start_section_nr + sections_per_block - 1; 551 mem->state = state; 552 mem->section_count++; 553 mutex_init(&mem->state_mutex); 554 start_pfn = section_nr_to_pfn(mem->start_section_nr); 555 mem->phys_device = arch_get_memory_phys_device(start_pfn); 556 557 ret = register_memory(mem); 558 if (!ret) 559 ret = mem_create_simple_file(mem, phys_index); 560 if (!ret) 561 ret = mem_create_simple_file(mem, end_phys_index); 562 if (!ret) 563 ret = mem_create_simple_file(mem, state); 564 if (!ret) 565 ret = mem_create_simple_file(mem, phys_device); 566 if (!ret) 567 ret = mem_create_simple_file(mem, removable); 568 569 *memory = mem; 570 return ret; 571 } 572 573 static int add_memory_section(int nid, struct mem_section *section, 574 unsigned long state, enum mem_add_context context) 575 { 576 struct memory_block *mem; 577 int ret = 0; 578 579 mutex_lock(&mem_sysfs_mutex); 580 581 mem = find_memory_block(section); 582 if (mem) { 583 mem->section_count++; 584 kobject_put(&mem->sysdev.kobj); 585 } else 586 ret = init_memory_block(&mem, section, state); 587 588 if (!ret) { 589 if (context == HOTPLUG && 590 mem->section_count == sections_per_block) 591 ret = register_mem_sect_under_node(mem, nid); 592 } 593 594 mutex_unlock(&mem_sysfs_mutex); 595 return ret; 596 } 597 598 int remove_memory_block(unsigned long node_id, struct mem_section *section, 599 int phys_device) 600 { 601 struct memory_block *mem; 602 603 mutex_lock(&mem_sysfs_mutex); 604 mem = find_memory_block(section); 605 unregister_mem_sect_under_nodes(mem, __section_nr(section)); 606 607 mem->section_count--; 608 if (mem->section_count == 0) { 609 mem_remove_simple_file(mem, phys_index); 610 mem_remove_simple_file(mem, end_phys_index); 611 mem_remove_simple_file(mem, state); 612 mem_remove_simple_file(mem, phys_device); 613 mem_remove_simple_file(mem, removable); 614 unregister_memory(mem); 615 kfree(mem); 616 } else 617 kobject_put(&mem->sysdev.kobj); 618 619 mutex_unlock(&mem_sysfs_mutex); 620 return 0; 621 } 622 623 /* 624 * need an interface for the VM to add new memory regions, 625 * but without onlining it. 626 */ 627 int register_new_memory(int nid, struct mem_section *section) 628 { 629 return add_memory_section(nid, section, MEM_OFFLINE, HOTPLUG); 630 } 631 632 int unregister_memory_section(struct mem_section *section) 633 { 634 if (!present_section(section)) 635 return -EINVAL; 636 637 return remove_memory_block(0, section, 0); 638 } 639 640 /* 641 * Initialize the sysfs support for memory devices... 642 */ 643 int __init memory_dev_init(void) 644 { 645 unsigned int i; 646 int ret; 647 int err; 648 unsigned long block_sz; 649 650 memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops; 651 ret = sysdev_class_register(&memory_sysdev_class); 652 if (ret) 653 goto out; 654 655 block_sz = get_memory_block_size(); 656 sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; 657 658 /* 659 * Create entries for memory sections that were found 660 * during boot and have been initialized 661 */ 662 for (i = 0; i < NR_MEM_SECTIONS; i++) { 663 if (!present_section_nr(i)) 664 continue; 665 err = add_memory_section(0, __nr_to_section(i), MEM_ONLINE, 666 BOOT); 667 if (!ret) 668 ret = err; 669 } 670 671 err = memory_probe_init(); 672 if (!ret) 673 ret = err; 674 err = memory_fail_init(); 675 if (!ret) 676 ret = err; 677 err = block_size_init(); 678 if (!ret) 679 ret = err; 680 out: 681 if (ret) 682 printk(KERN_ERR "%s() failed: %d\n", __func__, ret); 683 return ret; 684 } 685