1 /* 2 * drivers/base/memory.c - basic Memory class support 3 * 4 * Written by Matt Tolentino <matthew.e.tolentino@intel.com> 5 * Dave Hansen <haveblue@us.ibm.com> 6 * 7 * This file provides the necessary infrastructure to represent 8 * a SPARSEMEM-memory-model system's physical memory in /sysfs. 9 * All arch-independent code that assumes MEMORY_HOTPLUG requires 10 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c. 11 */ 12 13 #include <linux/sysdev.h> 14 #include <linux/module.h> 15 #include <linux/init.h> 16 #include <linux/topology.h> 17 #include <linux/capability.h> 18 #include <linux/device.h> 19 #include <linux/memory.h> 20 #include <linux/kobject.h> 21 #include <linux/memory_hotplug.h> 22 #include <linux/mm.h> 23 #include <linux/mutex.h> 24 #include <linux/stat.h> 25 #include <linux/slab.h> 26 27 #include <asm/atomic.h> 28 #include <asm/uaccess.h> 29 30 static DEFINE_MUTEX(mem_sysfs_mutex); 31 32 #define MEMORY_CLASS_NAME "memory" 33 34 static struct sysdev_class memory_sysdev_class = { 35 .name = MEMORY_CLASS_NAME, 36 }; 37 38 static const char *memory_uevent_name(struct kset *kset, struct kobject *kobj) 39 { 40 return MEMORY_CLASS_NAME; 41 } 42 43 static int memory_uevent(struct kset *kset, struct kobject *obj, struct kobj_uevent_env *env) 44 { 45 int retval = 0; 46 47 return retval; 48 } 49 50 static const struct kset_uevent_ops memory_uevent_ops = { 51 .name = memory_uevent_name, 52 .uevent = memory_uevent, 53 }; 54 55 static BLOCKING_NOTIFIER_HEAD(memory_chain); 56 57 int register_memory_notifier(struct notifier_block *nb) 58 { 59 return blocking_notifier_chain_register(&memory_chain, nb); 60 } 61 EXPORT_SYMBOL(register_memory_notifier); 62 63 void unregister_memory_notifier(struct notifier_block *nb) 64 { 65 blocking_notifier_chain_unregister(&memory_chain, nb); 66 } 67 EXPORT_SYMBOL(unregister_memory_notifier); 68 69 static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain); 70 71 int register_memory_isolate_notifier(struct notifier_block *nb) 72 { 73 return atomic_notifier_chain_register(&memory_isolate_chain, nb); 74 } 75 EXPORT_SYMBOL(register_memory_isolate_notifier); 76 77 void unregister_memory_isolate_notifier(struct notifier_block *nb) 78 { 79 atomic_notifier_chain_unregister(&memory_isolate_chain, nb); 80 } 81 EXPORT_SYMBOL(unregister_memory_isolate_notifier); 82 83 /* 84 * register_memory - Setup a sysfs device for a memory block 85 */ 86 static 87 int register_memory(struct memory_block *memory, struct mem_section *section) 88 { 89 int error; 90 91 memory->sysdev.cls = &memory_sysdev_class; 92 memory->sysdev.id = __section_nr(section); 93 94 error = sysdev_register(&memory->sysdev); 95 return error; 96 } 97 98 static void 99 unregister_memory(struct memory_block *memory, struct mem_section *section) 100 { 101 BUG_ON(memory->sysdev.cls != &memory_sysdev_class); 102 BUG_ON(memory->sysdev.id != __section_nr(section)); 103 104 /* drop the ref. we got in remove_memory_block() */ 105 kobject_put(&memory->sysdev.kobj); 106 sysdev_unregister(&memory->sysdev); 107 } 108 109 /* 110 * use this as the physical section index that this memsection 111 * uses. 112 */ 113 114 static ssize_t show_mem_phys_index(struct sys_device *dev, 115 struct sysdev_attribute *attr, char *buf) 116 { 117 struct memory_block *mem = 118 container_of(dev, struct memory_block, sysdev); 119 return sprintf(buf, "%08lx\n", mem->phys_index); 120 } 121 122 /* 123 * Show whether the section of memory is likely to be hot-removable 124 */ 125 static ssize_t show_mem_removable(struct sys_device *dev, 126 struct sysdev_attribute *attr, char *buf) 127 { 128 unsigned long start_pfn; 129 int ret; 130 struct memory_block *mem = 131 container_of(dev, struct memory_block, sysdev); 132 133 start_pfn = section_nr_to_pfn(mem->phys_index); 134 ret = is_mem_section_removable(start_pfn, PAGES_PER_SECTION); 135 return sprintf(buf, "%d\n", ret); 136 } 137 138 /* 139 * online, offline, going offline, etc. 140 */ 141 static ssize_t show_mem_state(struct sys_device *dev, 142 struct sysdev_attribute *attr, char *buf) 143 { 144 struct memory_block *mem = 145 container_of(dev, struct memory_block, sysdev); 146 ssize_t len = 0; 147 148 /* 149 * We can probably put these states in a nice little array 150 * so that they're not open-coded 151 */ 152 switch (mem->state) { 153 case MEM_ONLINE: 154 len = sprintf(buf, "online\n"); 155 break; 156 case MEM_OFFLINE: 157 len = sprintf(buf, "offline\n"); 158 break; 159 case MEM_GOING_OFFLINE: 160 len = sprintf(buf, "going-offline\n"); 161 break; 162 default: 163 len = sprintf(buf, "ERROR-UNKNOWN-%ld\n", 164 mem->state); 165 WARN_ON(1); 166 break; 167 } 168 169 return len; 170 } 171 172 int memory_notify(unsigned long val, void *v) 173 { 174 return blocking_notifier_call_chain(&memory_chain, val, v); 175 } 176 177 int memory_isolate_notify(unsigned long val, void *v) 178 { 179 return atomic_notifier_call_chain(&memory_isolate_chain, val, v); 180 } 181 182 /* 183 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is 184 * OK to have direct references to sparsemem variables in here. 185 */ 186 static int 187 memory_block_action(struct memory_block *mem, unsigned long action) 188 { 189 int i; 190 unsigned long psection; 191 unsigned long start_pfn, start_paddr; 192 struct page *first_page; 193 int ret; 194 int old_state = mem->state; 195 196 psection = mem->phys_index; 197 first_page = pfn_to_page(psection << PFN_SECTION_SHIFT); 198 199 /* 200 * The probe routines leave the pages reserved, just 201 * as the bootmem code does. Make sure they're still 202 * that way. 203 */ 204 if (action == MEM_ONLINE) { 205 for (i = 0; i < PAGES_PER_SECTION; i++) { 206 if (PageReserved(first_page+i)) 207 continue; 208 209 printk(KERN_WARNING "section number %ld page number %d " 210 "not reserved, was it already online? \n", 211 psection, i); 212 return -EBUSY; 213 } 214 } 215 216 switch (action) { 217 case MEM_ONLINE: 218 start_pfn = page_to_pfn(first_page); 219 ret = online_pages(start_pfn, PAGES_PER_SECTION); 220 break; 221 case MEM_OFFLINE: 222 mem->state = MEM_GOING_OFFLINE; 223 start_paddr = page_to_pfn(first_page) << PAGE_SHIFT; 224 ret = remove_memory(start_paddr, 225 PAGES_PER_SECTION << PAGE_SHIFT); 226 if (ret) { 227 mem->state = old_state; 228 break; 229 } 230 break; 231 default: 232 WARN(1, KERN_WARNING "%s(%p, %ld) unknown action: %ld\n", 233 __func__, mem, action, action); 234 ret = -EINVAL; 235 } 236 237 return ret; 238 } 239 240 static int memory_block_change_state(struct memory_block *mem, 241 unsigned long to_state, unsigned long from_state_req) 242 { 243 int ret = 0; 244 mutex_lock(&mem->state_mutex); 245 246 if (mem->state != from_state_req) { 247 ret = -EINVAL; 248 goto out; 249 } 250 251 ret = memory_block_action(mem, to_state); 252 if (!ret) 253 mem->state = to_state; 254 255 out: 256 mutex_unlock(&mem->state_mutex); 257 return ret; 258 } 259 260 static ssize_t 261 store_mem_state(struct sys_device *dev, 262 struct sysdev_attribute *attr, const char *buf, size_t count) 263 { 264 struct memory_block *mem; 265 unsigned int phys_section_nr; 266 int ret = -EINVAL; 267 268 mem = container_of(dev, struct memory_block, sysdev); 269 phys_section_nr = mem->phys_index; 270 271 if (!present_section_nr(phys_section_nr)) 272 goto out; 273 274 if (!strncmp(buf, "online", min((int)count, 6))) 275 ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); 276 else if(!strncmp(buf, "offline", min((int)count, 7))) 277 ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); 278 out: 279 if (ret) 280 return ret; 281 return count; 282 } 283 284 /* 285 * phys_device is a bad name for this. What I really want 286 * is a way to differentiate between memory ranges that 287 * are part of physical devices that constitute 288 * a complete removable unit or fru. 289 * i.e. do these ranges belong to the same physical device, 290 * s.t. if I offline all of these sections I can then 291 * remove the physical device? 292 */ 293 static ssize_t show_phys_device(struct sys_device *dev, 294 struct sysdev_attribute *attr, char *buf) 295 { 296 struct memory_block *mem = 297 container_of(dev, struct memory_block, sysdev); 298 return sprintf(buf, "%d\n", mem->phys_device); 299 } 300 301 static SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL); 302 static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state); 303 static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL); 304 static SYSDEV_ATTR(removable, 0444, show_mem_removable, NULL); 305 306 #define mem_create_simple_file(mem, attr_name) \ 307 sysdev_create_file(&mem->sysdev, &attr_##attr_name) 308 #define mem_remove_simple_file(mem, attr_name) \ 309 sysdev_remove_file(&mem->sysdev, &attr_##attr_name) 310 311 /* 312 * Block size attribute stuff 313 */ 314 static ssize_t 315 print_block_size(struct sysdev_class *class, struct sysdev_class_attribute *attr, 316 char *buf) 317 { 318 return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE); 319 } 320 321 static SYSDEV_CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL); 322 323 static int block_size_init(void) 324 { 325 return sysfs_create_file(&memory_sysdev_class.kset.kobj, 326 &attr_block_size_bytes.attr); 327 } 328 329 /* 330 * Some architectures will have custom drivers to do this, and 331 * will not need to do it from userspace. The fake hot-add code 332 * as well as ppc64 will do all of their discovery in userspace 333 * and will require this interface. 334 */ 335 #ifdef CONFIG_ARCH_MEMORY_PROBE 336 static ssize_t 337 memory_probe_store(struct class *class, struct class_attribute *attr, 338 const char *buf, size_t count) 339 { 340 u64 phys_addr; 341 int nid; 342 int ret; 343 344 phys_addr = simple_strtoull(buf, NULL, 0); 345 346 nid = memory_add_physaddr_to_nid(phys_addr); 347 ret = add_memory(nid, phys_addr, PAGES_PER_SECTION << PAGE_SHIFT); 348 349 if (ret) 350 count = ret; 351 352 return count; 353 } 354 static CLASS_ATTR(probe, S_IWUSR, NULL, memory_probe_store); 355 356 static int memory_probe_init(void) 357 { 358 return sysfs_create_file(&memory_sysdev_class.kset.kobj, 359 &class_attr_probe.attr); 360 } 361 #else 362 static inline int memory_probe_init(void) 363 { 364 return 0; 365 } 366 #endif 367 368 #ifdef CONFIG_MEMORY_FAILURE 369 /* 370 * Support for offlining pages of memory 371 */ 372 373 /* Soft offline a page */ 374 static ssize_t 375 store_soft_offline_page(struct class *class, 376 struct class_attribute *attr, 377 const char *buf, size_t count) 378 { 379 int ret; 380 u64 pfn; 381 if (!capable(CAP_SYS_ADMIN)) 382 return -EPERM; 383 if (strict_strtoull(buf, 0, &pfn) < 0) 384 return -EINVAL; 385 pfn >>= PAGE_SHIFT; 386 if (!pfn_valid(pfn)) 387 return -ENXIO; 388 ret = soft_offline_page(pfn_to_page(pfn), 0); 389 return ret == 0 ? count : ret; 390 } 391 392 /* Forcibly offline a page, including killing processes. */ 393 static ssize_t 394 store_hard_offline_page(struct class *class, 395 struct class_attribute *attr, 396 const char *buf, size_t count) 397 { 398 int ret; 399 u64 pfn; 400 if (!capable(CAP_SYS_ADMIN)) 401 return -EPERM; 402 if (strict_strtoull(buf, 0, &pfn) < 0) 403 return -EINVAL; 404 pfn >>= PAGE_SHIFT; 405 ret = __memory_failure(pfn, 0, 0); 406 return ret ? ret : count; 407 } 408 409 static CLASS_ATTR(soft_offline_page, 0644, NULL, store_soft_offline_page); 410 static CLASS_ATTR(hard_offline_page, 0644, NULL, store_hard_offline_page); 411 412 static __init int memory_fail_init(void) 413 { 414 int err; 415 416 err = sysfs_create_file(&memory_sysdev_class.kset.kobj, 417 &class_attr_soft_offline_page.attr); 418 if (!err) 419 err = sysfs_create_file(&memory_sysdev_class.kset.kobj, 420 &class_attr_hard_offline_page.attr); 421 return err; 422 } 423 #else 424 static inline int memory_fail_init(void) 425 { 426 return 0; 427 } 428 #endif 429 430 /* 431 * Note that phys_device is optional. It is here to allow for 432 * differentiation between which *physical* devices each 433 * section belongs to... 434 */ 435 int __weak arch_get_memory_phys_device(unsigned long start_pfn) 436 { 437 return 0; 438 } 439 440 struct memory_block *find_memory_block_hinted(struct mem_section *section, 441 struct memory_block *hint) 442 { 443 struct kobject *kobj; 444 struct sys_device *sysdev; 445 struct memory_block *mem; 446 char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1]; 447 448 kobj = hint ? &hint->sysdev.kobj : NULL; 449 450 /* 451 * This only works because we know that section == sysdev->id 452 * slightly redundant with sysdev_register() 453 */ 454 sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section)); 455 456 kobj = kset_find_obj_hinted(&memory_sysdev_class.kset, name, kobj); 457 if (!kobj) 458 return NULL; 459 460 sysdev = container_of(kobj, struct sys_device, kobj); 461 mem = container_of(sysdev, struct memory_block, sysdev); 462 463 return mem; 464 } 465 466 /* 467 * For now, we have a linear search to go find the appropriate 468 * memory_block corresponding to a particular phys_index. If 469 * this gets to be a real problem, we can always use a radix 470 * tree or something here. 471 * 472 * This could be made generic for all sysdev classes. 473 */ 474 struct memory_block *find_memory_block(struct mem_section *section) 475 { 476 return find_memory_block_hinted(section, NULL); 477 } 478 479 static int add_memory_block(int nid, struct mem_section *section, 480 unsigned long state, enum mem_add_context context) 481 { 482 struct memory_block *mem = kzalloc(sizeof(*mem), GFP_KERNEL); 483 unsigned long start_pfn; 484 int ret = 0; 485 486 if (!mem) 487 return -ENOMEM; 488 489 mutex_lock(&mem_sysfs_mutex); 490 491 mem->phys_index = __section_nr(section); 492 mem->state = state; 493 mem->section_count++; 494 mutex_init(&mem->state_mutex); 495 start_pfn = section_nr_to_pfn(mem->phys_index); 496 mem->phys_device = arch_get_memory_phys_device(start_pfn); 497 498 ret = register_memory(mem, section); 499 if (!ret) 500 ret = mem_create_simple_file(mem, phys_index); 501 if (!ret) 502 ret = mem_create_simple_file(mem, state); 503 if (!ret) 504 ret = mem_create_simple_file(mem, phys_device); 505 if (!ret) 506 ret = mem_create_simple_file(mem, removable); 507 if (!ret) { 508 if (context == HOTPLUG) 509 ret = register_mem_sect_under_node(mem, nid); 510 } 511 512 mutex_unlock(&mem_sysfs_mutex); 513 return ret; 514 } 515 516 int remove_memory_block(unsigned long node_id, struct mem_section *section, 517 int phys_device) 518 { 519 struct memory_block *mem; 520 521 mutex_lock(&mem_sysfs_mutex); 522 mem = find_memory_block(section); 523 524 mem->section_count--; 525 if (mem->section_count == 0) { 526 unregister_mem_sect_under_nodes(mem); 527 mem_remove_simple_file(mem, phys_index); 528 mem_remove_simple_file(mem, state); 529 mem_remove_simple_file(mem, phys_device); 530 mem_remove_simple_file(mem, removable); 531 unregister_memory(mem, section); 532 } 533 534 mutex_unlock(&mem_sysfs_mutex); 535 return 0; 536 } 537 538 /* 539 * need an interface for the VM to add new memory regions, 540 * but without onlining it. 541 */ 542 int register_new_memory(int nid, struct mem_section *section) 543 { 544 return add_memory_block(nid, section, MEM_OFFLINE, HOTPLUG); 545 } 546 547 int unregister_memory_section(struct mem_section *section) 548 { 549 if (!present_section(section)) 550 return -EINVAL; 551 552 return remove_memory_block(0, section, 0); 553 } 554 555 /* 556 * Initialize the sysfs support for memory devices... 557 */ 558 int __init memory_dev_init(void) 559 { 560 unsigned int i; 561 int ret; 562 int err; 563 564 memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops; 565 ret = sysdev_class_register(&memory_sysdev_class); 566 if (ret) 567 goto out; 568 569 /* 570 * Create entries for memory sections that were found 571 * during boot and have been initialized 572 */ 573 for (i = 0; i < NR_MEM_SECTIONS; i++) { 574 if (!present_section_nr(i)) 575 continue; 576 err = add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 577 BOOT); 578 if (!ret) 579 ret = err; 580 } 581 582 err = memory_probe_init(); 583 if (!ret) 584 ret = err; 585 err = memory_fail_init(); 586 if (!ret) 587 ret = err; 588 err = block_size_init(); 589 if (!ret) 590 ret = err; 591 out: 592 if (ret) 593 printk(KERN_ERR "%s() failed: %d\n", __func__, ret); 594 return ret; 595 } 596