/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>

#include <asm/tlbflush.h>

#include "internal.h"

/*
 * online_page_callback contains a pointer to the current page onlining
 * function.  Initially it is generic_online_page().  If required, it can be
 * changed by calling set_online_page_callback() for callback registration
 * and restore_online_page_callback() to restore the generic callback.
 */

static void generic_online_page(struct page *page);

static online_page_callback_t online_page_callback = generic_online_page;

DEFINE_MUTEX(mem_hotplug_mutex);

void lock_memory_hotplug(void)
{
	mutex_lock(&mem_hotplug_mutex);

	/* for exclusive hibernation if CONFIG_HIBERNATION=y */
	lock_system_sleep();
}

void unlock_memory_hotplug(void)
{
	unlock_system_sleep();
	mutex_unlock(&mem_hotplug_mutex);
}


/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;
	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
	BUG_ON(!res);

	res->name = "System RAM";
	res->start = start;
	res->end = start + size - 1;
	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
	if (request_resource(&iomem_resource, res) < 0) {
		printk("System RAM resource %pR cannot be added\n", res);
		kfree(res);
		res = NULL;
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
	return;
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
void get_page_bootmem(unsigned long info, struct page *page,
		      unsigned long type)
{
	page->lru.next = (struct list_head *) type;
	SetPagePrivate(page);
	set_page_private(page, info);
	atomic_inc(&page->_count);
}

/* reference to __meminit __free_pages_bootmem is valid
 * so use __ref to tell modpost not to generate a warning */
void __ref put_page_bootmem(struct page *page)
{
	unsigned long type;
	static DEFINE_MUTEX(ppb_lock);

	type = (unsigned long) page->lru.next;
	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);

	if (atomic_dec_return(&page->_count) == 1) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		INIT_LIST_HEAD(&page->lru);

		/*
		 * Please refer to the comment for __free_pages_bootmem()
		 * for why we serialize here.
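		 *
		 * (put_page_bootmem() drops the reference taken by
		 * get_page_bootmem(); once the count falls back to 1 the
		 * bootmem info is cleared and the page is handed back to
		 * the buddy allocator via __free_pages_bootmem().)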
		 */
		mutex_lock(&ppb_lock);
		__free_pages_bootmem(page, 0);
		mutex_unlock(&ppb_lock);
		totalram_pages++;
	}

}

#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	/* Get section's memmap address */
	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	/*
	 * Get page for the memmap's phys address
	 * XXX: need more consideration for sparse_vmemmap...
	 */
	page = virt_to_page(memmap);
	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

	/* remember memmap's page */
	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, SECTION_INFO);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);

}
#else /* CONFIG_SPARSEMEM_VMEMMAP */
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	if (!pfn_valid(start_pfn))
		return;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

void register_page_bootmem_info_node(struct pglist_data *pgdat)
{
	unsigned long i, pfn, end_pfn, nr_pages;
	int node = pgdat->node_id;
	struct page *page;
	struct zone *zone;

	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
	page = virt_to_page(pgdat);

	for (i = 0; i < nr_pages; i++, page++)
		get_page_bootmem(node, page, NODE_INFO);

	zone = &pgdat->node_zones[0];
	for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
		if (zone->wait_table) {
			nr_pages = zone->wait_table_hash_nr_entries
				* sizeof(wait_queue_head_t);
			nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
			page = virt_to_page(zone->wait_table);

			for (i = 0; i < nr_pages; i++, page++)
				get_page_bootmem(node, page, NODE_INFO);
		}
	}

	pfn = pgdat->node_start_pfn;
	end_pfn = pgdat_end_pfn(pgdat);

	/* register section info */
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		/*
		 * Some platforms can assign the same pfn to multiple nodes - on
		 * node0 as well as nodeN.  To avoid registering a pfn against
		 * multiple nodes we check that this pfn does not already
		 * reside in some other node.
		 */
		if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
			register_page_bootmem_info_section(pfn);
	}
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */

static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
			   unsigned long end_pfn)
{
	unsigned long old_zone_end_pfn;

	zone_span_writelock(zone);

	old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
				zone->zone_start_pfn;

	zone_span_writeunlock(zone);
}

static void resize_zone(struct zone *zone, unsigned long start_pfn,
			unsigned long end_pfn)
{
	zone_span_writelock(zone);

	if (end_pfn - start_pfn) {
		zone->zone_start_pfn = start_pfn;
		zone->spanned_pages = end_pfn - start_pfn;
	} else {
		/*
		 * keep it consistent with free_area_init_core():
		 * if spanned_pages == 0, keep start_pfn == 0 as well
		 */
		zone->zone_start_pfn = 0;
		zone->spanned_pages = 0;
	}

	zone_span_writeunlock(zone);
}

static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
			unsigned long end_pfn)
{
	enum zone_type zid = zone_idx(zone);
	int nid = zone->zone_pgdat->node_id;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn++)
		set_page_links(pfn_to_page(pfn), zid, nid, pfn);
}

/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
 * alloc_bootmem_node_nopanic() */
static int __ref ensure_zone_is_initialized(struct zone *zone,
			unsigned long start_pfn, unsigned long num_pages)
{
	if (!zone_is_initialized(zone))
		return init_currently_empty_zone(zone, start_pfn, num_pages,
						 MEMMAP_HOTPLUG);
	return 0;
}

static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z1_start_pfn;

	ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are higher than @z2 */
	if (end_pfn > zone_end_pfn(z2))
		goto out_fail;
	/* the moved-out part must be at the leftmost of @z2 */
	if (start_pfn > z2->zone_start_pfn)
		goto out_fail;
	/* must be included in / overlap @z2 */
	if (end_pfn <= z2->zone_start_pfn)
		goto out_fail;

	/* use start_pfn for z1's start_pfn if z1 is empty */
	if (z1->spanned_pages)
		z1_start_pfn = z1->zone_start_pfn;
	else
		z1_start_pfn = start_pfn;

	resize_zone(z1, z1_start_pfn, end_pfn);
	resize_zone(z2, end_pfn, zone_end_pfn(z2));

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z1, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z2_end_pfn;

	ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are lower than @z1 */
	if (z1->zone_start_pfn > start_pfn)
		goto out_fail;
	/* the moved-out part must be at the rightmost of @z1 */
	if (zone_end_pfn(z1) > end_pfn)
		goto out_fail;
	/* must be included in / overlap @z1 */
	if (start_pfn >= zone_end_pfn(z1))
		goto out_fail;

	/* use end_pfn for z2's end_pfn if z2 is empty */
	if (z2->spanned_pages)
		z2_end_pfn = zone_end_pfn(z2);
	else
		z2_end_pfn = end_pfn;

	resize_zone(z1, z1->zone_start_pfn, start_pfn);
	resize_zone(z2, start_pfn, z2_end_pfn);

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z2, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
			    unsigned long end_pfn)
{
	unsigned long old_pgdat_end_pfn =
		pgdat->node_start_pfn + pgdat->node_spanned_pages;

	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
					pgdat->node_start_pfn;
}

static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int nid = pgdat->node_id;
	int zone_type;
	unsigned long flags;
	int ret;

	zone_type = zone - pgdat->node_zones;
	ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
	if (ret)
		return ret;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
			phys_start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
	memmap_init_zone(nr_pages, nid, zone_type,
			 phys_start_pfn, MEMMAP_HOTPLUG);
	return 0;
}

static int __meminit __add_section(int nid, struct zone *zone,
				   unsigned long phys_start_pfn)
{
	int nr_pages = PAGES_PER_SECTION;
	int ret;

	if (pfn_valid(phys_start_pfn))
		return -EEXIST;

	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);

	if (ret < 0)
		return ret;

	ret = __add_zone(zone, phys_start_pfn);

	if (ret < 0)
		return ret;

	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}

/*
 * Reasonably generic function for adding memory.  It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
			unsigned long nr_pages)
{
	unsigned long i;
	int err = 0;
	int start_sec, end_sec;
	/* while initializing the mem_map, align the hot-added range to sections */
	start_sec = pfn_to_section_nr(phys_start_pfn);
	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

	for (i = start_sec; i <= end_sec; i++) {
		err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);

		/*
		 * EEXIST is finally dealt with by the ioresource collision
		 * check; see add_memory() => register_memory_resource().
		 * A warning will be printed if there is a collision.
		 */
		if (err && (err != -EEXIST))
			break;
		err = 0;
	}

	return err;
}
EXPORT_SYMBOL_GPL(__add_pages);

#ifdef CONFIG_MEMORY_HOTREMOVE
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static int find_smallest_section_pfn(int nid, struct zone *zone,
				     unsigned long start_pfn,
				     unsigned long end_pfn)
{
	struct mem_section *ms;

	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(start_pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(start_pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
			continue;

		return start_pfn;
	}

	return 0;
}

/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
static int find_biggest_section_pfn(int nid, struct zone *zone,
				    unsigned long start_pfn,
				    unsigned long end_pfn)
{
	struct mem_section *ms;
	unsigned long pfn;

	/* pfn is the end pfn of a memory section. */
	pfn = end_pfn - 1;
	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(pfn)))
			continue;

		return pfn;
	}

	return 0;
}

static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
			     unsigned long end_pfn)
{
	unsigned long zone_start_pfn = zone->zone_start_pfn;
	unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = zone_to_nid(zone);

	zone_span_writelock(zone);
	if (zone_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the zone, we need
		 * to shrink zone->zone_start_pfn and zone->spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
						zone_end_pfn);
		if (pfn) {
			zone->zone_start_pfn = pfn;
			zone->spanned_pages = zone_end_pfn - pfn;
		}
	} else if (zone_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the zone, we need
		 * to shrink zone->spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
					       start_pfn);
		if (pfn)
			zone->spanned_pages = pfn - zone_start_pfn + 1;
	}

	/*
	 * If the section is neither the biggest nor the smallest mem_section
	 * in the zone, it only creates a hole in the zone, so in this case we
	 * need not change the zone span.  But the zone may now contain only
	 * holes, so check whether it still has any valid section.
	 */
	pfn = zone_start_pfn;
	for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (page_zone(pfn_to_page(pfn)) != zone)
			continue;

		/* If the section is the current section, continue the loop */
		if (start_pfn == pfn)
			continue;

		/* We have found a valid section, so there is nothing to do */
		zone_span_writeunlock(zone);
		return;
	}

	/* The zone has no valid section */
	zone->zone_start_pfn = 0;
	zone->spanned_pages = 0;
	zone_span_writeunlock(zone);
}

static void shrink_pgdat_span(struct pglist_data *pgdat,
			      unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
	unsigned long pgdat_end_pfn =
		pgdat->node_start_pfn + pgdat->node_spanned_pages;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = pgdat->node_id;

	if (pgdat_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the pgdat, we need
		 * to shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the pgdat.
		 */
		pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
						pgdat_end_pfn);
		if (pfn) {
			pgdat->node_start_pfn = pfn;
			pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
		}
	} else if (pgdat_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the pgdat, we need
		 * to shrink pgdat->node_spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the pgdat.
		 */
		pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
					       start_pfn);
		if (pfn)
			pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
	}

	/*
	 * If the section is neither the biggest nor the smallest mem_section
	 * in the pgdat, it only creates a hole in the pgdat, so in this case
	 * we need not change the pgdat span.  But the pgdat may now contain
	 * only holes, so check whether it still has any valid section.
	 */
	pfn = pgdat_start_pfn;
	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/* If the section is the current section, continue the loop */
		if (start_pfn == pfn)
			continue;

		/* We have found a valid section, so there is nothing to do */
		return;
	}

	/* The pgdat has no valid section */
	pgdat->node_start_pfn = 0;
	pgdat->node_spanned_pages = 0;
}

static void __remove_zone(struct zone *zone, unsigned long start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int zone_type;
	unsigned long flags;

	zone_type = zone - pgdat->node_zones;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
}

static int __remove_section(struct zone *zone, struct mem_section *ms)
{
	unsigned long start_pfn;
	int scn_nr;
	int ret = -EINVAL;

	if (!valid_section(ms))
		return ret;

	ret = unregister_memory_section(ms);
	if (ret)
		return ret;

	scn_nr = __section_nr(ms);
	start_pfn = section_nr_to_pfn(scn_nr);
	__remove_zone(zone, start_pfn);

	sparse_remove_one_section(zone, ms);
	return 0;
}

/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
		 unsigned long nr_pages)
{
	unsigned long i;
	int sections_to_remove;
	resource_size_t start, size;
	int ret = 0;

	/*
	 * We can only remove entire sections
	 */
	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
	BUG_ON(nr_pages % PAGES_PER_SECTION);

	start = phys_start_pfn << PAGE_SHIFT;
	size = nr_pages * PAGE_SIZE;
	ret = release_mem_region_adjustable(&iomem_resource, start, size);
	if (ret) {
		resource_size_t endres = start + size - 1;

		pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
				&start, &endres, ret);
	}

	sections_to_remove = nr_pages / PAGES_PER_SECTION;
	for (i = 0; i < sections_to_remove; i++) {
		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
		ret = __remove_section(zone, __pfn_to_section(pfn));
		if (ret)
			break;
	}
	return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);
#endif /* CONFIG_MEMORY_HOTREMOVE */

int set_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	lock_memory_hotplug();

	if (online_page_callback == generic_online_page) {
		online_page_callback = callback;
		rc = 0;
	}

	unlock_memory_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(set_online_page_callback);

int restore_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	lock_memory_hotplug();

	if (online_page_callback == callback) {
		online_page_callback = generic_online_page;
		rc = 0;
	}

	unlock_memory_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);

void __online_page_set_limits(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);

	if (pfn >= num_physpages)
		num_physpages = pfn + 1;
}
EXPORT_SYMBOL_GPL(__online_page_set_limits);

void __online_page_increment_counters(struct page *page)
{
	totalram_pages++;

#ifdef CONFIG_HIGHMEM
	if (PageHighMem(page))
		totalhigh_pages++;
#endif
}
EXPORT_SYMBOL_GPL(__online_page_increment_counters);

void __online_page_free(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
}
EXPORT_SYMBOL_GPL(__online_page_free);

static void generic_online_page(struct page *page)
{
	__online_page_set_limits(page);
	__online_page_increment_counters(page);
	__online_page_free(page);
}

static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			void *arg)
{
	unsigned long i;
	unsigned long onlined_pages = *(unsigned long *)arg;
	struct page *page;
	if (PageReserved(pfn_to_page(start_pfn)))
		for (i = 0; i < nr_pages; i++) {
			page = pfn_to_page(start_pfn + i);
			(*online_page_callback)(page);
			onlined_pages++;
		}
	*(unsigned long *)arg = onlined_pages;
	return 0;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
 * normal memory.
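 *
 * (A movable node is one whose memory sits entirely in ZONE_MOVABLE, keeping
 * it free of unmovable kernel allocations so that the whole node can be
 * hot-removed later.)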
 */
static bool can_online_high_movable(struct zone *zone)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure every online node has NORMAL memory */
static bool can_online_high_movable(struct zone *zone)
{
	return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which state of node_states will be changed when onlining memory */
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);
	enum zone_type zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * if the memory to be onlined is in a zone of 0...zone_last, and
	 * the zones of 0...zone_last don't have memory before onlining, we
	 * will need to set the node to node_states[N_NORMAL_MEMORY] after
	 * the memory is online.
	 */
	if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * if the node doesn't have memory before onlining, we will need to
	 * set the node to node_states[N_MEMORY] after the memory
	 * is online.
	 */
	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
	else
		arg->status_change_nid = -1;
}

static void node_states_set_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_set_state(node, N_HIGH_MEMORY);

	node_set_state(node, N_MEMORY);
}


int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
	unsigned long flags;
	unsigned long onlined_pages = 0;
	struct zone *zone;
	int need_zonelists_rebuild = 0;
	int nid;
	int ret;
	struct memory_notify arg;

	lock_memory_hotplug();
	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
	 * memory_block->state_mutex.
	 */
	zone = page_zone(pfn_to_page(pfn));

	if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
	    !can_online_high_movable(zone)) {
		unlock_memory_hotplug();
		return -1;
	}

	if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
		if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
			unlock_memory_hotplug();
			return -1;
		}
	}
	if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
		if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
			unlock_memory_hotplug();
			return -1;
		}
	}

	/* Previous code may have changed the zone of the pfn range */
	zone = page_zone(pfn_to_page(pfn));

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_online(nr_pages, zone, &arg);

	nid = page_to_nid(pfn_to_page(pfn));

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret) {
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		unlock_memory_hotplug();
		return ret;
	}
	/*
	 * If this zone is not populated, then it is not in the zonelist.
	 * This means the page allocator ignores this zone.
	 * So, the zonelist must be updated after onlining.
	 */
	mutex_lock(&zonelists_mutex);
	if (!populated_zone(zone)) {
		need_zonelists_rebuild = 1;
		build_all_zonelists(NULL, zone);
	}

	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
		online_pages_range);
	if (ret) {
		if (need_zonelists_rebuild)
			zone_pcp_reset(zone);
		mutex_unlock(&zonelists_mutex);
		printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
		       (unsigned long long) pfn << PAGE_SHIFT,
		       (((unsigned long long) pfn + nr_pages)
			    << PAGE_SHIFT) - 1);
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		unlock_memory_hotplug();
		return ret;
	}

	zone->managed_pages += onlined_pages;
	zone->present_pages += onlined_pages;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	zone->zone_pgdat->node_present_pages += onlined_pages;
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	if (onlined_pages) {
		node_states_set_node(zone_to_nid(zone), &arg);
		if (need_zonelists_rebuild)
			build_all_zonelists(NULL, NULL);
		else
			zone_pcp_update(zone);
	}

	mutex_unlock(&zonelists_mutex);

	init_per_zone_wmark_min();

	if (onlined_pages)
		kswapd_run(zone_to_nid(zone));

	vm_total_pages = nr_free_pagecache_pages();

	writeback_set_ratelimit();

	if (onlined_pages)
		memory_notify(MEM_ONLINE, &arg);
	unlock_memory_hotplug();

	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
	struct pglist_data *pgdat;
	unsigned long zones_size[MAX_NR_ZONES] = {0};
	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	unsigned long start_pfn = start >> PAGE_SHIFT;

	pgdat = NODE_DATA(nid);
	if (!pgdat) {
		pgdat = arch_alloc_nodedata(nid);
		if (!pgdat)
			return NULL;

		arch_refresh_nodedata(nid, pgdat);
	}

	/* we can use NODE_DATA(nid) from here */

	/* init node's zones as empty zones, we don't have any present pages.*/
	free_area_init_node(nid, zones_size, start_pfn, zholes_size);

	/*
	 * The node we allocated has no zone fallback lists.
	 * To avoid accessing an uninitialized zonelist, build one here.
	 */
	mutex_lock(&zonelists_mutex);
	build_all_zonelists(pgdat, NULL);
	mutex_unlock(&zonelists_mutex);

	return pgdat;
}

static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
	arch_refresh_nodedata(nid, NULL);
	arch_free_nodedata(pgdat);
	return;
}


/*
 * called by cpu_up() to online a node without onlined memory.
 */
int mem_online_node(int nid)
{
	pg_data_t *pgdat;
	int ret;

	lock_memory_hotplug();
	pgdat = hotadd_new_pgdat(nid, 0);
	if (!pgdat) {
		ret = -ENOMEM;
		goto out;
	}
	node_set_online(nid);
	ret = register_one_node(nid);
	BUG_ON(ret);

out:
	unlock_memory_hotplug();
	return ret;
}

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
int __ref add_memory(int nid, u64 start, u64 size)
{
	pg_data_t *pgdat = NULL;
	bool new_pgdat;
	bool new_node;
	struct resource *res;
	int ret;

	lock_memory_hotplug();

	res = register_memory_resource(start, size);
	ret = -EEXIST;
	if (!res)
		goto out;

	{	/* Stupid hack to suppress address-never-null warning */
		void *p = NODE_DATA(nid);
		new_pgdat = !p;
	}
	new_node = !node_online(nid);
	if (new_node) {
		pgdat = hotadd_new_pgdat(nid, start);
		ret = -ENOMEM;
		if (!pgdat)
			goto error;
	}

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size);

	if (ret < 0)
		goto error;

	/* we online the node here. we can't roll back from here. */
	node_set_online(nid);

	if (new_node) {
		ret = register_one_node(nid);
		/*
		 * If the sysfs file of the new node can't be created, cpus
		 * on the node can't be hot-added. There is no rollback way
		 * now, so check by BUG_ON() to catch it reluctantly..
		 */
		BUG_ON(ret);
	}

	/* create new memmap entry */
	firmware_map_add_hotplug(start, start + size, "System RAM");

	goto out;

error:
	/* rollback pgdat allocation and others */
	if (new_pgdat)
		rollback_node_hotadd(nid, pgdat);
	release_memory_resource(res);

out:
	unlock_memory_hotplug();
	return ret;
}
EXPORT_SYMBOL_GPL(add_memory);

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
 * set and the size of the free page is given by page_order(). Using this,
 * the function determines if the pageblock contains only free pages.
 * Due to buddy constraints, a free page at least the size of a pageblock will
 * be located at the start of the pageblock.
 */
static inline int pageblock_free(struct page *page)
{
	return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
	/* Ensure the starting page is pageblock-aligned */
	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));

	/* If the entire pageblock is free, move to the end of the free page */
	if (pageblock_free(page)) {
		int order;
		/* be careful: we don't have locks, page_order() can change under us */
		order = page_order(page);
		if ((order < MAX_ORDER) && (order >= pageblock_order))
			return page + (1 << order);
	}

	return page + pageblock_nr_pages;
}

/* Checks if this range of memory is likely to be hot-removable. */
int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
	struct page *page = pfn_to_page(start_pfn);
	struct page *end_page = page + nr_pages;

	/* Check the starting page of each pageblock within the range */
	for (; page < end_page; page = next_active_pageblock(page)) {
		if (!is_pageblock_removable_nolock(page))
			return 0;
		cond_resched();
	}

	/* All pageblocks in the memory block are likely to be hot-removable */
	return 1;
}

/*
 * Confirm that all pages in the range [start, end) belong to the same zone.
 */
static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct zone *zone = NULL;
	struct page *page;
	int i;
	for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) {
		i = 0;
		/* This is just a CONFIG_HOLES_IN_ZONE check.*/
		while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
			i++;
		if (i == MAX_ORDER_NR_PAGES)
			continue;
		page = pfn_to_page(pfn + i);
		if (zone && page_zone(page) != zone)
			return 0;
		zone = page_zone(page);
	}
	return 1;
}

/*
 * Scanning pfns is much easier than scanning the lru list.
 * Scan pfns from start to end and find the first LRU page.
 */
static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
{
	unsigned long pfn;
	struct page *page;
	for (pfn = start; pfn < end; pfn++) {
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageLRU(page))
				return pfn;
		}
	}
	return 0;
}

#define NR_OFFLINE_AT_ONCE_PAGES	(256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page;
	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
	int not_managed = 0;
	int ret = 0;
	LIST_HEAD(source);

	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		if (!get_page_unless_zero(page))
			continue;
		/*
		 * We can skip free pages. And we can only deal with pages on
		 * LRU.
		 */
		ret = isolate_lru_page(page);
		if (!ret) { /* Success */
			put_page(page);
			list_add_tail(&page->lru, &source);
			move_pages--;
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));

		} else {
#ifdef CONFIG_DEBUG_VM
			printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
			       pfn);
			dump_page(page);
#endif
			put_page(page);
			/* Because we don't hold the big zone->lock, we should
			   check this again here. */
			if (page_count(page)) {
				not_managed++;
				ret = -EBUSY;
				break;
			}
		}
	}
	if (!list_empty(&source)) {
		if (not_managed) {
			putback_lru_pages(&source);
			goto out;
		}

		/*
		 * alloc_migrate_target should be improved!
		 * migrate_pages() returns the number of failed pages.
		 */
		ret = migrate_pages(&source, alloc_migrate_target, 0,
					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
		if (ret)
			putback_lru_pages(&source);
	}
out:
	return ret;
}

/*
 * remove from free_area[] and mark all as Reserved.
 */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
			void *data)
{
	__offline_isolated_pages(start, start + nr_pages);
	return 0;
}

static void
offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
				offline_isolated_pages_cb);
}

/*
 * Check that all pages in the range, recorded as a memory resource, are
 * isolated.
 */
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
			void *data)
{
	int ret;
	long offlined = *(long *)data;
	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
	offlined = nr_pages;
	if (!ret)
		*(long *)data += offlined;
	return ret;
}

static long
check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
	long offlined = 0;
	int ret;

	ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
			check_pages_isolated_cb);
	if (ret < 0)
		offlined = (long)ret;
	return offlined;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
 * normal memory.
 */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure the node has NORMAL memory if it is still online */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt;

	for (zt = 0; zt <= ZONE_NORMAL; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	if (present_pages > nr_pages)
		return true;

	present_pages = 0;
	for (; zt <= ZONE_MOVABLE; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	/*
	 * we can't offline the last normal memory until all
	 * higher memory is offlined.
	 */
	return present_pages == 0;
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which state of node_states will be changed when offlining memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
		struct zone *zone, struct memory_notify *arg)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt, zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * check whether node_states[N_NORMAL_MEMORY] will be changed.
	 * If the memory to be offlined is in a zone of 0...zone_last,
	 * and it is the last present memory there, 0...zone_last will
	 * become empty after the offline, so we know we will need to
	 * clear the node from node_states[N_NORMAL_MEMORY].
	 */
	for (zt = 0; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_normal = zone_to_nid(zone);
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_high = zone_to_nid(zone);
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
	 */
	zone_last = ZONE_MOVABLE;

	/*
	 * check whether node_states[N_HIGH_MEMORY] will be changed.
	 * If we try to offline the last present @nr_pages from the node,
	 * we know we will need to clear the node from
	 * node_states[N_HIGH_MEMORY].
	 */
	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (nr_pages >= present_pages)
		arg->status_change_nid = zone_to_nid(zone);
	else
		arg->status_change_nid = -1;
}

static void node_states_clear_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_clear_state(node, N_NORMAL_MEMORY);

	if ((N_MEMORY != N_NORMAL_MEMORY) &&
	    (arg->status_change_nid_high >= 0))
		node_clear_state(node, N_HIGH_MEMORY);

	if ((N_MEMORY != N_HIGH_MEMORY) &&
	    (arg->status_change_nid >= 0))
		node_clear_state(node, N_MEMORY);
}

static int __ref __offline_pages(unsigned long start_pfn,
		  unsigned long end_pfn, unsigned long timeout)
{
	unsigned long pfn, nr_pages, expire;
	long offlined_pages;
	int ret, drain, retry_max, node;
	struct zone *zone;
	struct memory_notify arg;

	BUG_ON(start_pfn >= end_pfn);
	/* at least, alignment against pageblock is necessary */
	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
		return -EINVAL;
	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
		return -EINVAL;
	/*
	 * This makes hotplug much easier...and readable.
	 * We assume this for now.
	 */
	if (!test_pages_in_a_zone(start_pfn, end_pfn))
		return -EINVAL;

	lock_memory_hotplug();

	zone = page_zone(pfn_to_page(start_pfn));
	node = zone_to_nid(zone);
	nr_pages = end_pfn - start_pfn;

	ret = -EINVAL;
	if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
		goto out;

	/* set the above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn,
				       MIGRATE_MOVABLE, true);
	if (ret)
		goto out;

	arg.start_pfn = start_pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_offline(nr_pages, zone, &arg);

	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_removal;

	pfn = start_pfn;
	expire = jiffies + timeout;
	drain = 0;
	retry_max = 5;
repeat:
	/* start memory hot removal */
	ret = -EAGAIN;
	if (time_after(jiffies, expire))
		goto failed_removal;
	ret = -EINTR;
	if (signal_pending(current))
		goto failed_removal;
	ret = 0;
	if (drain) {
		lru_add_drain_all();
		cond_resched();
		drain_all_pages();
	}

	pfn = scan_lru_pages(start_pfn, end_pfn);
	if (pfn) { /* We have a page on LRU */
		ret = do_migrate_range(pfn, end_pfn);
		if (!ret) {
			drain = 1;
			goto repeat;
		} else {
			if (ret < 0)
				if (--retry_max == 0)
					goto failed_removal;
			yield();
			drain = 1;
			goto repeat;
		}
	}
	/* drain all zones' lru pagevecs, this is asynchronous... */
	lru_add_drain_all();
	yield();
	/* drain pcp pages, this is synchronous. */
	drain_all_pages();
	/* check again */
	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
	if (offlined_pages < 0) {
		ret = -EBUSY;
		goto failed_removal;
	}
	printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
	/*
	 * Ok, all of our target pages are isolated.  We cannot do rollback
	 * from this point.
	 */
	offline_isolated_pages(start_pfn, end_pfn);
	/* reset pagetype flags and make the migratetype MOVABLE again */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	/* removal success */
	zone->managed_pages -= offlined_pages;
	zone->present_pages -= offlined_pages;
	zone->zone_pgdat->node_present_pages -= offlined_pages;
	totalram_pages -= offlined_pages;

	init_per_zone_wmark_min();

	if (!populated_zone(zone)) {
		zone_pcp_reset(zone);
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(NULL, NULL);
		mutex_unlock(&zonelists_mutex);
	} else
		zone_pcp_update(zone);

	node_states_clear_node(node, &arg);
	if (arg.status_change_nid >= 0)
		kswapd_stop(node);

	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();

	memory_notify(MEM_OFFLINE, &arg);
	unlock_memory_hotplug();
	return 0;

failed_removal:
	printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
	       (unsigned long long) start_pfn << PAGE_SHIFT,
	       ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
	memory_notify(MEM_CANCEL_OFFLINE, &arg);
	/* pushback to free area */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);

out:
	unlock_memory_hotplug();
	return ret;
}

int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
}

/**
 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
 * @start_pfn: start pfn of the memory range
 * @end_pfn: end pfn of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory section walked
 *
 * This function walks through all present mem sections in the range
 * [start_pfn, end_pfn) and calls func on each mem section.
 *
 * Returns the return value of func.
 */
static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
		void *arg, int (*func)(struct memory_block *, void *))
{
	struct memory_block *mem = NULL;
	struct mem_section *section;
	unsigned long pfn, section_nr;
	int ret;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		section_nr = pfn_to_section_nr(pfn);
		if (!present_section_nr(section_nr))
			continue;

		section = __nr_to_section(section_nr);
		/* same memblock? */
		if (mem)
			if ((section_nr >= mem->start_section_nr) &&
			    (section_nr <= mem->end_section_nr))
				continue;

		mem = find_memory_block_hinted(section, mem);
		if (!mem)
			continue;

		ret = func(mem, arg);
		if (ret) {
			kobject_put(&mem->dev.kobj);
			return ret;
		}
	}

	if (mem)
		kobject_put(&mem->dev.kobj);

	return 0;
}

/**
 * offline_memory_block_cb - callback function for offlining memory block
 * @mem: the memory block to be offlined
 * @arg: buffer to hold error msg
 *
 * Always return 0, and put the error msg in arg if any.
 */
static int offline_memory_block_cb(struct memory_block *mem, void *arg)
{
	int *ret = arg;
	int error = offline_memory_block(mem);

	if (error != 0 && *ret == 0)
		*ret = error;

	return 0;
}

static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
	int ret = !is_memblock_offlined(mem);

	if (unlikely(ret)) {
		phys_addr_t beginpa, endpa;

		beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
		endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
		pr_warn("removing memory fails, because memory "
			"[%pa-%pa] is onlined\n",
			&beginpa, &endpa);
	}

	return ret;
}

static int check_cpu_on_node(void *data)
{
	struct pglist_data *pgdat = data;
	int cpu;

	for_each_present_cpu(cpu) {
		if (cpu_to_node(cpu) == pgdat->node_id)
			/*
			 * the cpu on this node isn't removed, and we can't
			 * offline this node.
			 */
			return -EBUSY;
	}

	return 0;
}

static void unmap_cpu_on_node(void *data)
{
#ifdef CONFIG_ACPI_NUMA
	struct pglist_data *pgdat = data;
	int cpu;

	for_each_possible_cpu(cpu)
		if (cpu_to_node(cpu) == pgdat->node_id)
			numa_clear_node(cpu);
#endif
}

static int check_and_unmap_cpu_on_node(void *data)
{
	int ret = check_cpu_on_node(data);

	if (ret)
		return ret;

	/*
	 * the node will be offlined when we come here, so we can clear
	 * the cpu_to_node() now.
	 */

	unmap_cpu_on_node(data);
	return 0;
}

/* offline the node if all memory sections of this node are removed */
void try_offline_node(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	unsigned long start_pfn = pgdat->node_start_pfn;
	unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
	unsigned long pfn;
	struct page *pgdat_page = virt_to_page(pgdat);
	int i;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);

		if (!present_section_nr(section_nr))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/*
		 * some memory sections of this node are not removed, and we
		 * can't offline the node now.
		 */
		return;
	}

	if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL))
		return;

	/*
	 * all memory/cpus of this node have been removed, we can offline
	 * the node now.
	 */
	node_set_offline(nid);
	unregister_one_node(nid);

	if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
		/* node data is allocated from boot memory */
		return;

	/* free the wait table in each zone */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		struct zone *zone = pgdat->node_zones + i;

		/*
		 * wait_table may be allocated from boot memory,
		 * here only free it if it was allocated by vmalloc.
		 */
		if (is_vmalloc_addr(zone->wait_table))
			vfree(zone->wait_table);
	}

	/*
	 * Since there is no way to guarantee the address of pgdat/zone is not
	 * on the stack of any kernel threads or used by other kernel objects
	 * without reference counting or another synchronizing method, do not
	 * reset node_data and free pgdat here. Just reset it to 0 and reuse
	 * the memory when the node is online again.
	 */
	memset(pgdat, 0, sizeof(*pgdat));
}
EXPORT_SYMBOL(try_offline_node);

int __ref remove_memory(int nid, u64 start, u64 size)
{
	unsigned long start_pfn, end_pfn;
	int ret = 0;
	int retry = 1;

	start_pfn = PFN_DOWN(start);
	end_pfn = PFN_UP(start + size - 1);

	/*
	 * When CONFIG_MEMCG is on, one memory block may be used by other
	 * blocks to store page cgroup when onlining pages. But we don't know
	 * in what order pages are onlined. So we iterate twice to offline
	 * memory:
	 * 1st iteration: offline every non-primary memory block.
	 * 2nd iteration: offline the primary (i.e. first added) memory block.
	 */
repeat:
	walk_memory_range(start_pfn, end_pfn, &ret,
			  offline_memory_block_cb);
	if (ret) {
		if (!retry)
			return ret;

		retry = 0;
		ret = 0;
		goto repeat;
	}

	lock_memory_hotplug();

	/*
	 * we have offlined all memory blocks like this:
	 *   1. lock memory hotplug
	 *   2. offline a memory block
	 *   3. unlock memory hotplug
	 *
	 * repeat steps 1-3 to offline each memory block. All memory blocks
	 * must be offlined before removing memory. But we don't hold the
	 * lock across the whole operation, so we should check whether all
	 * memory blocks are offlined.
	 */

	ret = walk_memory_range(start_pfn, end_pfn, NULL,
				is_memblock_offlined_cb);
	if (ret) {
		unlock_memory_hotplug();
		return ret;
	}

	/* remove memmap entry */
	firmware_map_remove(start, start + size, "System RAM");

	arch_remove_memory(start, size);

	try_offline_node(nid);

	unlock_memory_hotplug();

	return 0;
}
#else
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
	return -EINVAL;
}
int remove_memory(int nid, u64 start, u64 size)
{
	return -EINVAL;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
EXPORT_SYMBOL_GPL(remove_memory);
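
/*
 * A minimal usage sketch, assuming a hypothetical platform/ACPI-style driver
 * that has probed a hot-pluggable range @start/@size on node @nid.  The
 * function names below are illustrative only; add_memory() and
 * remove_memory() are the interfaces defined above.  Memory added by
 * add_memory() comes up offline; onlining is typically triggered from user
 * space by writing "online" to /sys/devices/system/memory/memoryN/state
 * (e.g. via a udev rule), and remove_memory() offlines the blocks again
 * before tearing the range down.
 */
#if 0	/* illustrative sketch only, not built */
static int example_hotplug_probe(int nid, u64 start, u64 size)
{
	int ret;

	/* register the resource, grow the node/zone spans, add sections */
	ret = add_memory(nid, start, size);
	if (ret)
		return ret;

	/* memory blocks are now visible in sysfs and can be onlined */
	return 0;
}

static int example_hotplug_remove(int nid, u64 start, u64 size)
{
	/* offline every memory block in the range, then remove it */
	return remove_memory(nid, start, size);
}
#endif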