/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>

#include <asm/tlbflush.h>

#include "internal.h"

/*
 * online_page_callback contains a pointer to the current page onlining
 * function.  Initially it is generic_online_page().  If required, it can be
 * changed by calling set_online_page_callback() to register a callback and
 * restore_online_page_callback() to restore the generic callback.
 */

static void generic_online_page(struct page *page);

static online_page_callback_t online_page_callback = generic_online_page;

DEFINE_MUTEX(mem_hotplug_mutex);

void lock_memory_hotplug(void)
{
	mutex_lock(&mem_hotplug_mutex);

	/* for exclusive hibernation if CONFIG_HIBERNATION=y */
	lock_system_sleep();
}

void unlock_memory_hotplug(void)
{
	unlock_system_sleep();
	mutex_unlock(&mem_hotplug_mutex);
}


/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;
	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
	BUG_ON(!res);

	res->name = "System RAM";
	res->start = start;
	res->end = start + size - 1;
	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
	if (request_resource(&iomem_resource, res) < 0) {
		pr_debug("System RAM resource %pR cannot be added\n", res);
		kfree(res);
		res = NULL;
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
	return;
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
void get_page_bootmem(unsigned long info, struct page *page,
		      unsigned long type)
{
	page->lru.next = (struct list_head *) type;
	SetPagePrivate(page);
	set_page_private(page, info);
	atomic_inc(&page->_count);
}

void put_page_bootmem(struct page *page)
{
	unsigned long type;

	type = (unsigned long) page->lru.next;
	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);

	if (atomic_dec_return(&page->_count) == 1) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		INIT_LIST_HEAD(&page->lru);
		free_reserved_page(page);
	}
}

#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	/* Get section's memmap address */
	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	/*
	 * Get page for the memmap's phys address
	 * XXX: need more consideration for sparse_vmemmap...
	 */
	page = virt_to_page(memmap);
	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

	/* remember memmap's page */
	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, SECTION_INFO);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);

}
#else /* CONFIG_SPARSEMEM_VMEMMAP */
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	if (!pfn_valid(start_pfn))
		return;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

void register_page_bootmem_info_node(struct pglist_data *pgdat)
{
	unsigned long i, pfn, end_pfn, nr_pages;
	int node = pgdat->node_id;
	struct page *page;
	struct zone *zone;

	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
	page = virt_to_page(pgdat);

	for (i = 0; i < nr_pages; i++, page++)
		get_page_bootmem(node, page, NODE_INFO);

	zone = &pgdat->node_zones[0];
	for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
		if (zone_is_initialized(zone)) {
			nr_pages = zone->wait_table_hash_nr_entries
				* sizeof(wait_queue_head_t);
			nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
			page = virt_to_page(zone->wait_table);

			for (i = 0; i < nr_pages; i++, page++)
				get_page_bootmem(node, page, NODE_INFO);
		}
	}

	pfn = pgdat->node_start_pfn;
	end_pfn = pgdat_end_pfn(pgdat);

	/* register section info */
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		/*
		 * Some platforms can assign the same pfn to multiple nodes -
		 * on node0 as well as nodeN.  To avoid registering a pfn
		 * against multiple nodes we check that this pfn does not
		 * already reside in some other node.
		 */
		if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
			register_page_bootmem_info_section(pfn);
	}
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */

static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
			   unsigned long end_pfn)
{
	unsigned long old_zone_end_pfn;

	zone_span_writelock(zone);

	old_zone_end_pfn = zone_end_pfn(zone);
	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
				zone->zone_start_pfn;

	zone_span_writeunlock(zone);
}

static void resize_zone(struct zone *zone, unsigned long start_pfn,
		unsigned long end_pfn)
{
	zone_span_writelock(zone);

	if (end_pfn - start_pfn) {
		zone->zone_start_pfn = start_pfn;
		zone->spanned_pages = end_pfn - start_pfn;
	} else {
		/*
		 * Keep this consistent with free_area_init_core():
		 * if spanned_pages == 0, then keep start_pfn == 0.
		 */
		zone->zone_start_pfn = 0;
		zone->spanned_pages = 0;
	}

	zone_span_writeunlock(zone);
}

static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
		unsigned long end_pfn)
{
	enum zone_type zid = zone_idx(zone);
	int nid = zone->zone_pgdat->node_id;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn++)
		set_page_links(pfn_to_page(pfn), zid, nid, pfn);
}

/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
 * alloc_bootmem_node_nopanic() */
static int __ref ensure_zone_is_initialized(struct zone *zone,
			unsigned long start_pfn, unsigned long num_pages)
{
	if (!zone_is_initialized(zone))
		return init_currently_empty_zone(zone, start_pfn, num_pages,
						 MEMMAP_HOTPLUG);
	return 0;
}

static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z1_start_pfn;

	ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are higher than @z2 */
	if (end_pfn > zone_end_pfn(z2))
		goto out_fail;
	/* the moved-out part must be at the leftmost of @z2 */
	if (start_pfn > z2->zone_start_pfn)
		goto out_fail;
	/* must include/overlap */
	if (end_pfn <= z2->zone_start_pfn)
		goto out_fail;

	/* use start_pfn for z1's start_pfn if z1 is empty */
	if (!zone_is_empty(z1))
		z1_start_pfn = z1->zone_start_pfn;
	else
		z1_start_pfn = start_pfn;

	resize_zone(z1, z1_start_pfn, end_pfn);
	resize_zone(z2, end_pfn, zone_end_pfn(z2));

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z1, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z2_end_pfn;

	ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are lower than @z1 */
	if (z1->zone_start_pfn > start_pfn)
		goto out_fail;
	/* the moved-out part must be at the rightmost of @z1 */
	if (zone_end_pfn(z1) > end_pfn)
		goto out_fail;
	/* must include/overlap */
	if (start_pfn >= zone_end_pfn(z1))
		goto out_fail;

	/* use end_pfn for z2's end_pfn if z2 is empty */
	if (!zone_is_empty(z2))
		z2_end_pfn = zone_end_pfn(z2);
	else
		z2_end_pfn = end_pfn;

	resize_zone(z1, z1->zone_start_pfn, start_pfn);
	resize_zone(z2, start_pfn, z2_end_pfn);

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z2, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
			    unsigned long end_pfn)
{
	unsigned long old_pgdat_end_pfn =
		pgdat->node_start_pfn + pgdat->node_spanned_pages;

	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
					pgdat->node_start_pfn;
}

static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int nid = pgdat->node_id;
	int zone_type;
	unsigned long flags;
	int ret;

	zone_type = zone - pgdat->node_zones;
	ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
	if (ret)
		return ret;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
			phys_start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
	memmap_init_zone(nr_pages, nid, zone_type,
			 phys_start_pfn, MEMMAP_HOTPLUG);
	return 0;
}

static int __meminit __add_section(int nid, struct zone *zone,
					unsigned long phys_start_pfn)
{
	int nr_pages = PAGES_PER_SECTION;
	int ret;

	if (pfn_valid(phys_start_pfn))
		return -EEXIST;

	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);

	if (ret < 0)
		return ret;

	ret = __add_zone(zone, phys_start_pfn);

	if (ret < 0)
		return ret;

	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}

/*
 * Reasonably generic function for adding memory.  It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
			unsigned long nr_pages)
{
	unsigned long i;
	int err = 0;
	int start_sec, end_sec;
	/* while initializing the mem_map, align the hot-added range to sections */
	start_sec = pfn_to_section_nr(phys_start_pfn);
	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

	for (i = start_sec; i <= end_sec; i++) {
		err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);

		/*
		 * -EEXIST is finally dealt with by the ioresource collision
		 * check.  See add_memory() => register_memory_resource().
		 * A warning will be printed if there is a collision.
		 */
		if (err && (err != -EEXIST))
			break;
		err = 0;
	}

	return err;
}
EXPORT_SYMBOL_GPL(__add_pages);
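
/*
 * Worked example for the section loop above (illustrative sketch; the numbers
 * assume x86_64 with 4kB pages and 128MB sections, i.e. PAGES_PER_SECTION ==
 * 32768): hot-adding 1GB at physical address 4GB gives phys_start_pfn ==
 * 0x100000 and nr_pages == 0x40000, so start_sec == 32, end_sec == 39, and
 * __add_section() is called once for each of the 8 sections.
 */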

#ifdef CONFIG_MEMORY_HOTREMOVE
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static int find_smallest_section_pfn(int nid, struct zone *zone,
				     unsigned long start_pfn,
				     unsigned long end_pfn)
{
	struct mem_section *ms;

	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(start_pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(start_pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
			continue;

		return start_pfn;
	}

	return 0;
}

/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
static int find_biggest_section_pfn(int nid, struct zone *zone,
				    unsigned long start_pfn,
				    unsigned long end_pfn)
{
	struct mem_section *ms;
	unsigned long pfn;

	/* pfn is the end pfn of a memory section. */
	pfn = end_pfn - 1;
	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(pfn)))
			continue;

		return pfn;
	}

	return 0;
}

static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
			     unsigned long end_pfn)
{
	unsigned long zone_start_pfn = zone->zone_start_pfn;
	unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
	unsigned long zone_end_pfn = z;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = zone_to_nid(zone);

	zone_span_writelock(zone);
	if (zone_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the zone, we need
		 * to shrink zone->zone_start_pfn and zone->spanned_pages.
		 * In this case, find the second smallest valid mem_section
		 * and use it to shrink the zone.
		 */
		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
						zone_end_pfn);
		if (pfn) {
			zone->zone_start_pfn = pfn;
			zone->spanned_pages = zone_end_pfn - pfn;
		}
	} else if (zone_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the zone, we need
		 * to shrink zone->spanned_pages.
		 * In this case, find the second biggest valid mem_section
		 * and use it to shrink the zone.
		 */
		pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
					       start_pfn);
		if (pfn)
			zone->spanned_pages = pfn - zone_start_pfn + 1;
	}

	/*
	 * If the section is neither the biggest nor the smallest mem_section
	 * in the zone, it only creates a hole in the zone, so we need not
	 * change the zone.  But the zone may now consist only of holes, so
	 * check whether it still has a valid section.
	 */
	pfn = zone_start_pfn;
	for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (page_zone(pfn_to_page(pfn)) != zone)
			continue;

		/* If the section is the current section, continue the loop */
		if (start_pfn == pfn)
			continue;

		/* If we find a valid section, we have nothing to do */
		zone_span_writeunlock(zone);
		return;
	}

	/* The zone has no valid section */
	zone->zone_start_pfn = 0;
	zone->spanned_pages = 0;
	zone_span_writeunlock(zone);
}

static void shrink_pgdat_span(struct pglist_data *pgdat,
			      unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
	unsigned long pgdat_end_pfn =
		pgdat->node_start_pfn + pgdat->node_spanned_pages;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = pgdat->node_id;

	if (pgdat_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the pgdat, we need
		 * to shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
		 * In this case, find the second smallest valid mem_section
		 * and use it to shrink the pgdat.
		 */
		pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
						pgdat_end_pfn);
		if (pfn) {
			pgdat->node_start_pfn = pfn;
			pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
		}
	} else if (pgdat_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the pgdat, we need
		 * to shrink pgdat->node_spanned_pages.
		 * In this case, find the second biggest valid mem_section
		 * and use it to shrink the pgdat.
		 */
		pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
					       start_pfn);
		if (pfn)
			pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
	}

	/*
	 * If the section is neither the biggest nor the smallest mem_section
	 * in the pgdat, it only creates a hole in the pgdat, so we need not
	 * change the pgdat.  But the pgdat may now consist only of holes, so
	 * check whether it still has a valid section.
	 */
	pfn = pgdat_start_pfn;
	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/* If the section is the current section, continue the loop */
		if (start_pfn == pfn)
			continue;

		/* If we find a valid section, we have nothing to do */
		return;
	}

	/* The pgdat has no valid section */
	pgdat->node_start_pfn = 0;
	pgdat->node_spanned_pages = 0;
}

static void __remove_zone(struct zone *zone, unsigned long start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int zone_type;
	unsigned long flags;

	zone_type = zone - pgdat->node_zones;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
}

static int __remove_section(struct zone *zone, struct mem_section *ms)
{
	unsigned long start_pfn;
	int scn_nr;
	int ret = -EINVAL;

	if (!valid_section(ms))
		return ret;

	ret = unregister_memory_section(ms);
	if (ret)
		return ret;

	scn_nr = __section_nr(ms);
	start_pfn = section_nr_to_pfn(scn_nr);
	__remove_zone(zone, start_pfn);

	sparse_remove_one_section(zone, ms);
	return 0;
}

/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing.  Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
		 unsigned long nr_pages)
{
	unsigned long i;
	int sections_to_remove;
	resource_size_t start, size;
	int ret = 0;

	/*
	 * We can only remove entire sections
	 */
	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
	BUG_ON(nr_pages % PAGES_PER_SECTION);

	start = phys_start_pfn << PAGE_SHIFT;
	size = nr_pages * PAGE_SIZE;
	ret = release_mem_region_adjustable(&iomem_resource, start, size);
	if (ret) {
		resource_size_t endres = start + size - 1;

		pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
				&start, &endres, ret);
	}

	sections_to_remove = nr_pages / PAGES_PER_SECTION;
	for (i = 0; i < sections_to_remove; i++) {
		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
		ret = __remove_section(zone, __pfn_to_section(pfn));
		if (ret)
			break;
	}
	return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);
#endif /* CONFIG_MEMORY_HOTREMOVE */

int set_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	lock_memory_hotplug();

	if (online_page_callback == generic_online_page) {
		online_page_callback = callback;
		rc = 0;
	}

	unlock_memory_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(set_online_page_callback);

int restore_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	lock_memory_hotplug();

	if (online_page_callback == callback) {
		online_page_callback = generic_online_page;
		rc = 0;
	}

	unlock_memory_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);

void __online_page_set_limits(struct page *page)
{
}
EXPORT_SYMBOL_GPL(__online_page_set_limits);

void __online_page_increment_counters(struct page *page)
{
	adjust_managed_page_count(page, 1);
}
EXPORT_SYMBOL_GPL(__online_page_increment_counters);

void __online_page_free(struct page *page)
{
	__free_reserved_page(page);
}
EXPORT_SYMBOL_GPL(__online_page_free);

static void generic_online_page(struct page *page)
{
	__online_page_set_limits(page);
	__online_page_increment_counters(page);
	__online_page_free(page);
}
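
/*
 * Illustrative sketch (not built; the "example_*" names are hypothetical):
 * a ballooning driver can take over page onlining with the callback
 * interface above, deciding per page whether to hand it to the page
 * allocator right away or to keep it inflated and release it later via the
 * exported __online_page_*() helpers.  The Hyper-V balloon driver uses this
 * interface in a similar way.
 */
#if 0
static void example_online_page(struct page *page)
{
	__online_page_set_limits(page);
	__online_page_increment_counters(page);
	/* a real driver may defer this for pages it wants to keep ballooned */
	__online_page_free(page);
}

static int __init example_balloon_init(void)
{
	/* fails with -EINVAL if another callback is already registered */
	return set_online_page_callback(&example_online_page);
}

static void __exit example_balloon_exit(void)
{
	restore_online_page_callback(&example_online_page);
}
#endif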

static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			void *arg)
{
	unsigned long i;
	unsigned long onlined_pages = *(unsigned long *)arg;
	struct page *page;
	if (PageReserved(pfn_to_page(start_pfn)))
		for (i = 0; i < nr_pages; i++) {
			page = pfn_to_page(start_pfn + i);
			(*online_page_callback)(page);
			onlined_pages++;
		}
	*(unsigned long *)arg = onlined_pages;
	return 0;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
 * normal memory.
 */
static bool can_online_high_movable(struct zone *zone)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure every online node has NORMAL memory */
static bool can_online_high_movable(struct zone *zone)
{
	return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which state of node_states will be changed when onlining memory */
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);
	enum zone_type zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * so set zone_last to ZONE_NORMAL.
	 *
	 * If we have neither HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, so set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * If the memory to be onlined is in a zone of 0...zone_last, and
	 * the zones of 0...zone_last don't have memory before onlining, we
	 * will need to set the node in node_states[N_NORMAL_MEMORY] after
	 * the memory is onlined.
	 */
	if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * so set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * so set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * If the node doesn't have memory before onlining, we will need to
	 * set the node in node_states[N_MEMORY] after the memory
	 * is onlined.
	 */
	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
	else
		arg->status_change_nid = -1;
}

static void node_states_set_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_set_state(node, N_HIGH_MEMORY);

	node_set_state(node, N_MEMORY);
}


int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
	unsigned long flags;
	unsigned long onlined_pages = 0;
	struct zone *zone;
	int need_zonelists_rebuild = 0;
	int nid;
	int ret;
	struct memory_notify arg;

	lock_memory_hotplug();
	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
	 * memory_block->state_mutex.
	 */
	zone = page_zone(pfn_to_page(pfn));

	if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
	    !can_online_high_movable(zone)) {
		unlock_memory_hotplug();
		return -EINVAL;
	}

	if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
		if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
			unlock_memory_hotplug();
			return -EINVAL;
		}
	}
	if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
		if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
			unlock_memory_hotplug();
			return -EINVAL;
		}
	}

	/* The code above may have changed the zone of the pfn range */
	zone = page_zone(pfn_to_page(pfn));

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_online(nr_pages, zone, &arg);

	nid = page_to_nid(pfn_to_page(pfn));

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret) {
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		unlock_memory_hotplug();
		return ret;
	}
	/*
	 * If this zone is not populated, then it is not in the zonelist.
	 * This means the page allocator ignores this zone.
	 * So, the zonelist must be updated after onlining.
	 */
	mutex_lock(&zonelists_mutex);
	if (!populated_zone(zone)) {
		need_zonelists_rebuild = 1;
		build_all_zonelists(NULL, zone);
	}

	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
		online_pages_range);
	if (ret) {
		if (need_zonelists_rebuild)
			zone_pcp_reset(zone);
		mutex_unlock(&zonelists_mutex);
		printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
		       (unsigned long long) pfn << PAGE_SHIFT,
		       (((unsigned long long) pfn + nr_pages)
			<< PAGE_SHIFT) - 1);
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		unlock_memory_hotplug();
		return ret;
	}

	zone->present_pages += onlined_pages;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	zone->zone_pgdat->node_present_pages += onlined_pages;
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	if (onlined_pages) {
		node_states_set_node(zone_to_nid(zone), &arg);
		if (need_zonelists_rebuild)
			build_all_zonelists(NULL, NULL);
		else
			zone_pcp_update(zone);
	}

	mutex_unlock(&zonelists_mutex);

	init_per_zone_wmark_min();

	if (onlined_pages)
		kswapd_run(zone_to_nid(zone));

	vm_total_pages = nr_free_pagecache_pages();

	writeback_set_ratelimit();

	if (onlined_pages)
		memory_notify(MEM_ONLINE, &arg);
	unlock_memory_hotplug();

	return 0;
}
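
/*
 * Note on the ONLINE_KERNEL/ONLINE_MOVABLE handling above (informational
 * sketch): writing "online_kernel" or "online_movable" to
 * /sys/devices/system/memory/memoryN/state selects the online_type, and
 * online_pages() then moves the range out of, or into, ZONE_MOVABLE with
 * move_pfn_range_left()/move_pfn_range_right() before onlining it.
 */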
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
	struct pglist_data *pgdat;
	unsigned long zones_size[MAX_NR_ZONES] = {0};
	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	unsigned long start_pfn = start >> PAGE_SHIFT;

	pgdat = NODE_DATA(nid);
	if (!pgdat) {
		pgdat = arch_alloc_nodedata(nid);
		if (!pgdat)
			return NULL;

		arch_refresh_nodedata(nid, pgdat);
	}

	/* we can use NODE_DATA(nid) from here */

	/* init node's zones as empty zones, we don't have any present pages.*/
	free_area_init_node(nid, zones_size, start_pfn, zholes_size);

	/*
	 * The node we allocated has no zone fallback lists.  To avoid
	 * accessing a not-yet-initialized zonelist, build it here.
	 */
	mutex_lock(&zonelists_mutex);
	build_all_zonelists(pgdat, NULL);
	mutex_unlock(&zonelists_mutex);

	return pgdat;
}

static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
	arch_refresh_nodedata(nid, NULL);
	arch_free_nodedata(pgdat);
	return;
}


/*
 * called by cpu_up() to online a node without onlined memory.
 */
int mem_online_node(int nid)
{
	pg_data_t *pgdat;
	int ret;

	lock_memory_hotplug();
	pgdat = hotadd_new_pgdat(nid, 0);
	if (!pgdat) {
		ret = -ENOMEM;
		goto out;
	}
	node_set_online(nid);
	ret = register_one_node(nid);
	BUG_ON(ret);

out:
	unlock_memory_hotplug();
	return ret;
}

static int check_hotplug_memory_range(u64 start, u64 size)
{
	u64 start_pfn = start >> PAGE_SHIFT;
	u64 nr_pages = size >> PAGE_SHIFT;

	/* Memory range must be aligned with sections */
	if ((start_pfn & ~PAGE_SECTION_MASK) ||
	    (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) {
		pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n",
				(unsigned long long)start,
				(unsigned long long)size);
		return -EINVAL;
	}

	return 0;
}

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
int __ref add_memory(int nid, u64 start, u64 size)
{
	pg_data_t *pgdat = NULL;
	bool new_pgdat;
	bool new_node;
	struct resource *res;
	int ret;

	ret = check_hotplug_memory_range(start, size);
	if (ret)
		return ret;

	lock_memory_hotplug();

	res = register_memory_resource(start, size);
	ret = -EEXIST;
	if (!res)
		goto out;

	{	/* Stupid hack to suppress address-never-null warning */
		void *p = NODE_DATA(nid);
		new_pgdat = !p;
	}
	new_node = !node_online(nid);
	if (new_node) {
		pgdat = hotadd_new_pgdat(nid, start);
		ret = -ENOMEM;
		if (!pgdat)
			goto error;
	}

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size);

	if (ret < 0)
		goto error;

	/* we online the node here. we can't roll back from here. */
	node_set_online(nid);

	if (new_node) {
		ret = register_one_node(nid);
		/*
		 * If the sysfs file of the new node can't be created, cpus
		 * on the node can't be hot-added.  There is no way to roll
		 * back now, so check it with BUG_ON() to catch it, reluctantly.
		 */
		BUG_ON(ret);
	}

	/* create new memmap entry */
	firmware_map_add_hotplug(start, start + size, "System RAM");

	goto out;

error:
	/* rollback pgdat allocation and others */
	if (new_pgdat)
		rollback_node_hotadd(nid, pgdat);
	release_memory_resource(res);

out:
	unlock_memory_hotplug();
	return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
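
/*
 * Worked example for check_hotplug_memory_range()/add_memory() (illustrative
 * sketch; the section size is architecture-specific, 128MB on x86_64 where
 * SECTION_SIZE_BITS == 27): both @start and @size must be multiples of the
 * section size, so add_memory(nid, 4GB, 256MB) passes the check while
 * add_memory(nid, 4GB, 64MB) is rejected with -EINVAL.
 */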

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
 * set and the size of the free page is given by page_order().  Using this,
 * the function determines if the pageblock contains only free pages.
 * Due to buddy constraints, a free page at least the size of a pageblock will
 * be located at the start of the pageblock.
 */
static inline int pageblock_free(struct page *page)
{
	return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
	/* Ensure the starting page is pageblock-aligned */
	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));

	/* If the entire pageblock is free, move to the end of the free page */
	if (pageblock_free(page)) {
		int order;
		/* be careful. we don't have locks, page_order can be changed.*/
		order = page_order(page);
		if ((order < MAX_ORDER) && (order >= pageblock_order))
			return page + (1 << order);
	}

	return page + pageblock_nr_pages;
}

/* Checks if this range of memory is likely to be hot-removable. */
int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
	struct page *page = pfn_to_page(start_pfn);
	struct page *end_page = page + nr_pages;

	/* Check the starting page of each pageblock within the range */
	for (; page < end_page; page = next_active_pageblock(page)) {
		if (!is_pageblock_removable_nolock(page))
			return 0;
		cond_resched();
	}

	/* All pageblocks in the memory block are likely to be hot-removable */
	return 1;
}

/*
 * Confirm that all pages in the range [start, end) belong to the same zone.
 */
static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct zone *zone = NULL;
	struct page *page;
	int i;
	for (pfn = start_pfn;
	     pfn < end_pfn;
	     pfn += MAX_ORDER_NR_PAGES) {
		i = 0;
		/* This is just a CONFIG_HOLES_IN_ZONE check.*/
		while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
			i++;
		if (i == MAX_ORDER_NR_PAGES)
			continue;
		page = pfn_to_page(pfn + i);
		if (zone && page_zone(page) != zone)
			return 0;
		zone = page_zone(page);
	}
	return 1;
}

/*
 * Scan the pfn range [start,end) to find movable/migratable pages (LRU pages
 * and hugepages).  We scan pfns because that is much easier than scanning
 * over a linked list.  This function returns the pfn of the first movable
 * page if one is found, otherwise 0.
 */
static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
{
	unsigned long pfn;
	struct page *page;
	for (pfn = start; pfn < end; pfn++) {
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageLRU(page))
				return pfn;
			if (PageHuge(page)) {
				if (is_hugepage_active(page))
					return pfn;
				else
					pfn = round_up(pfn + 1,
						1 << compound_order(page)) - 1;
			}
		}
	}
	return 0;
}
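
/*
 * Worked example for the hugepage skip above (illustrative sketch; the
 * numbers assume 2MB hugepages with 4kB base pages, i.e. compound_order()
 * == 9): if an inactive hugepage starts at pfn 0x10200, then
 * round_up(0x10201, 512) - 1 == 0x103ff, and after the loop's pfn++ the
 * scan resumes at pfn 0x10400, the first pfn past the hugepage.
 */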

#define NR_OFFLINE_AT_ONCE_PAGES	(256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page;
	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
	int not_managed = 0;
	int ret = 0;
	LIST_HEAD(source);

	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);

		if (PageHuge(page)) {
			struct page *head = compound_head(page);
			pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
			if (compound_order(head) > PFN_SECTION_SHIFT) {
				ret = -EBUSY;
				break;
			}
			if (isolate_huge_page(page, &source))
				move_pages -= 1 << compound_order(head);
			continue;
		}

		if (!get_page_unless_zero(page))
			continue;
		/*
		 * We can skip free pages.  And we can only deal with pages on
		 * the LRU.
		 */
		ret = isolate_lru_page(page);
		if (!ret) { /* Success */
			put_page(page);
			list_add_tail(&page->lru, &source);
			move_pages--;
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));

		} else {
#ifdef CONFIG_DEBUG_VM
			printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
			       pfn);
			dump_page(page);
#endif
			put_page(page);
			/* Because we don't have a big zone->lock, we should
			   check this again here. */
			if (page_count(page)) {
				not_managed++;
				ret = -EBUSY;
				break;
			}
		}
	}
	if (!list_empty(&source)) {
		if (not_managed) {
			putback_movable_pages(&source);
			goto out;
		}

		/*
		 * alloc_migrate_target should be improved!
		 * migrate_pages returns the number of failed pages.
		 */
		ret = migrate_pages(&source, alloc_migrate_target, 0,
					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
		if (ret)
			putback_movable_pages(&source);
	}
out:
	return ret;
}

/*
 * remove from free_area[] and mark all as Reserved.
 */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
			void *data)
{
	__offline_isolated_pages(start, start + nr_pages);
	return 0;
}

static void
offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
				offline_isolated_pages_cb);
}

/*
 * Check that all pages in the range, recorded as a memory resource, are isolated.
 */
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
			void *data)
{
	int ret;
	long offlined = *(long *)data;
	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
	offlined = nr_pages;
	if (!ret)
		*(long *)data += offlined;
	return ret;
}

static long
check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
	long offlined = 0;
	int ret;

	ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
			check_pages_isolated_cb);
	if (ret < 0)
		offlined = (long)ret;
	return offlined;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
 * normal memory.
 */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure the node has NORMAL memory if it is still online */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt;

	for (zt = 0; zt <= ZONE_NORMAL; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	if (present_pages > nr_pages)
		return true;

	present_pages = 0;
	for (; zt <= ZONE_MOVABLE; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	/*
	 * we can't offline the last normal memory until all
	 * higher memory is offlined.
	 */
	return present_pages == 0;
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which state of node_states will be changed when offlining memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
		struct zone *zone, struct memory_notify *arg)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt, zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * so set zone_last to ZONE_NORMAL.
	 *
	 * If we have neither HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, so set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
	 * If the memory to be offlined is in a zone of 0...zone_last,
	 * and it is the last present memory there, 0...zone_last will
	 * become empty after the offlining, so we can determine that we
	 * will need to clear the node from node_states[N_NORMAL_MEMORY].
	 */
	for (zt = 0; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_normal = zone_to_nid(zone);
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * so set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * so set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_high = zone_to_nid(zone);
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
	 */
	zone_last = ZONE_MOVABLE;

	/*
	 * Check whether node_states[N_HIGH_MEMORY] will be changed.
	 * If we try to offline the last present @nr_pages from the node,
	 * we can determine that we will need to clear the node from
	 * node_states[N_HIGH_MEMORY].
	 */
	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (nr_pages >= present_pages)
		arg->status_change_nid = zone_to_nid(zone);
	else
		arg->status_change_nid = -1;
}

static void node_states_clear_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_clear_state(node, N_NORMAL_MEMORY);

	if ((N_MEMORY != N_NORMAL_MEMORY) &&
	    (arg->status_change_nid_high >= 0))
		node_clear_state(node, N_HIGH_MEMORY);

	if ((N_MEMORY != N_HIGH_MEMORY) &&
	    (arg->status_change_nid >= 0))
		node_clear_state(node, N_MEMORY);
}

static int __ref __offline_pages(unsigned long start_pfn,
		  unsigned long end_pfn, unsigned long timeout)
{
	unsigned long pfn, nr_pages, expire;
	long offlined_pages;
	int ret, drain, retry_max, node;
	unsigned long flags;
	struct zone *zone;
	struct memory_notify arg;

	/* at least, alignment against pageblock is necessary */
	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
		return -EINVAL;
	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
		return -EINVAL;
	/* This makes hotplug much easier...and readable.
	   We assume this for now. */
	if (!test_pages_in_a_zone(start_pfn, end_pfn))
		return -EINVAL;

	lock_memory_hotplug();

	zone = page_zone(pfn_to_page(start_pfn));
	node = zone_to_nid(zone);
	nr_pages = end_pfn - start_pfn;

	ret = -EINVAL;
	if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
		goto out;

	/* set the above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn,
				       MIGRATE_MOVABLE, true);
	if (ret)
		goto out;

	arg.start_pfn = start_pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_offline(nr_pages, zone, &arg);

	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_removal;

	pfn = start_pfn;
	expire = jiffies + timeout;
	drain = 0;
	retry_max = 5;
repeat:
	/* start memory hot removal */
	ret = -EAGAIN;
	if (time_after(jiffies, expire))
		goto failed_removal;
	ret = -EINTR;
	if (signal_pending(current))
		goto failed_removal;
	ret = 0;
	if (drain) {
		lru_add_drain_all();
		cond_resched();
		drain_all_pages();
	}

	pfn = scan_movable_pages(start_pfn, end_pfn);
	if (pfn) { /* We have movable pages */
		ret = do_migrate_range(pfn, end_pfn);
		if (!ret) {
			drain = 1;
			goto repeat;
		} else {
			if (ret < 0)
				if (--retry_max == 0)
					goto failed_removal;
			yield();
			drain = 1;
			goto repeat;
		}
	}
	/* drain all zones' lru pagevecs, this is asynchronous... */
	lru_add_drain_all();
	yield();
	/* drain pcp pages, this is synchronous. */
	drain_all_pages();
	/*
	 * dissolve free hugepages in the memory block before actually doing
	 * the offlining, in order to keep hugetlbfs's object counting
	 * consistent.
	 */
	dissolve_free_huge_pages(start_pfn, end_pfn);
	/* check again */
	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
	if (offlined_pages < 0) {
		ret = -EBUSY;
		goto failed_removal;
	}
	printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
	/* Ok, all of our target is isolated.
	   We cannot do rollback at this point. */
	offline_isolated_pages(start_pfn, end_pfn);
	/* reset pagetype flags and make the migrate type MOVABLE */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	/* removal success */
	adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
	zone->present_pages -= offlined_pages;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	zone->zone_pgdat->node_present_pages -= offlined_pages;
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	init_per_zone_wmark_min();

	if (!populated_zone(zone)) {
		zone_pcp_reset(zone);
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(NULL, NULL);
		mutex_unlock(&zonelists_mutex);
	} else
		zone_pcp_update(zone);

	node_states_clear_node(node, &arg);
	if (arg.status_change_nid >= 0)
		kswapd_stop(node);

	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();

	memory_notify(MEM_OFFLINE, &arg);
	unlock_memory_hotplug();
	return 0;

failed_removal:
	printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
	       (unsigned long long) start_pfn << PAGE_SHIFT,
	       ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
	memory_notify(MEM_CANCEL_OFFLINE, &arg);
	/* pushback to free area */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);

out:
	unlock_memory_hotplug();
	return ret;
}

int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

/**
 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
 * @start_pfn: start pfn of the memory range
 * @end_pfn: end pfn of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory section walked
 *
 * This function walks through all present mem sections in the range
 * [start_pfn, end_pfn) and calls func on each mem section.
 *
 * Returns the return value of func.
 */
int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
		void *arg, int (*func)(struct memory_block *, void *))
{
	struct memory_block *mem = NULL;
	struct mem_section *section;
	unsigned long pfn, section_nr;
	int ret;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		section_nr = pfn_to_section_nr(pfn);
		if (!present_section_nr(section_nr))
			continue;

		section = __nr_to_section(section_nr);
		/* same memblock? */
		if (mem)
			if ((section_nr >= mem->start_section_nr) &&
			    (section_nr <= mem->end_section_nr))
				continue;

		mem = find_memory_block_hinted(section, mem);
		if (!mem)
			continue;

		ret = func(mem, arg);
		if (ret) {
			kobject_put(&mem->dev.kobj);
			return ret;
		}
	}

	if (mem)
		kobject_put(&mem->dev.kobj);

	return 0;
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
	int ret = !is_memblock_offlined(mem);

	if (unlikely(ret)) {
		phys_addr_t beginpa, endpa;

		beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
		endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
		pr_warn("removing memory fails, because memory "
			"[%pa-%pa] is onlined\n",
			&beginpa, &endpa);
	}

	return ret;
}

static int check_cpu_on_node(pg_data_t *pgdat)
{
	int cpu;

	for_each_present_cpu(cpu) {
		if (cpu_to_node(cpu) == pgdat->node_id)
			/*
			 * a cpu on this node has not been removed, so we
			 * can't offline this node.
			 */
			return -EBUSY;
	}

	return 0;
}

static void unmap_cpu_on_node(pg_data_t *pgdat)
{
#ifdef CONFIG_ACPI_NUMA
	int cpu;

	for_each_possible_cpu(cpu)
		if (cpu_to_node(cpu) == pgdat->node_id)
			numa_clear_node(cpu);
#endif
}

static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
{
	int ret;

	ret = check_cpu_on_node(pgdat);
	if (ret)
		return ret;

	/*
	 * the node will be offlined when we come here, so we can clear
	 * the cpu_to_node() mapping now.
	 */

	unmap_cpu_on_node(pgdat);
	return 0;
}

/**
 * try_offline_node
 *
 * Offline a node if all memory sections and cpus of the node are removed.
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call.
 */
void try_offline_node(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	unsigned long start_pfn = pgdat->node_start_pfn;
	unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
	unsigned long pfn;
	struct page *pgdat_page = virt_to_page(pgdat);
	int i;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);

		if (!present_section_nr(section_nr))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/*
		 * some memory sections of this node have not been removed, so
		 * we can't offline the node now.
		 */
		return;
	}

	if (check_and_unmap_cpu_on_node(pgdat))
		return;

	/*
	 * all memory/cpus of this node have been removed, so we can offline
	 * the node now.
	 */
	node_set_offline(nid);
	unregister_one_node(nid);

	if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
		/* node data is allocated from boot memory */
		return;

	/* free the waittable in each zone */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		struct zone *zone = pgdat->node_zones + i;

		/*
		 * wait_table may be allocated from boot memory,
		 * so only free it here if it was allocated by vmalloc.
		 */
		if (is_vmalloc_addr(zone->wait_table))
			vfree(zone->wait_table);
	}

	/*
	 * Since there is no way to guarantee that the address of pgdat/zone
	 * is not on the stack of any kernel thread or used by other kernel
	 * objects without reference counting or another synchronizing method,
	 * do not reset node_data and do not free pgdat here.  Just reset it
	 * to 0 and reuse the memory when the node is onlined again.
	 */
	memset(pgdat, 0, sizeof(*pgdat));
}
EXPORT_SYMBOL(try_offline_node);

/**
 * remove_memory
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call, as required by
 * try_offline_node().
 */
void __ref remove_memory(int nid, u64 start, u64 size)
{
	int ret;

	BUG_ON(check_hotplug_memory_range(start, size));

	lock_memory_hotplug();

	/*
	 * All memory blocks must be offlined before removing memory.  Check
	 * whether all memory blocks in question are offline and trigger a BUG()
	 * if this is not the case.
	 */
	ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
				is_memblock_offlined_cb);
	if (ret) {
		unlock_memory_hotplug();
		BUG();
	}

	/* remove memmap entry */
	firmware_map_remove(start, start + size, "System RAM");

	arch_remove_memory(start, size);

	try_offline_node(nid);

	unlock_memory_hotplug();
}
EXPORT_SYMBOL_GPL(remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */
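
/*
 * Illustrative sketch only (not built; "example_remove" is hypothetical):
 * the expected calling sequence for physically removing memory, as used by
 * e.g. the ACPI memory-device driver, is to offline the affected memory
 * blocks first and then call remove_memory() while holding
 * lock_device_hotplug(), as required by try_offline_node() above.
 */
#if 0
static void example_remove(int nid, u64 start, u64 size)
{
	/* memory blocks covering [start, start + size) are already offline */
	lock_device_hotplug();
	remove_memory(nid, start, size);
	unlock_device_hotplug();
}
#endif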