1 /* 2 * linux/mm/memory_hotplug.c 3 * 4 * Copyright (C) 5 */ 6 7 #include <linux/stddef.h> 8 #include <linux/mm.h> 9 #include <linux/swap.h> 10 #include <linux/interrupt.h> 11 #include <linux/pagemap.h> 12 #include <linux/bootmem.h> 13 #include <linux/compiler.h> 14 #include <linux/export.h> 15 #include <linux/pagevec.h> 16 #include <linux/writeback.h> 17 #include <linux/slab.h> 18 #include <linux/sysctl.h> 19 #include <linux/cpu.h> 20 #include <linux/memory.h> 21 #include <linux/memory_hotplug.h> 22 #include <linux/highmem.h> 23 #include <linux/vmalloc.h> 24 #include <linux/ioport.h> 25 #include <linux/delay.h> 26 #include <linux/migrate.h> 27 #include <linux/page-isolation.h> 28 #include <linux/pfn.h> 29 #include <linux/suspend.h> 30 #include <linux/mm_inline.h> 31 #include <linux/firmware-map.h> 32 #include <linux/stop_machine.h> 33 #include <linux/hugetlb.h> 34 #include <linux/memblock.h> 35 36 #include <asm/tlbflush.h> 37 38 #include "internal.h" 39 40 /* 41 * online_page_callback contains pointer to current page onlining function. 42 * Initially it is generic_online_page(). If it is required it could be 43 * changed by calling set_online_page_callback() for callback registration 44 * and restore_online_page_callback() for generic callback restore. 45 */ 46 47 static void generic_online_page(struct page *page); 48 49 static online_page_callback_t online_page_callback = generic_online_page; 50 51 DEFINE_MUTEX(mem_hotplug_mutex); 52 53 void lock_memory_hotplug(void) 54 { 55 mutex_lock(&mem_hotplug_mutex); 56 } 57 58 void unlock_memory_hotplug(void) 59 { 60 mutex_unlock(&mem_hotplug_mutex); 61 } 62 63 64 /* add this memory to iomem resource */ 65 static struct resource *register_memory_resource(u64 start, u64 size) 66 { 67 struct resource *res; 68 res = kzalloc(sizeof(struct resource), GFP_KERNEL); 69 BUG_ON(!res); 70 71 res->name = "System RAM"; 72 res->start = start; 73 res->end = start + size - 1; 74 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 75 if (request_resource(&iomem_resource, res) < 0) { 76 pr_debug("System RAM resource %pR cannot be added\n", res); 77 kfree(res); 78 res = NULL; 79 } 80 return res; 81 } 82 83 static void release_memory_resource(struct resource *res) 84 { 85 if (!res) 86 return; 87 release_resource(res); 88 kfree(res); 89 return; 90 } 91 92 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 93 void get_page_bootmem(unsigned long info, struct page *page, 94 unsigned long type) 95 { 96 page->lru.next = (struct list_head *) type; 97 SetPagePrivate(page); 98 set_page_private(page, info); 99 atomic_inc(&page->_count); 100 } 101 102 void put_page_bootmem(struct page *page) 103 { 104 unsigned long type; 105 106 type = (unsigned long) page->lru.next; 107 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 108 type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); 109 110 if (atomic_dec_return(&page->_count) == 1) { 111 ClearPagePrivate(page); 112 set_page_private(page, 0); 113 INIT_LIST_HEAD(&page->lru); 114 free_reserved_page(page); 115 } 116 } 117 118 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE 119 #ifndef CONFIG_SPARSEMEM_VMEMMAP 120 static void register_page_bootmem_info_section(unsigned long start_pfn) 121 { 122 unsigned long *usemap, mapsize, section_nr, i; 123 struct mem_section *ms; 124 struct page *page, *memmap; 125 126 section_nr = pfn_to_section_nr(start_pfn); 127 ms = __nr_to_section(section_nr); 128 129 /* Get section's memmap address */ 130 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); 131 132 /* 133 * Get page for the memmap's phys address 134 * XXX: need more 
consideration for sparse_vmemmap... 135 */ 136 page = virt_to_page(memmap); 137 mapsize = sizeof(struct page) * PAGES_PER_SECTION; 138 mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; 139 140 /* remember memmap's page */ 141 for (i = 0; i < mapsize; i++, page++) 142 get_page_bootmem(section_nr, page, SECTION_INFO); 143 144 usemap = __nr_to_section(section_nr)->pageblock_flags; 145 page = virt_to_page(usemap); 146 147 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 148 149 for (i = 0; i < mapsize; i++, page++) 150 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 151 152 } 153 #else /* CONFIG_SPARSEMEM_VMEMMAP */ 154 static void register_page_bootmem_info_section(unsigned long start_pfn) 155 { 156 unsigned long *usemap, mapsize, section_nr, i; 157 struct mem_section *ms; 158 struct page *page, *memmap; 159 160 if (!pfn_valid(start_pfn)) 161 return; 162 163 section_nr = pfn_to_section_nr(start_pfn); 164 ms = __nr_to_section(section_nr); 165 166 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); 167 168 register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); 169 170 usemap = __nr_to_section(section_nr)->pageblock_flags; 171 page = virt_to_page(usemap); 172 173 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 174 175 for (i = 0; i < mapsize; i++, page++) 176 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 177 } 178 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 179 180 void register_page_bootmem_info_node(struct pglist_data *pgdat) 181 { 182 unsigned long i, pfn, end_pfn, nr_pages; 183 int node = pgdat->node_id; 184 struct page *page; 185 struct zone *zone; 186 187 nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; 188 page = virt_to_page(pgdat); 189 190 for (i = 0; i < nr_pages; i++, page++) 191 get_page_bootmem(node, page, NODE_INFO); 192 193 zone = &pgdat->node_zones[0]; 194 for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { 195 if (zone_is_initialized(zone)) { 196 nr_pages = zone->wait_table_hash_nr_entries 197 * sizeof(wait_queue_head_t); 198 nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; 199 page = virt_to_page(zone->wait_table); 200 201 for (i = 0; i < nr_pages; i++, page++) 202 get_page_bootmem(node, page, NODE_INFO); 203 } 204 } 205 206 pfn = pgdat->node_start_pfn; 207 end_pfn = pgdat_end_pfn(pgdat); 208 209 /* register section info */ 210 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 211 /* 212 * Some platforms can assign the same pfn to multiple nodes - on 213 * node0 as well as nodeN. To avoid registering a pfn against 214 * multiple nodes we check that this pfn does not already 215 * reside in some other nodes. 
216 */ 217 if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) 218 register_page_bootmem_info_section(pfn); 219 } 220 } 221 #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ 222 223 static void grow_zone_span(struct zone *zone, unsigned long start_pfn, 224 unsigned long end_pfn) 225 { 226 unsigned long old_zone_end_pfn; 227 228 zone_span_writelock(zone); 229 230 old_zone_end_pfn = zone_end_pfn(zone); 231 if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) 232 zone->zone_start_pfn = start_pfn; 233 234 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - 235 zone->zone_start_pfn; 236 237 zone_span_writeunlock(zone); 238 } 239 240 static void resize_zone(struct zone *zone, unsigned long start_pfn, 241 unsigned long end_pfn) 242 { 243 zone_span_writelock(zone); 244 245 if (end_pfn - start_pfn) { 246 zone->zone_start_pfn = start_pfn; 247 zone->spanned_pages = end_pfn - start_pfn; 248 } else { 249 /* 250 * make it consist as free_area_init_core(), 251 * if spanned_pages = 0, then keep start_pfn = 0 252 */ 253 zone->zone_start_pfn = 0; 254 zone->spanned_pages = 0; 255 } 256 257 zone_span_writeunlock(zone); 258 } 259 260 static void fix_zone_id(struct zone *zone, unsigned long start_pfn, 261 unsigned long end_pfn) 262 { 263 enum zone_type zid = zone_idx(zone); 264 int nid = zone->zone_pgdat->node_id; 265 unsigned long pfn; 266 267 for (pfn = start_pfn; pfn < end_pfn; pfn++) 268 set_page_links(pfn_to_page(pfn), zid, nid, pfn); 269 } 270 271 /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or 272 * alloc_bootmem_node_nopanic() */ 273 static int __ref ensure_zone_is_initialized(struct zone *zone, 274 unsigned long start_pfn, unsigned long num_pages) 275 { 276 if (!zone_is_initialized(zone)) 277 return init_currently_empty_zone(zone, start_pfn, num_pages, 278 MEMMAP_HOTPLUG); 279 return 0; 280 } 281 282 static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, 283 unsigned long start_pfn, unsigned long end_pfn) 284 { 285 int ret; 286 unsigned long flags; 287 unsigned long z1_start_pfn; 288 289 ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); 290 if (ret) 291 return ret; 292 293 pgdat_resize_lock(z1->zone_pgdat, &flags); 294 295 /* can't move pfns which are higher than @z2 */ 296 if (end_pfn > zone_end_pfn(z2)) 297 goto out_fail; 298 /* the move out part must be at the left most of @z2 */ 299 if (start_pfn > z2->zone_start_pfn) 300 goto out_fail; 301 /* must included/overlap */ 302 if (end_pfn <= z2->zone_start_pfn) 303 goto out_fail; 304 305 /* use start_pfn for z1's start_pfn if z1 is empty */ 306 if (!zone_is_empty(z1)) 307 z1_start_pfn = z1->zone_start_pfn; 308 else 309 z1_start_pfn = start_pfn; 310 311 resize_zone(z1, z1_start_pfn, end_pfn); 312 resize_zone(z2, end_pfn, zone_end_pfn(z2)); 313 314 pgdat_resize_unlock(z1->zone_pgdat, &flags); 315 316 fix_zone_id(z1, start_pfn, end_pfn); 317 318 return 0; 319 out_fail: 320 pgdat_resize_unlock(z1->zone_pgdat, &flags); 321 return -1; 322 } 323 324 static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, 325 unsigned long start_pfn, unsigned long end_pfn) 326 { 327 int ret; 328 unsigned long flags; 329 unsigned long z2_end_pfn; 330 331 ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); 332 if (ret) 333 return ret; 334 335 pgdat_resize_lock(z1->zone_pgdat, &flags); 336 337 /* can't move pfns which are lower than @z1 */ 338 if (z1->zone_start_pfn > start_pfn) 339 goto out_fail; 340 /* the move out part mast at the right most of @z1 */ 341 if (zone_end_pfn(z1) 
> end_pfn) 342 goto out_fail; 343 /* must included/overlap */ 344 if (start_pfn >= zone_end_pfn(z1)) 345 goto out_fail; 346 347 /* use end_pfn for z2's end_pfn if z2 is empty */ 348 if (!zone_is_empty(z2)) 349 z2_end_pfn = zone_end_pfn(z2); 350 else 351 z2_end_pfn = end_pfn; 352 353 resize_zone(z1, z1->zone_start_pfn, start_pfn); 354 resize_zone(z2, start_pfn, z2_end_pfn); 355 356 pgdat_resize_unlock(z1->zone_pgdat, &flags); 357 358 fix_zone_id(z2, start_pfn, end_pfn); 359 360 return 0; 361 out_fail: 362 pgdat_resize_unlock(z1->zone_pgdat, &flags); 363 return -1; 364 } 365 366 static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, 367 unsigned long end_pfn) 368 { 369 unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); 370 371 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) 372 pgdat->node_start_pfn = start_pfn; 373 374 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - 375 pgdat->node_start_pfn; 376 } 377 378 static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) 379 { 380 struct pglist_data *pgdat = zone->zone_pgdat; 381 int nr_pages = PAGES_PER_SECTION; 382 int nid = pgdat->node_id; 383 int zone_type; 384 unsigned long flags; 385 int ret; 386 387 zone_type = zone - pgdat->node_zones; 388 ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); 389 if (ret) 390 return ret; 391 392 pgdat_resize_lock(zone->zone_pgdat, &flags); 393 grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); 394 grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, 395 phys_start_pfn + nr_pages); 396 pgdat_resize_unlock(zone->zone_pgdat, &flags); 397 memmap_init_zone(nr_pages, nid, zone_type, 398 phys_start_pfn, MEMMAP_HOTPLUG); 399 return 0; 400 } 401 402 static int __meminit __add_section(int nid, struct zone *zone, 403 unsigned long phys_start_pfn) 404 { 405 int ret; 406 407 if (pfn_valid(phys_start_pfn)) 408 return -EEXIST; 409 410 ret = sparse_add_one_section(zone, phys_start_pfn); 411 412 if (ret < 0) 413 return ret; 414 415 ret = __add_zone(zone, phys_start_pfn); 416 417 if (ret < 0) 418 return ret; 419 420 return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); 421 } 422 423 /* 424 * Reasonably generic function for adding memory. It is 425 * expected that archs that support memory hotplug will 426 * call this function after deciding the zone to which to 427 * add the new pages. 428 */ 429 int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, 430 unsigned long nr_pages) 431 { 432 unsigned long i; 433 int err = 0; 434 int start_sec, end_sec; 435 /* during initialize mem_map, align hot-added range to section */ 436 start_sec = pfn_to_section_nr(phys_start_pfn); 437 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); 438 439 for (i = start_sec; i <= end_sec; i++) { 440 err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); 441 442 /* 443 * EEXIST is finally dealt with by ioresource collision 444 * check. see add_memory() => register_memory_resource() 445 * Warning will be printed if there is collision. 
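 *
 * For scale, a worked example (assuming 4 KiB pages and x86_64's 128 MiB
 * sections, i.e. SECTION_SIZE_BITS == 27): hot-adding 1 GiB at physical
 * address 0x100000000 gives phys_start_pfn == 0x100000 and nr_pages ==
 * 0x40000, so start_sec == 32 and end_sec == 39, and this loop calls
 * __add_section() once for each of the eight 128 MiB sections; any section
 * that already exists simply returns -EEXIST and is skipped.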
446 */ 447 if (err && (err != -EEXIST)) 448 break; 449 err = 0; 450 } 451 452 return err; 453 } 454 EXPORT_SYMBOL_GPL(__add_pages); 455 456 #ifdef CONFIG_MEMORY_HOTREMOVE 457 /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ 458 static int find_smallest_section_pfn(int nid, struct zone *zone, 459 unsigned long start_pfn, 460 unsigned long end_pfn) 461 { 462 struct mem_section *ms; 463 464 for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { 465 ms = __pfn_to_section(start_pfn); 466 467 if (unlikely(!valid_section(ms))) 468 continue; 469 470 if (unlikely(pfn_to_nid(start_pfn) != nid)) 471 continue; 472 473 if (zone && zone != page_zone(pfn_to_page(start_pfn))) 474 continue; 475 476 return start_pfn; 477 } 478 479 return 0; 480 } 481 482 /* find the biggest valid pfn in the range [start_pfn, end_pfn). */ 483 static int find_biggest_section_pfn(int nid, struct zone *zone, 484 unsigned long start_pfn, 485 unsigned long end_pfn) 486 { 487 struct mem_section *ms; 488 unsigned long pfn; 489 490 /* pfn is the end pfn of a memory section. */ 491 pfn = end_pfn - 1; 492 for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { 493 ms = __pfn_to_section(pfn); 494 495 if (unlikely(!valid_section(ms))) 496 continue; 497 498 if (unlikely(pfn_to_nid(pfn) != nid)) 499 continue; 500 501 if (zone && zone != page_zone(pfn_to_page(pfn))) 502 continue; 503 504 return pfn; 505 } 506 507 return 0; 508 } 509 510 static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, 511 unsigned long end_pfn) 512 { 513 unsigned long zone_start_pfn = zone->zone_start_pfn; 514 unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ 515 unsigned long zone_end_pfn = z; 516 unsigned long pfn; 517 struct mem_section *ms; 518 int nid = zone_to_nid(zone); 519 520 zone_span_writelock(zone); 521 if (zone_start_pfn == start_pfn) { 522 /* 523 * If the section is smallest section in the zone, it need 524 * shrink zone->zone_start_pfn and zone->zone_spanned_pages. 525 * In this case, we find second smallest valid mem_section 526 * for shrinking zone. 527 */ 528 pfn = find_smallest_section_pfn(nid, zone, end_pfn, 529 zone_end_pfn); 530 if (pfn) { 531 zone->zone_start_pfn = pfn; 532 zone->spanned_pages = zone_end_pfn - pfn; 533 } 534 } else if (zone_end_pfn == end_pfn) { 535 /* 536 * If the section is biggest section in the zone, it need 537 * shrink zone->spanned_pages. 538 * In this case, we find second biggest valid mem_section for 539 * shrinking zone. 540 */ 541 pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn, 542 start_pfn); 543 if (pfn) 544 zone->spanned_pages = pfn - zone_start_pfn + 1; 545 } 546 547 /* 548 * The section is not biggest or smallest mem_section in the zone, it 549 * only creates a hole in the zone. So in this case, we need not 550 * change the zone. But perhaps, the zone has only hole data. Thus 551 * it check the zone has only hole or not. 
 */
        pfn = zone_start_pfn;
        for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
                ms = __pfn_to_section(pfn);

                if (unlikely(!valid_section(ms)))
                        continue;

                if (page_zone(pfn_to_page(pfn)) != zone)
                        continue;

                /* If this is the section being removed, skip it */
                if (start_pfn == pfn)
                        continue;

                /* We found a valid section: the zone is not empty, nothing to do */
                zone_span_writeunlock(zone);
                return;
        }

        /* The zone has no valid section */
        zone->zone_start_pfn = 0;
        zone->spanned_pages = 0;
        zone_span_writeunlock(zone);
}

static void shrink_pgdat_span(struct pglist_data *pgdat,
                              unsigned long start_pfn, unsigned long end_pfn)
{
        unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
        unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
        unsigned long pgdat_end_pfn = p;
        unsigned long pfn;
        struct mem_section *ms;
        int nid = pgdat->node_id;

        if (pgdat_start_pfn == start_pfn) {
                /*
                 * If the section is the lowest section in the pgdat, we need
                 * to shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
                 * In this case, we find the next-lowest valid mem_section and
                 * shrink the pgdat span to start there.
                 */
                pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
                                                pgdat_end_pfn);
                if (pfn) {
                        pgdat->node_start_pfn = pfn;
                        pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
                }
        } else if (pgdat_end_pfn == end_pfn) {
                /*
                 * If the section is the highest section in the pgdat, we only
                 * need to shrink pgdat->node_spanned_pages. In this case, we
                 * find the next-highest valid mem_section and shrink the pgdat
                 * span down to it.
                 */
                pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
                                               start_pfn);
                if (pfn)
                        pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
        }

        /*
         * If the section is neither the highest nor the lowest mem_section in
         * the pgdat, it only creates a hole in the pgdat, so nothing needs to
         * change. But the pgdat may now consist of holes only, so check
         * whether it still has any valid section.
 */
        pfn = pgdat_start_pfn;
        for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
                ms = __pfn_to_section(pfn);

                if (unlikely(!valid_section(ms)))
                        continue;

                if (pfn_to_nid(pfn) != nid)
                        continue;

                /* If this is the section being removed, skip it */
                if (start_pfn == pfn)
                        continue;

                /* We found a valid section: the pgdat is not empty, nothing to do */
                return;
        }

        /* The pgdat has no valid section */
        pgdat->node_start_pfn = 0;
        pgdat->node_spanned_pages = 0;
}

static void __remove_zone(struct zone *zone, unsigned long start_pfn)
{
        struct pglist_data *pgdat = zone->zone_pgdat;
        int nr_pages = PAGES_PER_SECTION;
        int zone_type;
        unsigned long flags;

        zone_type = zone - pgdat->node_zones;

        pgdat_resize_lock(zone->zone_pgdat, &flags);
        shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
        shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
        pgdat_resize_unlock(zone->zone_pgdat, &flags);
}

static int __remove_section(struct zone *zone, struct mem_section *ms)
{
        unsigned long start_pfn;
        int scn_nr;
        int ret = -EINVAL;

        if (!valid_section(ms))
                return ret;

        ret = unregister_memory_section(ms);
        if (ret)
                return ret;

        scn_nr = __section_nr(ms);
        start_pfn = section_nr_to_pfn(scn_nr);
        __remove_zone(zone, start_pfn);

        sparse_remove_one_section(zone, ms);
        return 0;
}

/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. The caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
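 *
 * As an illustrative sketch only (names are hypothetical, not taken from any
 * particular architecture): an arch's arch_remove_memory() is expected to end
 * up doing roughly
 *
 *      static int example_arch_remove_memory(u64 start, u64 size)
 *      {
 *              unsigned long start_pfn = start >> PAGE_SHIFT;
 *              unsigned long nr_pages = size >> PAGE_SHIFT;
 *              struct zone *zone = page_zone(pfn_to_page(start_pfn));
 *
 *              return __remove_pages(zone, start_pfn, nr_pages);
 *      }
 *
 * plus any arch-specific teardown (e.g. of direct mappings), and only after
 * offline_pages() has already succeeded for the same range.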
690 */ 691 int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, 692 unsigned long nr_pages) 693 { 694 unsigned long i; 695 int sections_to_remove; 696 resource_size_t start, size; 697 int ret = 0; 698 699 /* 700 * We can only remove entire sections 701 */ 702 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 703 BUG_ON(nr_pages % PAGES_PER_SECTION); 704 705 start = phys_start_pfn << PAGE_SHIFT; 706 size = nr_pages * PAGE_SIZE; 707 ret = release_mem_region_adjustable(&iomem_resource, start, size); 708 if (ret) { 709 resource_size_t endres = start + size - 1; 710 711 pr_warn("Unable to release resource <%pa-%pa> (%d)\n", 712 &start, &endres, ret); 713 } 714 715 sections_to_remove = nr_pages / PAGES_PER_SECTION; 716 for (i = 0; i < sections_to_remove; i++) { 717 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 718 ret = __remove_section(zone, __pfn_to_section(pfn)); 719 if (ret) 720 break; 721 } 722 return ret; 723 } 724 EXPORT_SYMBOL_GPL(__remove_pages); 725 #endif /* CONFIG_MEMORY_HOTREMOVE */ 726 727 int set_online_page_callback(online_page_callback_t callback) 728 { 729 int rc = -EINVAL; 730 731 lock_memory_hotplug(); 732 733 if (online_page_callback == generic_online_page) { 734 online_page_callback = callback; 735 rc = 0; 736 } 737 738 unlock_memory_hotplug(); 739 740 return rc; 741 } 742 EXPORT_SYMBOL_GPL(set_online_page_callback); 743 744 int restore_online_page_callback(online_page_callback_t callback) 745 { 746 int rc = -EINVAL; 747 748 lock_memory_hotplug(); 749 750 if (online_page_callback == callback) { 751 online_page_callback = generic_online_page; 752 rc = 0; 753 } 754 755 unlock_memory_hotplug(); 756 757 return rc; 758 } 759 EXPORT_SYMBOL_GPL(restore_online_page_callback); 760 761 void __online_page_set_limits(struct page *page) 762 { 763 } 764 EXPORT_SYMBOL_GPL(__online_page_set_limits); 765 766 void __online_page_increment_counters(struct page *page) 767 { 768 adjust_managed_page_count(page, 1); 769 } 770 EXPORT_SYMBOL_GPL(__online_page_increment_counters); 771 772 void __online_page_free(struct page *page) 773 { 774 __free_reserved_page(page); 775 } 776 EXPORT_SYMBOL_GPL(__online_page_free); 777 778 static void generic_online_page(struct page *page) 779 { 780 __online_page_set_limits(page); 781 __online_page_increment_counters(page); 782 __online_page_free(page); 783 } 784 785 static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, 786 void *arg) 787 { 788 unsigned long i; 789 unsigned long onlined_pages = *(unsigned long *)arg; 790 struct page *page; 791 if (PageReserved(pfn_to_page(start_pfn))) 792 for (i = 0; i < nr_pages; i++) { 793 page = pfn_to_page(start_pfn + i); 794 (*online_page_callback)(page); 795 onlined_pages++; 796 } 797 *(unsigned long *)arg = onlined_pages; 798 return 0; 799 } 800 801 #ifdef CONFIG_MOVABLE_NODE 802 /* 803 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have 804 * normal memory. 
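 * (For example, a node whose memory was onlined entirely as ONLINE_MOVABLE,
 * so that it contains nothing but ZONE_MOVABLE pages.)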
805 */ 806 static bool can_online_high_movable(struct zone *zone) 807 { 808 return true; 809 } 810 #else /* CONFIG_MOVABLE_NODE */ 811 /* ensure every online node has NORMAL memory */ 812 static bool can_online_high_movable(struct zone *zone) 813 { 814 return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); 815 } 816 #endif /* CONFIG_MOVABLE_NODE */ 817 818 /* check which state of node_states will be changed when online memory */ 819 static void node_states_check_changes_online(unsigned long nr_pages, 820 struct zone *zone, struct memory_notify *arg) 821 { 822 int nid = zone_to_nid(zone); 823 enum zone_type zone_last = ZONE_NORMAL; 824 825 /* 826 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] 827 * contains nodes which have zones of 0...ZONE_NORMAL, 828 * set zone_last to ZONE_NORMAL. 829 * 830 * If we don't have HIGHMEM nor movable node, 831 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of 832 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. 833 */ 834 if (N_MEMORY == N_NORMAL_MEMORY) 835 zone_last = ZONE_MOVABLE; 836 837 /* 838 * if the memory to be online is in a zone of 0...zone_last, and 839 * the zones of 0...zone_last don't have memory before online, we will 840 * need to set the node to node_states[N_NORMAL_MEMORY] after 841 * the memory is online. 842 */ 843 if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) 844 arg->status_change_nid_normal = nid; 845 else 846 arg->status_change_nid_normal = -1; 847 848 #ifdef CONFIG_HIGHMEM 849 /* 850 * If we have movable node, node_states[N_HIGH_MEMORY] 851 * contains nodes which have zones of 0...ZONE_HIGHMEM, 852 * set zone_last to ZONE_HIGHMEM. 853 * 854 * If we don't have movable node, node_states[N_NORMAL_MEMORY] 855 * contains nodes which have zones of 0...ZONE_MOVABLE, 856 * set zone_last to ZONE_MOVABLE. 857 */ 858 zone_last = ZONE_HIGHMEM; 859 if (N_MEMORY == N_HIGH_MEMORY) 860 zone_last = ZONE_MOVABLE; 861 862 if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY)) 863 arg->status_change_nid_high = nid; 864 else 865 arg->status_change_nid_high = -1; 866 #else 867 arg->status_change_nid_high = arg->status_change_nid_normal; 868 #endif 869 870 /* 871 * if the node don't have memory befor online, we will need to 872 * set the node to node_states[N_MEMORY] after the memory 873 * is online. 874 */ 875 if (!node_state(nid, N_MEMORY)) 876 arg->status_change_nid = nid; 877 else 878 arg->status_change_nid = -1; 879 } 880 881 static void node_states_set_node(int node, struct memory_notify *arg) 882 { 883 if (arg->status_change_nid_normal >= 0) 884 node_set_state(node, N_NORMAL_MEMORY); 885 886 if (arg->status_change_nid_high >= 0) 887 node_set_state(node, N_HIGH_MEMORY); 888 889 node_set_state(node, N_MEMORY); 890 } 891 892 893 int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) 894 { 895 unsigned long flags; 896 unsigned long onlined_pages = 0; 897 struct zone *zone; 898 int need_zonelists_rebuild = 0; 899 int nid; 900 int ret; 901 struct memory_notify arg; 902 903 lock_memory_hotplug(); 904 /* 905 * This doesn't need a lock to do pfn_to_page(). 906 * The section can't be removed here because of the 907 * memory_block->state_mutex. 
908 */ 909 zone = page_zone(pfn_to_page(pfn)); 910 911 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && 912 !can_online_high_movable(zone)) { 913 unlock_memory_hotplug(); 914 return -EINVAL; 915 } 916 917 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { 918 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { 919 unlock_memory_hotplug(); 920 return -EINVAL; 921 } 922 } 923 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { 924 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { 925 unlock_memory_hotplug(); 926 return -EINVAL; 927 } 928 } 929 930 /* Previous code may changed the zone of the pfn range */ 931 zone = page_zone(pfn_to_page(pfn)); 932 933 arg.start_pfn = pfn; 934 arg.nr_pages = nr_pages; 935 node_states_check_changes_online(nr_pages, zone, &arg); 936 937 nid = pfn_to_nid(pfn); 938 939 ret = memory_notify(MEM_GOING_ONLINE, &arg); 940 ret = notifier_to_errno(ret); 941 if (ret) { 942 memory_notify(MEM_CANCEL_ONLINE, &arg); 943 unlock_memory_hotplug(); 944 return ret; 945 } 946 /* 947 * If this zone is not populated, then it is not in zonelist. 948 * This means the page allocator ignores this zone. 949 * So, zonelist must be updated after online. 950 */ 951 mutex_lock(&zonelists_mutex); 952 if (!populated_zone(zone)) { 953 need_zonelists_rebuild = 1; 954 build_all_zonelists(NULL, zone); 955 } 956 957 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 958 online_pages_range); 959 if (ret) { 960 if (need_zonelists_rebuild) 961 zone_pcp_reset(zone); 962 mutex_unlock(&zonelists_mutex); 963 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", 964 (unsigned long long) pfn << PAGE_SHIFT, 965 (((unsigned long long) pfn + nr_pages) 966 << PAGE_SHIFT) - 1); 967 memory_notify(MEM_CANCEL_ONLINE, &arg); 968 unlock_memory_hotplug(); 969 return ret; 970 } 971 972 zone->present_pages += onlined_pages; 973 974 pgdat_resize_lock(zone->zone_pgdat, &flags); 975 zone->zone_pgdat->node_present_pages += onlined_pages; 976 pgdat_resize_unlock(zone->zone_pgdat, &flags); 977 978 if (onlined_pages) { 979 node_states_set_node(zone_to_nid(zone), &arg); 980 if (need_zonelists_rebuild) 981 build_all_zonelists(NULL, NULL); 982 else 983 zone_pcp_update(zone); 984 } 985 986 mutex_unlock(&zonelists_mutex); 987 988 init_per_zone_wmark_min(); 989 990 if (onlined_pages) 991 kswapd_run(zone_to_nid(zone)); 992 993 vm_total_pages = nr_free_pagecache_pages(); 994 995 writeback_set_ratelimit(); 996 997 if (onlined_pages) 998 memory_notify(MEM_ONLINE, &arg); 999 unlock_memory_hotplug(); 1000 1001 return 0; 1002 } 1003 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ 1004 1005 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1006 static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) 1007 { 1008 struct pglist_data *pgdat; 1009 unsigned long zones_size[MAX_NR_ZONES] = {0}; 1010 unsigned long zholes_size[MAX_NR_ZONES] = {0}; 1011 unsigned long start_pfn = start >> PAGE_SHIFT; 1012 1013 pgdat = NODE_DATA(nid); 1014 if (!pgdat) { 1015 pgdat = arch_alloc_nodedata(nid); 1016 if (!pgdat) 1017 return NULL; 1018 1019 arch_refresh_nodedata(nid, pgdat); 1020 } 1021 1022 /* we can use NODE_DATA(nid) from here */ 1023 1024 /* init node's zones as empty zones, we don't have any present pages.*/ 1025 free_area_init_node(nid, zones_size, start_pfn, zholes_size); 1026 1027 /* 1028 * The node we allocated has no zone fallback lists. For avoiding 1029 * to access not-initialized zonelist, build here. 
1030 */ 1031 mutex_lock(&zonelists_mutex); 1032 build_all_zonelists(pgdat, NULL); 1033 mutex_unlock(&zonelists_mutex); 1034 1035 return pgdat; 1036 } 1037 1038 static void rollback_node_hotadd(int nid, pg_data_t *pgdat) 1039 { 1040 arch_refresh_nodedata(nid, NULL); 1041 arch_free_nodedata(pgdat); 1042 return; 1043 } 1044 1045 1046 /** 1047 * try_online_node - online a node if offlined 1048 * 1049 * called by cpu_up() to online a node without onlined memory. 1050 */ 1051 int try_online_node(int nid) 1052 { 1053 pg_data_t *pgdat; 1054 int ret; 1055 1056 if (node_online(nid)) 1057 return 0; 1058 1059 lock_memory_hotplug(); 1060 pgdat = hotadd_new_pgdat(nid, 0); 1061 if (!pgdat) { 1062 pr_err("Cannot online node %d due to NULL pgdat\n", nid); 1063 ret = -ENOMEM; 1064 goto out; 1065 } 1066 node_set_online(nid); 1067 ret = register_one_node(nid); 1068 BUG_ON(ret); 1069 1070 if (pgdat->node_zonelists->_zonerefs->zone == NULL) { 1071 mutex_lock(&zonelists_mutex); 1072 build_all_zonelists(NULL, NULL); 1073 mutex_unlock(&zonelists_mutex); 1074 } 1075 1076 out: 1077 unlock_memory_hotplug(); 1078 return ret; 1079 } 1080 1081 static int check_hotplug_memory_range(u64 start, u64 size) 1082 { 1083 u64 start_pfn = start >> PAGE_SHIFT; 1084 u64 nr_pages = size >> PAGE_SHIFT; 1085 1086 /* Memory range must be aligned with section */ 1087 if ((start_pfn & ~PAGE_SECTION_MASK) || 1088 (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) { 1089 pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n", 1090 (unsigned long long)start, 1091 (unsigned long long)size); 1092 return -EINVAL; 1093 } 1094 1095 return 0; 1096 } 1097 1098 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1099 int __ref add_memory(int nid, u64 start, u64 size) 1100 { 1101 pg_data_t *pgdat = NULL; 1102 bool new_pgdat; 1103 bool new_node; 1104 struct resource *res; 1105 int ret; 1106 1107 ret = check_hotplug_memory_range(start, size); 1108 if (ret) 1109 return ret; 1110 1111 lock_memory_hotplug(); 1112 1113 res = register_memory_resource(start, size); 1114 ret = -EEXIST; 1115 if (!res) 1116 goto out; 1117 1118 { /* Stupid hack to suppress address-never-null warning */ 1119 void *p = NODE_DATA(nid); 1120 new_pgdat = !p; 1121 } 1122 new_node = !node_online(nid); 1123 if (new_node) { 1124 pgdat = hotadd_new_pgdat(nid, start); 1125 ret = -ENOMEM; 1126 if (!pgdat) 1127 goto error; 1128 } 1129 1130 /* call arch's memory hotadd */ 1131 ret = arch_add_memory(nid, start, size); 1132 1133 if (ret < 0) 1134 goto error; 1135 1136 /* we online node here. we can't roll back from here. */ 1137 node_set_online(nid); 1138 1139 if (new_node) { 1140 ret = register_one_node(nid); 1141 /* 1142 * If sysfs file of new node can't create, cpu on the node 1143 * can't be hot-added. There is no rollback way now. 1144 * So, check by BUG_ON() to catch it reluctantly.. 1145 */ 1146 BUG_ON(ret); 1147 } 1148 1149 /* create new memmap entry */ 1150 firmware_map_add_hotplug(start, start + size, "System RAM"); 1151 1152 goto out; 1153 1154 error: 1155 /* rollback pgdat allocation and others */ 1156 if (new_pgdat) 1157 rollback_node_hotadd(nid, pgdat); 1158 release_memory_resource(res); 1159 1160 out: 1161 unlock_memory_hotplug(); 1162 return ret; 1163 } 1164 EXPORT_SYMBOL_GPL(add_memory); 1165 1166 #ifdef CONFIG_MEMORY_HOTREMOVE 1167 /* 1168 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy 1169 * set and the size of the free page is given by page_order(). 
 * Using this, the function determines whether the pageblock contains only
 * free pages.
 * Due to buddy constraints, a free page at least the size of a pageblock will
 * be located at the start of the pageblock.
 */
static inline int pageblock_free(struct page *page)
{
        return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
        /* Ensure the starting page is pageblock-aligned */
        BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));

        /* If the entire pageblock is free, move to the end of the free page */
        if (pageblock_free(page)) {
                int order;
                /* be careful. we don't have locks, page_order can be changed.*/
                order = page_order(page);
                if ((order < MAX_ORDER) && (order >= pageblock_order))
                        return page + (1 << order);
        }

        return page + pageblock_nr_pages;
}

/* Checks if this range of memory is likely to be hot-removable. */
int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
        struct page *page = pfn_to_page(start_pfn);
        struct page *end_page = page + nr_pages;

        /* Check the starting page of each pageblock within the range */
        for (; page < end_page; page = next_active_pageblock(page)) {
                if (!is_pageblock_removable_nolock(page))
                        return 0;
                cond_resched();
        }

        /* All pageblocks in the memory block are likely to be hot-removable */
        return 1;
}

/*
 * Confirm that all pages in the range [start, end) belong to the same zone.
 */
static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
        unsigned long pfn;
        struct zone *zone = NULL;
        struct page *page;
        int i;
        for (pfn = start_pfn;
             pfn < end_pfn;
             pfn += MAX_ORDER_NR_PAGES) {
                i = 0;
                /* This is just a CONFIG_HOLES_IN_ZONE check.*/
                while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
                        i++;
                if (i == MAX_ORDER_NR_PAGES)
                        continue;
                page = pfn_to_page(pfn + i);
                if (zone && page_zone(page) != zone)
                        return 0;
                zone = page_zone(page);
        }
        return 1;
}

/*
 * Scan the pfn range [start,end) to find movable/migratable pages (LRU pages
 * and hugepages). We scan by pfn because it's much easier than walking a
 * linked list. This function returns the pfn of the first found movable
 * page if one is found, otherwise 0.
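 *
 * A worked illustration of the hugepage skip below (assuming 4 KiB base
 * pages and a 2 MiB hugepage, i.e. compound_order() == 9, 512 base pages):
 * if the scan hits the head page of such a hugepage at pfn 0x1200, then
 * round_up(pfn + 1, 1 << compound_order(page)) - 1 evaluates to
 * round_up(0x1201, 0x200) - 1 == 0x13ff, so after the loop's pfn++ the
 * scan resumes at pfn 0x1400, just past the hugepage.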
1245 */ 1246 static unsigned long scan_movable_pages(unsigned long start, unsigned long end) 1247 { 1248 unsigned long pfn; 1249 struct page *page; 1250 for (pfn = start; pfn < end; pfn++) { 1251 if (pfn_valid(pfn)) { 1252 page = pfn_to_page(pfn); 1253 if (PageLRU(page)) 1254 return pfn; 1255 if (PageHuge(page)) { 1256 if (is_hugepage_active(page)) 1257 return pfn; 1258 else 1259 pfn = round_up(pfn + 1, 1260 1 << compound_order(page)) - 1; 1261 } 1262 } 1263 } 1264 return 0; 1265 } 1266 1267 #define NR_OFFLINE_AT_ONCE_PAGES (256) 1268 static int 1269 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 1270 { 1271 unsigned long pfn; 1272 struct page *page; 1273 int move_pages = NR_OFFLINE_AT_ONCE_PAGES; 1274 int not_managed = 0; 1275 int ret = 0; 1276 LIST_HEAD(source); 1277 1278 for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { 1279 if (!pfn_valid(pfn)) 1280 continue; 1281 page = pfn_to_page(pfn); 1282 1283 if (PageHuge(page)) { 1284 struct page *head = compound_head(page); 1285 pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; 1286 if (compound_order(head) > PFN_SECTION_SHIFT) { 1287 ret = -EBUSY; 1288 break; 1289 } 1290 if (isolate_huge_page(page, &source)) 1291 move_pages -= 1 << compound_order(head); 1292 continue; 1293 } 1294 1295 if (!get_page_unless_zero(page)) 1296 continue; 1297 /* 1298 * We can skip free pages. And we can only deal with pages on 1299 * LRU. 1300 */ 1301 ret = isolate_lru_page(page); 1302 if (!ret) { /* Success */ 1303 put_page(page); 1304 list_add_tail(&page->lru, &source); 1305 move_pages--; 1306 inc_zone_page_state(page, NR_ISOLATED_ANON + 1307 page_is_file_cache(page)); 1308 1309 } else { 1310 #ifdef CONFIG_DEBUG_VM 1311 printk(KERN_ALERT "removing pfn %lx from LRU failed\n", 1312 pfn); 1313 dump_page(page); 1314 #endif 1315 put_page(page); 1316 /* Because we don't have big zone->lock. we should 1317 check this again here. */ 1318 if (page_count(page)) { 1319 not_managed++; 1320 ret = -EBUSY; 1321 break; 1322 } 1323 } 1324 } 1325 if (!list_empty(&source)) { 1326 if (not_managed) { 1327 putback_movable_pages(&source); 1328 goto out; 1329 } 1330 1331 /* 1332 * alloc_migrate_target should be improooooved!! 1333 * migrate_pages returns # of failed pages. 1334 */ 1335 ret = migrate_pages(&source, alloc_migrate_target, 0, 1336 MIGRATE_SYNC, MR_MEMORY_HOTPLUG); 1337 if (ret) 1338 putback_movable_pages(&source); 1339 } 1340 out: 1341 return ret; 1342 } 1343 1344 /* 1345 * remove from free_area[] and mark all as Reserved. 1346 */ 1347 static int 1348 offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, 1349 void *data) 1350 { 1351 __offline_isolated_pages(start, start + nr_pages); 1352 return 0; 1353 } 1354 1355 static void 1356 offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 1357 { 1358 walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, 1359 offline_isolated_pages_cb); 1360 } 1361 1362 /* 1363 * Check all pages in range, recoreded as memory resource, are isolated. 
1364 */ 1365 static int 1366 check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, 1367 void *data) 1368 { 1369 int ret; 1370 long offlined = *(long *)data; 1371 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); 1372 offlined = nr_pages; 1373 if (!ret) 1374 *(long *)data += offlined; 1375 return ret; 1376 } 1377 1378 static long 1379 check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 1380 { 1381 long offlined = 0; 1382 int ret; 1383 1384 ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, 1385 check_pages_isolated_cb); 1386 if (ret < 0) 1387 offlined = (long)ret; 1388 return offlined; 1389 } 1390 1391 #ifdef CONFIG_MOVABLE_NODE 1392 /* 1393 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have 1394 * normal memory. 1395 */ 1396 static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) 1397 { 1398 return true; 1399 } 1400 #else /* CONFIG_MOVABLE_NODE */ 1401 /* ensure the node has NORMAL memory if it is still online */ 1402 static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) 1403 { 1404 struct pglist_data *pgdat = zone->zone_pgdat; 1405 unsigned long present_pages = 0; 1406 enum zone_type zt; 1407 1408 for (zt = 0; zt <= ZONE_NORMAL; zt++) 1409 present_pages += pgdat->node_zones[zt].present_pages; 1410 1411 if (present_pages > nr_pages) 1412 return true; 1413 1414 present_pages = 0; 1415 for (; zt <= ZONE_MOVABLE; zt++) 1416 present_pages += pgdat->node_zones[zt].present_pages; 1417 1418 /* 1419 * we can't offline the last normal memory until all 1420 * higher memory is offlined. 1421 */ 1422 return present_pages == 0; 1423 } 1424 #endif /* CONFIG_MOVABLE_NODE */ 1425 1426 static int __init cmdline_parse_movable_node(char *p) 1427 { 1428 #ifdef CONFIG_MOVABLE_NODE 1429 /* 1430 * Memory used by the kernel cannot be hot-removed because Linux 1431 * cannot migrate the kernel pages. When memory hotplug is 1432 * enabled, we should prevent memblock from allocating memory 1433 * for the kernel. 1434 * 1435 * ACPI SRAT records all hotpluggable memory ranges. But before 1436 * SRAT is parsed, we don't know about it. 1437 * 1438 * The kernel image is loaded into memory at very early time. We 1439 * cannot prevent this anyway. So on NUMA system, we set any 1440 * node the kernel resides in as un-hotpluggable. 1441 * 1442 * Since on modern servers, one node could have double-digit 1443 * gigabytes memory, we can assume the memory around the kernel 1444 * image is also un-hotpluggable. So before SRAT is parsed, just 1445 * allocate memory near the kernel image to try the best to keep 1446 * the kernel away from hotpluggable memory. 1447 */ 1448 memblock_set_bottom_up(true); 1449 #else 1450 pr_warn("movable_node option not supported\n"); 1451 #endif 1452 return 0; 1453 } 1454 early_param("movable_node", cmdline_parse_movable_node); 1455 1456 /* check which state of node_states will be changed when offline memory */ 1457 static void node_states_check_changes_offline(unsigned long nr_pages, 1458 struct zone *zone, struct memory_notify *arg) 1459 { 1460 struct pglist_data *pgdat = zone->zone_pgdat; 1461 unsigned long present_pages = 0; 1462 enum zone_type zt, zone_last = ZONE_NORMAL; 1463 1464 /* 1465 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] 1466 * contains nodes which have zones of 0...ZONE_NORMAL, 1467 * set zone_last to ZONE_NORMAL. 
1468 * 1469 * If we don't have HIGHMEM nor movable node, 1470 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of 1471 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. 1472 */ 1473 if (N_MEMORY == N_NORMAL_MEMORY) 1474 zone_last = ZONE_MOVABLE; 1475 1476 /* 1477 * check whether node_states[N_NORMAL_MEMORY] will be changed. 1478 * If the memory to be offline is in a zone of 0...zone_last, 1479 * and it is the last present memory, 0...zone_last will 1480 * become empty after offline , thus we can determind we will 1481 * need to clear the node from node_states[N_NORMAL_MEMORY]. 1482 */ 1483 for (zt = 0; zt <= zone_last; zt++) 1484 present_pages += pgdat->node_zones[zt].present_pages; 1485 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) 1486 arg->status_change_nid_normal = zone_to_nid(zone); 1487 else 1488 arg->status_change_nid_normal = -1; 1489 1490 #ifdef CONFIG_HIGHMEM 1491 /* 1492 * If we have movable node, node_states[N_HIGH_MEMORY] 1493 * contains nodes which have zones of 0...ZONE_HIGHMEM, 1494 * set zone_last to ZONE_HIGHMEM. 1495 * 1496 * If we don't have movable node, node_states[N_NORMAL_MEMORY] 1497 * contains nodes which have zones of 0...ZONE_MOVABLE, 1498 * set zone_last to ZONE_MOVABLE. 1499 */ 1500 zone_last = ZONE_HIGHMEM; 1501 if (N_MEMORY == N_HIGH_MEMORY) 1502 zone_last = ZONE_MOVABLE; 1503 1504 for (; zt <= zone_last; zt++) 1505 present_pages += pgdat->node_zones[zt].present_pages; 1506 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) 1507 arg->status_change_nid_high = zone_to_nid(zone); 1508 else 1509 arg->status_change_nid_high = -1; 1510 #else 1511 arg->status_change_nid_high = arg->status_change_nid_normal; 1512 #endif 1513 1514 /* 1515 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE 1516 */ 1517 zone_last = ZONE_MOVABLE; 1518 1519 /* 1520 * check whether node_states[N_HIGH_MEMORY] will be changed 1521 * If we try to offline the last present @nr_pages from the node, 1522 * we can determind we will need to clear the node from 1523 * node_states[N_HIGH_MEMORY]. 1524 */ 1525 for (; zt <= zone_last; zt++) 1526 present_pages += pgdat->node_zones[zt].present_pages; 1527 if (nr_pages >= present_pages) 1528 arg->status_change_nid = zone_to_nid(zone); 1529 else 1530 arg->status_change_nid = -1; 1531 } 1532 1533 static void node_states_clear_node(int node, struct memory_notify *arg) 1534 { 1535 if (arg->status_change_nid_normal >= 0) 1536 node_clear_state(node, N_NORMAL_MEMORY); 1537 1538 if ((N_MEMORY != N_NORMAL_MEMORY) && 1539 (arg->status_change_nid_high >= 0)) 1540 node_clear_state(node, N_HIGH_MEMORY); 1541 1542 if ((N_MEMORY != N_HIGH_MEMORY) && 1543 (arg->status_change_nid >= 0)) 1544 node_clear_state(node, N_MEMORY); 1545 } 1546 1547 static int __ref __offline_pages(unsigned long start_pfn, 1548 unsigned long end_pfn, unsigned long timeout) 1549 { 1550 unsigned long pfn, nr_pages, expire; 1551 long offlined_pages; 1552 int ret, drain, retry_max, node; 1553 unsigned long flags; 1554 struct zone *zone; 1555 struct memory_notify arg; 1556 1557 /* at least, alignment against pageblock is necessary */ 1558 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) 1559 return -EINVAL; 1560 if (!IS_ALIGNED(end_pfn, pageblock_nr_pages)) 1561 return -EINVAL; 1562 /* This makes hotplug much easier...and readable. 1563 we assume this for now. 
.*/ 1564 if (!test_pages_in_a_zone(start_pfn, end_pfn)) 1565 return -EINVAL; 1566 1567 lock_memory_hotplug(); 1568 1569 zone = page_zone(pfn_to_page(start_pfn)); 1570 node = zone_to_nid(zone); 1571 nr_pages = end_pfn - start_pfn; 1572 1573 ret = -EINVAL; 1574 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) 1575 goto out; 1576 1577 /* set above range as isolated */ 1578 ret = start_isolate_page_range(start_pfn, end_pfn, 1579 MIGRATE_MOVABLE, true); 1580 if (ret) 1581 goto out; 1582 1583 arg.start_pfn = start_pfn; 1584 arg.nr_pages = nr_pages; 1585 node_states_check_changes_offline(nr_pages, zone, &arg); 1586 1587 ret = memory_notify(MEM_GOING_OFFLINE, &arg); 1588 ret = notifier_to_errno(ret); 1589 if (ret) 1590 goto failed_removal; 1591 1592 pfn = start_pfn; 1593 expire = jiffies + timeout; 1594 drain = 0; 1595 retry_max = 5; 1596 repeat: 1597 /* start memory hot removal */ 1598 ret = -EAGAIN; 1599 if (time_after(jiffies, expire)) 1600 goto failed_removal; 1601 ret = -EINTR; 1602 if (signal_pending(current)) 1603 goto failed_removal; 1604 ret = 0; 1605 if (drain) { 1606 lru_add_drain_all(); 1607 cond_resched(); 1608 drain_all_pages(); 1609 } 1610 1611 pfn = scan_movable_pages(start_pfn, end_pfn); 1612 if (pfn) { /* We have movable pages */ 1613 ret = do_migrate_range(pfn, end_pfn); 1614 if (!ret) { 1615 drain = 1; 1616 goto repeat; 1617 } else { 1618 if (ret < 0) 1619 if (--retry_max == 0) 1620 goto failed_removal; 1621 yield(); 1622 drain = 1; 1623 goto repeat; 1624 } 1625 } 1626 /* drain all zone's lru pagevec, this is asynchronous... */ 1627 lru_add_drain_all(); 1628 yield(); 1629 /* drain pcp pages, this is synchronous. */ 1630 drain_all_pages(); 1631 /* 1632 * dissolve free hugepages in the memory block before doing offlining 1633 * actually in order to make hugetlbfs's object counting consistent. 1634 */ 1635 dissolve_free_huge_pages(start_pfn, end_pfn); 1636 /* check again */ 1637 offlined_pages = check_pages_isolated(start_pfn, end_pfn); 1638 if (offlined_pages < 0) { 1639 ret = -EBUSY; 1640 goto failed_removal; 1641 } 1642 printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); 1643 /* Ok, all of our target is isolated. 1644 We cannot do rollback at this point. 
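           From here on the sequence below is: offline_isolated_pages() takes
           the isolated pages off the buddy free lists and marks them reserved,
           undo_isolate_page_range() restores the MIGRATE_MOVABLE pageblock
           type, and the managed/present page counters of the zone and its
           node are reduced by offlined_pages.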
*/ 1645 offline_isolated_pages(start_pfn, end_pfn); 1646 /* reset pagetype flags and makes migrate type to be MOVABLE */ 1647 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1648 /* removal success */ 1649 adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); 1650 zone->present_pages -= offlined_pages; 1651 1652 pgdat_resize_lock(zone->zone_pgdat, &flags); 1653 zone->zone_pgdat->node_present_pages -= offlined_pages; 1654 pgdat_resize_unlock(zone->zone_pgdat, &flags); 1655 1656 init_per_zone_wmark_min(); 1657 1658 if (!populated_zone(zone)) { 1659 zone_pcp_reset(zone); 1660 mutex_lock(&zonelists_mutex); 1661 build_all_zonelists(NULL, NULL); 1662 mutex_unlock(&zonelists_mutex); 1663 } else 1664 zone_pcp_update(zone); 1665 1666 node_states_clear_node(node, &arg); 1667 if (arg.status_change_nid >= 0) 1668 kswapd_stop(node); 1669 1670 vm_total_pages = nr_free_pagecache_pages(); 1671 writeback_set_ratelimit(); 1672 1673 memory_notify(MEM_OFFLINE, &arg); 1674 unlock_memory_hotplug(); 1675 return 0; 1676 1677 failed_removal: 1678 printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n", 1679 (unsigned long long) start_pfn << PAGE_SHIFT, 1680 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); 1681 memory_notify(MEM_CANCEL_OFFLINE, &arg); 1682 /* pushback to free area */ 1683 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1684 1685 out: 1686 unlock_memory_hotplug(); 1687 return ret; 1688 } 1689 1690 int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 1691 { 1692 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 1693 } 1694 #endif /* CONFIG_MEMORY_HOTREMOVE */ 1695 1696 /** 1697 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) 1698 * @start_pfn: start pfn of the memory range 1699 * @end_pfn: end pfn of the memory range 1700 * @arg: argument passed to func 1701 * @func: callback for each memory section walked 1702 * 1703 * This function walks through all present mem sections in range 1704 * [start_pfn, end_pfn) and call func on each mem section. 1705 * 1706 * Returns the return value of func. 1707 */ 1708 int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, 1709 void *arg, int (*func)(struct memory_block *, void *)) 1710 { 1711 struct memory_block *mem = NULL; 1712 struct mem_section *section; 1713 unsigned long pfn, section_nr; 1714 int ret; 1715 1716 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 1717 section_nr = pfn_to_section_nr(pfn); 1718 if (!present_section_nr(section_nr)) 1719 continue; 1720 1721 section = __nr_to_section(section_nr); 1722 /* same memblock? 
*/ 1723 if (mem) 1724 if ((section_nr >= mem->start_section_nr) && 1725 (section_nr <= mem->end_section_nr)) 1726 continue; 1727 1728 mem = find_memory_block_hinted(section, mem); 1729 if (!mem) 1730 continue; 1731 1732 ret = func(mem, arg); 1733 if (ret) { 1734 kobject_put(&mem->dev.kobj); 1735 return ret; 1736 } 1737 } 1738 1739 if (mem) 1740 kobject_put(&mem->dev.kobj); 1741 1742 return 0; 1743 } 1744 1745 #ifdef CONFIG_MEMORY_HOTREMOVE 1746 static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) 1747 { 1748 int ret = !is_memblock_offlined(mem); 1749 1750 if (unlikely(ret)) { 1751 phys_addr_t beginpa, endpa; 1752 1753 beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); 1754 endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; 1755 pr_warn("removing memory fails, because memory " 1756 "[%pa-%pa] is onlined\n", 1757 &beginpa, &endpa); 1758 } 1759 1760 return ret; 1761 } 1762 1763 static int check_cpu_on_node(pg_data_t *pgdat) 1764 { 1765 int cpu; 1766 1767 for_each_present_cpu(cpu) { 1768 if (cpu_to_node(cpu) == pgdat->node_id) 1769 /* 1770 * the cpu on this node isn't removed, and we can't 1771 * offline this node. 1772 */ 1773 return -EBUSY; 1774 } 1775 1776 return 0; 1777 } 1778 1779 static void unmap_cpu_on_node(pg_data_t *pgdat) 1780 { 1781 #ifdef CONFIG_ACPI_NUMA 1782 int cpu; 1783 1784 for_each_possible_cpu(cpu) 1785 if (cpu_to_node(cpu) == pgdat->node_id) 1786 numa_clear_node(cpu); 1787 #endif 1788 } 1789 1790 static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) 1791 { 1792 int ret; 1793 1794 ret = check_cpu_on_node(pgdat); 1795 if (ret) 1796 return ret; 1797 1798 /* 1799 * the node will be offlined when we come here, so we can clear 1800 * the cpu_to_node() now. 1801 */ 1802 1803 unmap_cpu_on_node(pgdat); 1804 return 0; 1805 } 1806 1807 /** 1808 * try_offline_node 1809 * 1810 * Offline a node if all memory sections and cpus of the node are removed. 1811 * 1812 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 1813 * and online/offline operations before this call. 1814 */ 1815 void try_offline_node(int nid) 1816 { 1817 pg_data_t *pgdat = NODE_DATA(nid); 1818 unsigned long start_pfn = pgdat->node_start_pfn; 1819 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; 1820 unsigned long pfn; 1821 struct page *pgdat_page = virt_to_page(pgdat); 1822 int i; 1823 1824 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 1825 unsigned long section_nr = pfn_to_section_nr(pfn); 1826 1827 if (!present_section_nr(section_nr)) 1828 continue; 1829 1830 if (pfn_to_nid(pfn) != nid) 1831 continue; 1832 1833 /* 1834 * some memory sections of this node are not removed, and we 1835 * can't offline node now. 1836 */ 1837 return; 1838 } 1839 1840 if (check_and_unmap_cpu_on_node(pgdat)) 1841 return; 1842 1843 /* 1844 * all memory/cpu of this node are removed, we can offline this 1845 * node now. 1846 */ 1847 node_set_offline(nid); 1848 unregister_one_node(nid); 1849 1850 if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page)) 1851 /* node data is allocated from boot memory */ 1852 return; 1853 1854 /* free waittable in each zone */ 1855 for (i = 0; i < MAX_NR_ZONES; i++) { 1856 struct zone *zone = pgdat->node_zones + i; 1857 1858 /* 1859 * wait_table may be allocated from boot memory, 1860 * here only free if it's allocated by vmalloc. 
 */
                if (is_vmalloc_addr(zone->wait_table))
                        vfree(zone->wait_table);
        }

        /*
         * Since there is no way to guarantee that the address of pgdat/zone
         * is not on the stack of any kernel thread or used by other kernel
         * objects without reference counting or another synchronizing method,
         * do not reset node_data and free pgdat here. Just reset it to 0 and
         * reuse the memory when the node is online again.
         */
        memset(pgdat, 0, sizeof(*pgdat));
}
EXPORT_SYMBOL(try_offline_node);

/**
 * remove_memory
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call, as required by
 * try_offline_node().
 */
void __ref remove_memory(int nid, u64 start, u64 size)
{
        int ret;

        BUG_ON(check_hotplug_memory_range(start, size));

        lock_memory_hotplug();

        /*
         * All memory blocks must be offlined before removing memory. Check
         * whether all memory blocks in question are offline and trigger a BUG()
         * if this is not the case.
         */
        ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
                                check_memblock_offlined_cb);
        if (ret) {
                unlock_memory_hotplug();
                BUG();
        }

        /* remove memmap entry */
        firmware_map_remove(start, start + size, "System RAM");

        arch_remove_memory(start, size);

        try_offline_node(nid);

        unlock_memory_hotplug();
}
EXPORT_SYMBOL_GPL(remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */