1 /* 2 * linux/mm/memory_hotplug.c 3 * 4 * Copyright (C) 5 */ 6 7 #include <linux/stddef.h> 8 #include <linux/mm.h> 9 #include <linux/swap.h> 10 #include <linux/interrupt.h> 11 #include <linux/pagemap.h> 12 #include <linux/compiler.h> 13 #include <linux/export.h> 14 #include <linux/pagevec.h> 15 #include <linux/writeback.h> 16 #include <linux/slab.h> 17 #include <linux/sysctl.h> 18 #include <linux/cpu.h> 19 #include <linux/memory.h> 20 #include <linux/memory_hotplug.h> 21 #include <linux/highmem.h> 22 #include <linux/vmalloc.h> 23 #include <linux/ioport.h> 24 #include <linux/delay.h> 25 #include <linux/migrate.h> 26 #include <linux/page-isolation.h> 27 #include <linux/pfn.h> 28 #include <linux/suspend.h> 29 #include <linux/mm_inline.h> 30 #include <linux/firmware-map.h> 31 #include <linux/stop_machine.h> 32 #include <linux/hugetlb.h> 33 #include <linux/memblock.h> 34 35 #include <asm/tlbflush.h> 36 37 #include "internal.h" 38 39 /* 40 * online_page_callback contains pointer to current page onlining function. 41 * Initially it is generic_online_page(). If it is required it could be 42 * changed by calling set_online_page_callback() for callback registration 43 * and restore_online_page_callback() for generic callback restore. 44 */ 45 46 static void generic_online_page(struct page *page); 47 48 static online_page_callback_t online_page_callback = generic_online_page; 49 50 DEFINE_MUTEX(mem_hotplug_mutex); 51 52 void lock_memory_hotplug(void) 53 { 54 mutex_lock(&mem_hotplug_mutex); 55 } 56 57 void unlock_memory_hotplug(void) 58 { 59 mutex_unlock(&mem_hotplug_mutex); 60 } 61 62 63 /* add this memory to iomem resource */ 64 static struct resource *register_memory_resource(u64 start, u64 size) 65 { 66 struct resource *res; 67 res = kzalloc(sizeof(struct resource), GFP_KERNEL); 68 BUG_ON(!res); 69 70 res->name = "System RAM"; 71 res->start = start; 72 res->end = start + size - 1; 73 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 74 if (request_resource(&iomem_resource, res) < 0) { 75 pr_debug("System RAM resource %pR cannot be added\n", res); 76 kfree(res); 77 res = NULL; 78 } 79 return res; 80 } 81 82 static void release_memory_resource(struct resource *res) 83 { 84 if (!res) 85 return; 86 release_resource(res); 87 kfree(res); 88 return; 89 } 90 91 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 92 void get_page_bootmem(unsigned long info, struct page *page, 93 unsigned long type) 94 { 95 page->lru.next = (struct list_head *) type; 96 SetPagePrivate(page); 97 set_page_private(page, info); 98 atomic_inc(&page->_count); 99 } 100 101 void put_page_bootmem(struct page *page) 102 { 103 unsigned long type; 104 105 type = (unsigned long) page->lru.next; 106 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 107 type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); 108 109 if (atomic_dec_return(&page->_count) == 1) { 110 ClearPagePrivate(page); 111 set_page_private(page, 0); 112 INIT_LIST_HEAD(&page->lru); 113 free_reserved_page(page); 114 } 115 } 116 117 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE 118 #ifndef CONFIG_SPARSEMEM_VMEMMAP 119 static void register_page_bootmem_info_section(unsigned long start_pfn) 120 { 121 unsigned long *usemap, mapsize, section_nr, i; 122 struct mem_section *ms; 123 struct page *page, *memmap; 124 125 section_nr = pfn_to_section_nr(start_pfn); 126 ms = __nr_to_section(section_nr); 127 128 /* Get section's memmap address */ 129 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); 130 131 /* 132 * Get page for the memmap's phys address 133 * XXX: need more consideration for sparse_vmemmap... 134 */ 135 page = virt_to_page(memmap); 136 mapsize = sizeof(struct page) * PAGES_PER_SECTION; 137 mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; 138 139 /* remember memmap's page */ 140 for (i = 0; i < mapsize; i++, page++) 141 get_page_bootmem(section_nr, page, SECTION_INFO); 142 143 usemap = __nr_to_section(section_nr)->pageblock_flags; 144 page = virt_to_page(usemap); 145 146 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 147 148 for (i = 0; i < mapsize; i++, page++) 149 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 150 151 } 152 #else /* CONFIG_SPARSEMEM_VMEMMAP */ 153 static void register_page_bootmem_info_section(unsigned long start_pfn) 154 { 155 unsigned long *usemap, mapsize, section_nr, i; 156 struct mem_section *ms; 157 struct page *page, *memmap; 158 159 if (!pfn_valid(start_pfn)) 160 return; 161 162 section_nr = pfn_to_section_nr(start_pfn); 163 ms = __nr_to_section(section_nr); 164 165 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); 166 167 register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); 168 169 usemap = __nr_to_section(section_nr)->pageblock_flags; 170 page = virt_to_page(usemap); 171 172 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 173 174 for (i = 0; i < mapsize; i++, page++) 175 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 176 } 177 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 178 179 void register_page_bootmem_info_node(struct pglist_data *pgdat) 180 { 181 unsigned long i, pfn, end_pfn, nr_pages; 182 int node = pgdat->node_id; 183 struct page *page; 184 struct zone *zone; 185 186 nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; 187 page = virt_to_page(pgdat); 188 189 for (i = 0; i < nr_pages; i++, page++) 190 get_page_bootmem(node, page, NODE_INFO); 191 192 zone = &pgdat->node_zones[0]; 193 for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { 194 if (zone_is_initialized(zone)) { 195 nr_pages = zone->wait_table_hash_nr_entries 196 * sizeof(wait_queue_head_t); 197 nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; 198 page = virt_to_page(zone->wait_table); 199 200 for (i = 0; i < nr_pages; i++, page++) 201 get_page_bootmem(node, page, NODE_INFO); 202 } 203 } 204 205 pfn = pgdat->node_start_pfn; 206 end_pfn = pgdat_end_pfn(pgdat); 207 208 /* register section info */ 209 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 210 /* 211 * Some platforms can assign the same pfn to multiple nodes - on 212 * node0 as well as nodeN. To avoid registering a pfn against 213 * multiple nodes we check that this pfn does not already 214 * reside in some other nodes. 215 */ 216 if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) 217 register_page_bootmem_info_section(pfn); 218 } 219 } 220 #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ 221 222 static void grow_zone_span(struct zone *zone, unsigned long start_pfn, 223 unsigned long end_pfn) 224 { 225 unsigned long old_zone_end_pfn; 226 227 zone_span_writelock(zone); 228 229 old_zone_end_pfn = zone_end_pfn(zone); 230 if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) 231 zone->zone_start_pfn = start_pfn; 232 233 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - 234 zone->zone_start_pfn; 235 236 zone_span_writeunlock(zone); 237 } 238 239 static void resize_zone(struct zone *zone, unsigned long start_pfn, 240 unsigned long end_pfn) 241 { 242 zone_span_writelock(zone); 243 244 if (end_pfn - start_pfn) { 245 zone->zone_start_pfn = start_pfn; 246 zone->spanned_pages = end_pfn - start_pfn; 247 } else { 248 /* 249 * make it consist as free_area_init_core(), 250 * if spanned_pages = 0, then keep start_pfn = 0 251 */ 252 zone->zone_start_pfn = 0; 253 zone->spanned_pages = 0; 254 } 255 256 zone_span_writeunlock(zone); 257 } 258 259 static void fix_zone_id(struct zone *zone, unsigned long start_pfn, 260 unsigned long end_pfn) 261 { 262 enum zone_type zid = zone_idx(zone); 263 int nid = zone->zone_pgdat->node_id; 264 unsigned long pfn; 265 266 for (pfn = start_pfn; pfn < end_pfn; pfn++) 267 set_page_links(pfn_to_page(pfn), zid, nid, pfn); 268 } 269 270 /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or 271 * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ 272 static int __ref ensure_zone_is_initialized(struct zone *zone, 273 unsigned long start_pfn, unsigned long num_pages) 274 { 275 if (!zone_is_initialized(zone)) 276 return init_currently_empty_zone(zone, start_pfn, num_pages, 277 MEMMAP_HOTPLUG); 278 return 0; 279 } 280 281 static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, 282 unsigned long start_pfn, unsigned long end_pfn) 283 { 284 int ret; 285 unsigned long flags; 286 unsigned long z1_start_pfn; 287 288 ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); 289 if (ret) 290 return ret; 291 292 pgdat_resize_lock(z1->zone_pgdat, &flags); 293 294 /* can't move pfns which are higher than @z2 */ 295 if (end_pfn > zone_end_pfn(z2)) 296 goto out_fail; 297 /* the move out part must be at the left most of @z2 */ 298 if (start_pfn > z2->zone_start_pfn) 299 goto out_fail; 300 /* must included/overlap */ 301 if (end_pfn <= z2->zone_start_pfn) 302 goto out_fail; 303 304 /* use start_pfn for z1's start_pfn if z1 is empty */ 305 if (!zone_is_empty(z1)) 306 z1_start_pfn = z1->zone_start_pfn; 307 else 308 z1_start_pfn = start_pfn; 309 310 resize_zone(z1, z1_start_pfn, end_pfn); 311 resize_zone(z2, end_pfn, zone_end_pfn(z2)); 312 313 pgdat_resize_unlock(z1->zone_pgdat, &flags); 314 315 fix_zone_id(z1, start_pfn, end_pfn); 316 317 return 0; 318 out_fail: 319 pgdat_resize_unlock(z1->zone_pgdat, &flags); 320 return -1; 321 } 322 323 static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, 324 unsigned long start_pfn, unsigned long end_pfn) 325 { 326 int ret; 327 unsigned long flags; 328 unsigned long z2_end_pfn; 329 330 ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); 331 if (ret) 332 return ret; 333 334 pgdat_resize_lock(z1->zone_pgdat, &flags); 335 336 /* can't move pfns which are lower than @z1 */ 337 if (z1->zone_start_pfn > start_pfn) 338 goto out_fail; 339 /* the move out part mast at the right most of @z1 */ 340 if (zone_end_pfn(z1) > end_pfn) 341 goto out_fail; 342 /* must included/overlap */ 343 if (start_pfn >= zone_end_pfn(z1)) 344 goto out_fail; 345 346 /* use end_pfn for z2's end_pfn if z2 is empty */ 347 if (!zone_is_empty(z2)) 348 z2_end_pfn = zone_end_pfn(z2); 349 else 350 z2_end_pfn = end_pfn; 351 352 resize_zone(z1, z1->zone_start_pfn, start_pfn); 353 resize_zone(z2, start_pfn, z2_end_pfn); 354 355 pgdat_resize_unlock(z1->zone_pgdat, &flags); 356 357 fix_zone_id(z2, start_pfn, end_pfn); 358 359 return 0; 360 out_fail: 361 pgdat_resize_unlock(z1->zone_pgdat, &flags); 362 return -1; 363 } 364 365 static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, 366 unsigned long end_pfn) 367 { 368 unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); 369 370 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) 371 pgdat->node_start_pfn = start_pfn; 372 373 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - 374 pgdat->node_start_pfn; 375 } 376 377 static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) 378 { 379 struct pglist_data *pgdat = zone->zone_pgdat; 380 int nr_pages = PAGES_PER_SECTION; 381 int nid = pgdat->node_id; 382 int zone_type; 383 unsigned long flags; 384 int ret; 385 386 zone_type = zone - pgdat->node_zones; 387 ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); 388 if (ret) 389 return ret; 390 391 pgdat_resize_lock(zone->zone_pgdat, &flags); 392 grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); 393 grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, 394 phys_start_pfn + nr_pages); 395 pgdat_resize_unlock(zone->zone_pgdat, &flags); 396 memmap_init_zone(nr_pages, nid, zone_type, 397 phys_start_pfn, MEMMAP_HOTPLUG); 398 return 0; 399 } 400 401 static int __meminit __add_section(int nid, struct zone *zone, 402 unsigned long phys_start_pfn) 403 { 404 int ret; 405 406 if (pfn_valid(phys_start_pfn)) 407 return -EEXIST; 408 409 ret = sparse_add_one_section(zone, phys_start_pfn); 410 411 if (ret < 0) 412 return ret; 413 414 ret = __add_zone(zone, phys_start_pfn); 415 416 if (ret < 0) 417 return ret; 418 419 return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); 420 } 421 422 /* 423 * Reasonably generic function for adding memory. It is 424 * expected that archs that support memory hotplug will 425 * call this function after deciding the zone to which to 426 * add the new pages. 427 */ 428 int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, 429 unsigned long nr_pages) 430 { 431 unsigned long i; 432 int err = 0; 433 int start_sec, end_sec; 434 /* during initialize mem_map, align hot-added range to section */ 435 start_sec = pfn_to_section_nr(phys_start_pfn); 436 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); 437 438 for (i = start_sec; i <= end_sec; i++) { 439 err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); 440 441 /* 442 * EEXIST is finally dealt with by ioresource collision 443 * check. see add_memory() => register_memory_resource() 444 * Warning will be printed if there is collision. 445 */ 446 if (err && (err != -EEXIST)) 447 break; 448 err = 0; 449 } 450 451 return err; 452 } 453 EXPORT_SYMBOL_GPL(__add_pages); 454 455 #ifdef CONFIG_MEMORY_HOTREMOVE 456 /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ 457 static int find_smallest_section_pfn(int nid, struct zone *zone, 458 unsigned long start_pfn, 459 unsigned long end_pfn) 460 { 461 struct mem_section *ms; 462 463 for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { 464 ms = __pfn_to_section(start_pfn); 465 466 if (unlikely(!valid_section(ms))) 467 continue; 468 469 if (unlikely(pfn_to_nid(start_pfn) != nid)) 470 continue; 471 472 if (zone && zone != page_zone(pfn_to_page(start_pfn))) 473 continue; 474 475 return start_pfn; 476 } 477 478 return 0; 479 } 480 481 /* find the biggest valid pfn in the range [start_pfn, end_pfn). */ 482 static int find_biggest_section_pfn(int nid, struct zone *zone, 483 unsigned long start_pfn, 484 unsigned long end_pfn) 485 { 486 struct mem_section *ms; 487 unsigned long pfn; 488 489 /* pfn is the end pfn of a memory section. */ 490 pfn = end_pfn - 1; 491 for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { 492 ms = __pfn_to_section(pfn); 493 494 if (unlikely(!valid_section(ms))) 495 continue; 496 497 if (unlikely(pfn_to_nid(pfn) != nid)) 498 continue; 499 500 if (zone && zone != page_zone(pfn_to_page(pfn))) 501 continue; 502 503 return pfn; 504 } 505 506 return 0; 507 } 508 509 static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, 510 unsigned long end_pfn) 511 { 512 unsigned long zone_start_pfn = zone->zone_start_pfn; 513 unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ 514 unsigned long zone_end_pfn = z; 515 unsigned long pfn; 516 struct mem_section *ms; 517 int nid = zone_to_nid(zone); 518 519 zone_span_writelock(zone); 520 if (zone_start_pfn == start_pfn) { 521 /* 522 * If the section is smallest section in the zone, it need 523 * shrink zone->zone_start_pfn and zone->zone_spanned_pages. 524 * In this case, we find second smallest valid mem_section 525 * for shrinking zone. 526 */ 527 pfn = find_smallest_section_pfn(nid, zone, end_pfn, 528 zone_end_pfn); 529 if (pfn) { 530 zone->zone_start_pfn = pfn; 531 zone->spanned_pages = zone_end_pfn - pfn; 532 } 533 } else if (zone_end_pfn == end_pfn) { 534 /* 535 * If the section is biggest section in the zone, it need 536 * shrink zone->spanned_pages. 537 * In this case, we find second biggest valid mem_section for 538 * shrinking zone. 539 */ 540 pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn, 541 start_pfn); 542 if (pfn) 543 zone->spanned_pages = pfn - zone_start_pfn + 1; 544 } 545 546 /* 547 * The section is not biggest or smallest mem_section in the zone, it 548 * only creates a hole in the zone. So in this case, we need not 549 * change the zone. But perhaps, the zone has only hole data. Thus 550 * it check the zone has only hole or not. 551 */ 552 pfn = zone_start_pfn; 553 for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { 554 ms = __pfn_to_section(pfn); 555 556 if (unlikely(!valid_section(ms))) 557 continue; 558 559 if (page_zone(pfn_to_page(pfn)) != zone) 560 continue; 561 562 /* If the section is current section, it continues the loop */ 563 if (start_pfn == pfn) 564 continue; 565 566 /* If we find valid section, we have nothing to do */ 567 zone_span_writeunlock(zone); 568 return; 569 } 570 571 /* The zone has no valid section */ 572 zone->zone_start_pfn = 0; 573 zone->spanned_pages = 0; 574 zone_span_writeunlock(zone); 575 } 576 577 static void shrink_pgdat_span(struct pglist_data *pgdat, 578 unsigned long start_pfn, unsigned long end_pfn) 579 { 580 unsigned long pgdat_start_pfn = pgdat->node_start_pfn; 581 unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */ 582 unsigned long pgdat_end_pfn = p; 583 unsigned long pfn; 584 struct mem_section *ms; 585 int nid = pgdat->node_id; 586 587 if (pgdat_start_pfn == start_pfn) { 588 /* 589 * If the section is smallest section in the pgdat, it need 590 * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages. 591 * In this case, we find second smallest valid mem_section 592 * for shrinking zone. 593 */ 594 pfn = find_smallest_section_pfn(nid, NULL, end_pfn, 595 pgdat_end_pfn); 596 if (pfn) { 597 pgdat->node_start_pfn = pfn; 598 pgdat->node_spanned_pages = pgdat_end_pfn - pfn; 599 } 600 } else if (pgdat_end_pfn == end_pfn) { 601 /* 602 * If the section is biggest section in the pgdat, it need 603 * shrink pgdat->node_spanned_pages. 604 * In this case, we find second biggest valid mem_section for 605 * shrinking zone. 606 */ 607 pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn, 608 start_pfn); 609 if (pfn) 610 pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1; 611 } 612 613 /* 614 * If the section is not biggest or smallest mem_section in the pgdat, 615 * it only creates a hole in the pgdat. So in this case, we need not 616 * change the pgdat. 617 * But perhaps, the pgdat has only hole data. Thus it check the pgdat 618 * has only hole or not. 619 */ 620 pfn = pgdat_start_pfn; 621 for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) { 622 ms = __pfn_to_section(pfn); 623 624 if (unlikely(!valid_section(ms))) 625 continue; 626 627 if (pfn_to_nid(pfn) != nid) 628 continue; 629 630 /* If the section is current section, it continues the loop */ 631 if (start_pfn == pfn) 632 continue; 633 634 /* If we find valid section, we have nothing to do */ 635 return; 636 } 637 638 /* The pgdat has no valid section */ 639 pgdat->node_start_pfn = 0; 640 pgdat->node_spanned_pages = 0; 641 } 642 643 static void __remove_zone(struct zone *zone, unsigned long start_pfn) 644 { 645 struct pglist_data *pgdat = zone->zone_pgdat; 646 int nr_pages = PAGES_PER_SECTION; 647 int zone_type; 648 unsigned long flags; 649 650 zone_type = zone - pgdat->node_zones; 651 652 pgdat_resize_lock(zone->zone_pgdat, &flags); 653 shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); 654 shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages); 655 pgdat_resize_unlock(zone->zone_pgdat, &flags); 656 } 657 658 static int __remove_section(struct zone *zone, struct mem_section *ms) 659 { 660 unsigned long start_pfn; 661 int scn_nr; 662 int ret = -EINVAL; 663 664 if (!valid_section(ms)) 665 return ret; 666 667 ret = unregister_memory_section(ms); 668 if (ret) 669 return ret; 670 671 scn_nr = __section_nr(ms); 672 start_pfn = section_nr_to_pfn(scn_nr); 673 __remove_zone(zone, start_pfn); 674 675 sparse_remove_one_section(zone, ms); 676 return 0; 677 } 678 679 /** 680 * __remove_pages() - remove sections of pages from a zone 681 * @zone: zone from which pages need to be removed 682 * @phys_start_pfn: starting pageframe (must be aligned to start of a section) 683 * @nr_pages: number of pages to remove (must be multiple of section size) 684 * 685 * Generic helper function to remove section mappings and sysfs entries 686 * for the section of the memory we are removing. Caller needs to make 687 * sure that pages are marked reserved and zones are adjust properly by 688 * calling offline_pages(). 689 */ 690 int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, 691 unsigned long nr_pages) 692 { 693 unsigned long i; 694 int sections_to_remove; 695 resource_size_t start, size; 696 int ret = 0; 697 698 /* 699 * We can only remove entire sections 700 */ 701 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 702 BUG_ON(nr_pages % PAGES_PER_SECTION); 703 704 start = phys_start_pfn << PAGE_SHIFT; 705 size = nr_pages * PAGE_SIZE; 706 ret = release_mem_region_adjustable(&iomem_resource, start, size); 707 if (ret) { 708 resource_size_t endres = start + size - 1; 709 710 pr_warn("Unable to release resource <%pa-%pa> (%d)\n", 711 &start, &endres, ret); 712 } 713 714 sections_to_remove = nr_pages / PAGES_PER_SECTION; 715 for (i = 0; i < sections_to_remove; i++) { 716 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 717 ret = __remove_section(zone, __pfn_to_section(pfn)); 718 if (ret) 719 break; 720 } 721 return ret; 722 } 723 EXPORT_SYMBOL_GPL(__remove_pages); 724 #endif /* CONFIG_MEMORY_HOTREMOVE */ 725 726 int set_online_page_callback(online_page_callback_t callback) 727 { 728 int rc = -EINVAL; 729 730 lock_memory_hotplug(); 731 732 if (online_page_callback == generic_online_page) { 733 online_page_callback = callback; 734 rc = 0; 735 } 736 737 unlock_memory_hotplug(); 738 739 return rc; 740 } 741 EXPORT_SYMBOL_GPL(set_online_page_callback); 742 743 int restore_online_page_callback(online_page_callback_t callback) 744 { 745 int rc = -EINVAL; 746 747 lock_memory_hotplug(); 748 749 if (online_page_callback == callback) { 750 online_page_callback = generic_online_page; 751 rc = 0; 752 } 753 754 unlock_memory_hotplug(); 755 756 return rc; 757 } 758 EXPORT_SYMBOL_GPL(restore_online_page_callback); 759 760 void __online_page_set_limits(struct page *page) 761 { 762 } 763 EXPORT_SYMBOL_GPL(__online_page_set_limits); 764 765 void __online_page_increment_counters(struct page *page) 766 { 767 adjust_managed_page_count(page, 1); 768 } 769 EXPORT_SYMBOL_GPL(__online_page_increment_counters); 770 771 void __online_page_free(struct page *page) 772 { 773 __free_reserved_page(page); 774 } 775 EXPORT_SYMBOL_GPL(__online_page_free); 776 777 static void generic_online_page(struct page *page) 778 { 779 __online_page_set_limits(page); 780 __online_page_increment_counters(page); 781 __online_page_free(page); 782 } 783 784 static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, 785 void *arg) 786 { 787 unsigned long i; 788 unsigned long onlined_pages = *(unsigned long *)arg; 789 struct page *page; 790 if (PageReserved(pfn_to_page(start_pfn))) 791 for (i = 0; i < nr_pages; i++) { 792 page = pfn_to_page(start_pfn + i); 793 (*online_page_callback)(page); 794 onlined_pages++; 795 } 796 *(unsigned long *)arg = onlined_pages; 797 return 0; 798 } 799 800 #ifdef CONFIG_MOVABLE_NODE 801 /* 802 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have 803 * normal memory. 804 */ 805 static bool can_online_high_movable(struct zone *zone) 806 { 807 return true; 808 } 809 #else /* CONFIG_MOVABLE_NODE */ 810 /* ensure every online node has NORMAL memory */ 811 static bool can_online_high_movable(struct zone *zone) 812 { 813 return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); 814 } 815 #endif /* CONFIG_MOVABLE_NODE */ 816 817 /* check which state of node_states will be changed when online memory */ 818 static void node_states_check_changes_online(unsigned long nr_pages, 819 struct zone *zone, struct memory_notify *arg) 820 { 821 int nid = zone_to_nid(zone); 822 enum zone_type zone_last = ZONE_NORMAL; 823 824 /* 825 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] 826 * contains nodes which have zones of 0...ZONE_NORMAL, 827 * set zone_last to ZONE_NORMAL. 828 * 829 * If we don't have HIGHMEM nor movable node, 830 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of 831 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. 832 */ 833 if (N_MEMORY == N_NORMAL_MEMORY) 834 zone_last = ZONE_MOVABLE; 835 836 /* 837 * if the memory to be online is in a zone of 0...zone_last, and 838 * the zones of 0...zone_last don't have memory before online, we will 839 * need to set the node to node_states[N_NORMAL_MEMORY] after 840 * the memory is online. 841 */ 842 if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) 843 arg->status_change_nid_normal = nid; 844 else 845 arg->status_change_nid_normal = -1; 846 847 #ifdef CONFIG_HIGHMEM 848 /* 849 * If we have movable node, node_states[N_HIGH_MEMORY] 850 * contains nodes which have zones of 0...ZONE_HIGHMEM, 851 * set zone_last to ZONE_HIGHMEM. 852 * 853 * If we don't have movable node, node_states[N_NORMAL_MEMORY] 854 * contains nodes which have zones of 0...ZONE_MOVABLE, 855 * set zone_last to ZONE_MOVABLE. 856 */ 857 zone_last = ZONE_HIGHMEM; 858 if (N_MEMORY == N_HIGH_MEMORY) 859 zone_last = ZONE_MOVABLE; 860 861 if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY)) 862 arg->status_change_nid_high = nid; 863 else 864 arg->status_change_nid_high = -1; 865 #else 866 arg->status_change_nid_high = arg->status_change_nid_normal; 867 #endif 868 869 /* 870 * if the node don't have memory befor online, we will need to 871 * set the node to node_states[N_MEMORY] after the memory 872 * is online. 873 */ 874 if (!node_state(nid, N_MEMORY)) 875 arg->status_change_nid = nid; 876 else 877 arg->status_change_nid = -1; 878 } 879 880 static void node_states_set_node(int node, struct memory_notify *arg) 881 { 882 if (arg->status_change_nid_normal >= 0) 883 node_set_state(node, N_NORMAL_MEMORY); 884 885 if (arg->status_change_nid_high >= 0) 886 node_set_state(node, N_HIGH_MEMORY); 887 888 node_set_state(node, N_MEMORY); 889 } 890 891 892 int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) 893 { 894 unsigned long flags; 895 unsigned long onlined_pages = 0; 896 struct zone *zone; 897 int need_zonelists_rebuild = 0; 898 int nid; 899 int ret; 900 struct memory_notify arg; 901 902 lock_memory_hotplug(); 903 /* 904 * This doesn't need a lock to do pfn_to_page(). 905 * The section can't be removed here because of the 906 * memory_block->state_mutex. 907 */ 908 zone = page_zone(pfn_to_page(pfn)); 909 910 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && 911 !can_online_high_movable(zone)) { 912 unlock_memory_hotplug(); 913 return -EINVAL; 914 } 915 916 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { 917 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { 918 unlock_memory_hotplug(); 919 return -EINVAL; 920 } 921 } 922 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { 923 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { 924 unlock_memory_hotplug(); 925 return -EINVAL; 926 } 927 } 928 929 /* Previous code may changed the zone of the pfn range */ 930 zone = page_zone(pfn_to_page(pfn)); 931 932 arg.start_pfn = pfn; 933 arg.nr_pages = nr_pages; 934 node_states_check_changes_online(nr_pages, zone, &arg); 935 936 nid = pfn_to_nid(pfn); 937 938 ret = memory_notify(MEM_GOING_ONLINE, &arg); 939 ret = notifier_to_errno(ret); 940 if (ret) { 941 memory_notify(MEM_CANCEL_ONLINE, &arg); 942 unlock_memory_hotplug(); 943 return ret; 944 } 945 /* 946 * If this zone is not populated, then it is not in zonelist. 947 * This means the page allocator ignores this zone. 948 * So, zonelist must be updated after online. 949 */ 950 mutex_lock(&zonelists_mutex); 951 if (!populated_zone(zone)) { 952 need_zonelists_rebuild = 1; 953 build_all_zonelists(NULL, zone); 954 } 955 956 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 957 online_pages_range); 958 if (ret) { 959 if (need_zonelists_rebuild) 960 zone_pcp_reset(zone); 961 mutex_unlock(&zonelists_mutex); 962 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", 963 (unsigned long long) pfn << PAGE_SHIFT, 964 (((unsigned long long) pfn + nr_pages) 965 << PAGE_SHIFT) - 1); 966 memory_notify(MEM_CANCEL_ONLINE, &arg); 967 unlock_memory_hotplug(); 968 return ret; 969 } 970 971 zone->present_pages += onlined_pages; 972 973 pgdat_resize_lock(zone->zone_pgdat, &flags); 974 zone->zone_pgdat->node_present_pages += onlined_pages; 975 pgdat_resize_unlock(zone->zone_pgdat, &flags); 976 977 if (onlined_pages) { 978 node_states_set_node(zone_to_nid(zone), &arg); 979 if (need_zonelists_rebuild) 980 build_all_zonelists(NULL, NULL); 981 else 982 zone_pcp_update(zone); 983 } 984 985 mutex_unlock(&zonelists_mutex); 986 987 init_per_zone_wmark_min(); 988 989 if (onlined_pages) 990 kswapd_run(zone_to_nid(zone)); 991 992 vm_total_pages = nr_free_pagecache_pages(); 993 994 writeback_set_ratelimit(); 995 996 if (onlined_pages) 997 memory_notify(MEM_ONLINE, &arg); 998 unlock_memory_hotplug(); 999 1000 return 0; 1001 } 1002 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ 1003 1004 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1005 static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) 1006 { 1007 struct pglist_data *pgdat; 1008 unsigned long zones_size[MAX_NR_ZONES] = {0}; 1009 unsigned long zholes_size[MAX_NR_ZONES] = {0}; 1010 unsigned long start_pfn = start >> PAGE_SHIFT; 1011 1012 pgdat = NODE_DATA(nid); 1013 if (!pgdat) { 1014 pgdat = arch_alloc_nodedata(nid); 1015 if (!pgdat) 1016 return NULL; 1017 1018 arch_refresh_nodedata(nid, pgdat); 1019 } 1020 1021 /* we can use NODE_DATA(nid) from here */ 1022 1023 /* init node's zones as empty zones, we don't have any present pages.*/ 1024 free_area_init_node(nid, zones_size, start_pfn, zholes_size); 1025 1026 /* 1027 * The node we allocated has no zone fallback lists. For avoiding 1028 * to access not-initialized zonelist, build here. 1029 */ 1030 mutex_lock(&zonelists_mutex); 1031 build_all_zonelists(pgdat, NULL); 1032 mutex_unlock(&zonelists_mutex); 1033 1034 return pgdat; 1035 } 1036 1037 static void rollback_node_hotadd(int nid, pg_data_t *pgdat) 1038 { 1039 arch_refresh_nodedata(nid, NULL); 1040 arch_free_nodedata(pgdat); 1041 return; 1042 } 1043 1044 1045 /** 1046 * try_online_node - online a node if offlined 1047 * 1048 * called by cpu_up() to online a node without onlined memory. 1049 */ 1050 int try_online_node(int nid) 1051 { 1052 pg_data_t *pgdat; 1053 int ret; 1054 1055 if (node_online(nid)) 1056 return 0; 1057 1058 lock_memory_hotplug(); 1059 pgdat = hotadd_new_pgdat(nid, 0); 1060 if (!pgdat) { 1061 pr_err("Cannot online node %d due to NULL pgdat\n", nid); 1062 ret = -ENOMEM; 1063 goto out; 1064 } 1065 node_set_online(nid); 1066 ret = register_one_node(nid); 1067 BUG_ON(ret); 1068 1069 if (pgdat->node_zonelists->_zonerefs->zone == NULL) { 1070 mutex_lock(&zonelists_mutex); 1071 build_all_zonelists(NULL, NULL); 1072 mutex_unlock(&zonelists_mutex); 1073 } 1074 1075 out: 1076 unlock_memory_hotplug(); 1077 return ret; 1078 } 1079 1080 static int check_hotplug_memory_range(u64 start, u64 size) 1081 { 1082 u64 start_pfn = start >> PAGE_SHIFT; 1083 u64 nr_pages = size >> PAGE_SHIFT; 1084 1085 /* Memory range must be aligned with section */ 1086 if ((start_pfn & ~PAGE_SECTION_MASK) || 1087 (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) { 1088 pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n", 1089 (unsigned long long)start, 1090 (unsigned long long)size); 1091 return -EINVAL; 1092 } 1093 1094 return 0; 1095 } 1096 1097 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1098 int __ref add_memory(int nid, u64 start, u64 size) 1099 { 1100 pg_data_t *pgdat = NULL; 1101 bool new_pgdat; 1102 bool new_node; 1103 struct resource *res; 1104 int ret; 1105 1106 ret = check_hotplug_memory_range(start, size); 1107 if (ret) 1108 return ret; 1109 1110 res = register_memory_resource(start, size); 1111 ret = -EEXIST; 1112 if (!res) 1113 return ret; 1114 1115 { /* Stupid hack to suppress address-never-null warning */ 1116 void *p = NODE_DATA(nid); 1117 new_pgdat = !p; 1118 } 1119 1120 lock_memory_hotplug(); 1121 1122 new_node = !node_online(nid); 1123 if (new_node) { 1124 pgdat = hotadd_new_pgdat(nid, start); 1125 ret = -ENOMEM; 1126 if (!pgdat) 1127 goto error; 1128 } 1129 1130 /* call arch's memory hotadd */ 1131 ret = arch_add_memory(nid, start, size); 1132 1133 if (ret < 0) 1134 goto error; 1135 1136 /* we online node here. we can't roll back from here. */ 1137 node_set_online(nid); 1138 1139 if (new_node) { 1140 ret = register_one_node(nid); 1141 /* 1142 * If sysfs file of new node can't create, cpu on the node 1143 * can't be hot-added. There is no rollback way now. 1144 * So, check by BUG_ON() to catch it reluctantly.. 1145 */ 1146 BUG_ON(ret); 1147 } 1148 1149 /* create new memmap entry */ 1150 firmware_map_add_hotplug(start, start + size, "System RAM"); 1151 1152 goto out; 1153 1154 error: 1155 /* rollback pgdat allocation and others */ 1156 if (new_pgdat) 1157 rollback_node_hotadd(nid, pgdat); 1158 release_memory_resource(res); 1159 1160 out: 1161 unlock_memory_hotplug(); 1162 return ret; 1163 } 1164 EXPORT_SYMBOL_GPL(add_memory); 1165 1166 #ifdef CONFIG_MEMORY_HOTREMOVE 1167 /* 1168 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy 1169 * set and the size of the free page is given by page_order(). Using this, 1170 * the function determines if the pageblock contains only free pages. 1171 * Due to buddy contraints, a free page at least the size of a pageblock will 1172 * be located at the start of the pageblock 1173 */ 1174 static inline int pageblock_free(struct page *page) 1175 { 1176 return PageBuddy(page) && page_order(page) >= pageblock_order; 1177 } 1178 1179 /* Return the start of the next active pageblock after a given page */ 1180 static struct page *next_active_pageblock(struct page *page) 1181 { 1182 /* Ensure the starting page is pageblock-aligned */ 1183 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); 1184 1185 /* If the entire pageblock is free, move to the end of free page */ 1186 if (pageblock_free(page)) { 1187 int order; 1188 /* be careful. we don't have locks, page_order can be changed.*/ 1189 order = page_order(page); 1190 if ((order < MAX_ORDER) && (order >= pageblock_order)) 1191 return page + (1 << order); 1192 } 1193 1194 return page + pageblock_nr_pages; 1195 } 1196 1197 /* Checks if this range of memory is likely to be hot-removable. */ 1198 int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) 1199 { 1200 struct page *page = pfn_to_page(start_pfn); 1201 struct page *end_page = page + nr_pages; 1202 1203 /* Check the starting page of each pageblock within the range */ 1204 for (; page < end_page; page = next_active_pageblock(page)) { 1205 if (!is_pageblock_removable_nolock(page)) 1206 return 0; 1207 cond_resched(); 1208 } 1209 1210 /* All pageblocks in the memory block are likely to be hot-removable */ 1211 return 1; 1212 } 1213 1214 /* 1215 * Confirm all pages in a range [start, end) is belongs to the same zone. 1216 */ 1217 static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) 1218 { 1219 unsigned long pfn; 1220 struct zone *zone = NULL; 1221 struct page *page; 1222 int i; 1223 for (pfn = start_pfn; 1224 pfn < end_pfn; 1225 pfn += MAX_ORDER_NR_PAGES) { 1226 i = 0; 1227 /* This is just a CONFIG_HOLES_IN_ZONE check.*/ 1228 while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i)) 1229 i++; 1230 if (i == MAX_ORDER_NR_PAGES) 1231 continue; 1232 page = pfn_to_page(pfn + i); 1233 if (zone && page_zone(page) != zone) 1234 return 0; 1235 zone = page_zone(page); 1236 } 1237 return 1; 1238 } 1239 1240 /* 1241 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages 1242 * and hugepages). We scan pfn because it's much easier than scanning over 1243 * linked list. This function returns the pfn of the first found movable 1244 * page if it's found, otherwise 0. 1245 */ 1246 static unsigned long scan_movable_pages(unsigned long start, unsigned long end) 1247 { 1248 unsigned long pfn; 1249 struct page *page; 1250 for (pfn = start; pfn < end; pfn++) { 1251 if (pfn_valid(pfn)) { 1252 page = pfn_to_page(pfn); 1253 if (PageLRU(page)) 1254 return pfn; 1255 if (PageHuge(page)) { 1256 if (is_hugepage_active(page)) 1257 return pfn; 1258 else 1259 pfn = round_up(pfn + 1, 1260 1 << compound_order(page)) - 1; 1261 } 1262 } 1263 } 1264 return 0; 1265 } 1266 1267 #define NR_OFFLINE_AT_ONCE_PAGES (256) 1268 static int 1269 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 1270 { 1271 unsigned long pfn; 1272 struct page *page; 1273 int move_pages = NR_OFFLINE_AT_ONCE_PAGES; 1274 int not_managed = 0; 1275 int ret = 0; 1276 LIST_HEAD(source); 1277 1278 for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { 1279 if (!pfn_valid(pfn)) 1280 continue; 1281 page = pfn_to_page(pfn); 1282 1283 if (PageHuge(page)) { 1284 struct page *head = compound_head(page); 1285 pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; 1286 if (compound_order(head) > PFN_SECTION_SHIFT) { 1287 ret = -EBUSY; 1288 break; 1289 } 1290 if (isolate_huge_page(page, &source)) 1291 move_pages -= 1 << compound_order(head); 1292 continue; 1293 } 1294 1295 if (!get_page_unless_zero(page)) 1296 continue; 1297 /* 1298 * We can skip free pages. And we can only deal with pages on 1299 * LRU. 1300 */ 1301 ret = isolate_lru_page(page); 1302 if (!ret) { /* Success */ 1303 put_page(page); 1304 list_add_tail(&page->lru, &source); 1305 move_pages--; 1306 inc_zone_page_state(page, NR_ISOLATED_ANON + 1307 page_is_file_cache(page)); 1308 1309 } else { 1310 #ifdef CONFIG_DEBUG_VM 1311 printk(KERN_ALERT "removing pfn %lx from LRU failed\n", 1312 pfn); 1313 dump_page(page, "failed to remove from LRU"); 1314 #endif 1315 put_page(page); 1316 /* Because we don't have big zone->lock. we should 1317 check this again here. */ 1318 if (page_count(page)) { 1319 not_managed++; 1320 ret = -EBUSY; 1321 break; 1322 } 1323 } 1324 } 1325 if (!list_empty(&source)) { 1326 if (not_managed) { 1327 putback_movable_pages(&source); 1328 goto out; 1329 } 1330 1331 /* 1332 * alloc_migrate_target should be improooooved!! 1333 * migrate_pages returns # of failed pages. 1334 */ 1335 ret = migrate_pages(&source, alloc_migrate_target, 0, 1336 MIGRATE_SYNC, MR_MEMORY_HOTPLUG); 1337 if (ret) 1338 putback_movable_pages(&source); 1339 } 1340 out: 1341 return ret; 1342 } 1343 1344 /* 1345 * remove from free_area[] and mark all as Reserved. 1346 */ 1347 static int 1348 offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, 1349 void *data) 1350 { 1351 __offline_isolated_pages(start, start + nr_pages); 1352 return 0; 1353 } 1354 1355 static void 1356 offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 1357 { 1358 walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, 1359 offline_isolated_pages_cb); 1360 } 1361 1362 /* 1363 * Check all pages in range, recoreded as memory resource, are isolated. 1364 */ 1365 static int 1366 check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, 1367 void *data) 1368 { 1369 int ret; 1370 long offlined = *(long *)data; 1371 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); 1372 offlined = nr_pages; 1373 if (!ret) 1374 *(long *)data += offlined; 1375 return ret; 1376 } 1377 1378 static long 1379 check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 1380 { 1381 long offlined = 0; 1382 int ret; 1383 1384 ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, 1385 check_pages_isolated_cb); 1386 if (ret < 0) 1387 offlined = (long)ret; 1388 return offlined; 1389 } 1390 1391 #ifdef CONFIG_MOVABLE_NODE 1392 /* 1393 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have 1394 * normal memory. 1395 */ 1396 static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) 1397 { 1398 return true; 1399 } 1400 #else /* CONFIG_MOVABLE_NODE */ 1401 /* ensure the node has NORMAL memory if it is still online */ 1402 static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) 1403 { 1404 struct pglist_data *pgdat = zone->zone_pgdat; 1405 unsigned long present_pages = 0; 1406 enum zone_type zt; 1407 1408 for (zt = 0; zt <= ZONE_NORMAL; zt++) 1409 present_pages += pgdat->node_zones[zt].present_pages; 1410 1411 if (present_pages > nr_pages) 1412 return true; 1413 1414 present_pages = 0; 1415 for (; zt <= ZONE_MOVABLE; zt++) 1416 present_pages += pgdat->node_zones[zt].present_pages; 1417 1418 /* 1419 * we can't offline the last normal memory until all 1420 * higher memory is offlined. 1421 */ 1422 return present_pages == 0; 1423 } 1424 #endif /* CONFIG_MOVABLE_NODE */ 1425 1426 static int __init cmdline_parse_movable_node(char *p) 1427 { 1428 #ifdef CONFIG_MOVABLE_NODE 1429 /* 1430 * Memory used by the kernel cannot be hot-removed because Linux 1431 * cannot migrate the kernel pages. When memory hotplug is 1432 * enabled, we should prevent memblock from allocating memory 1433 * for the kernel. 1434 * 1435 * ACPI SRAT records all hotpluggable memory ranges. But before 1436 * SRAT is parsed, we don't know about it. 1437 * 1438 * The kernel image is loaded into memory at very early time. We 1439 * cannot prevent this anyway. So on NUMA system, we set any 1440 * node the kernel resides in as un-hotpluggable. 1441 * 1442 * Since on modern servers, one node could have double-digit 1443 * gigabytes memory, we can assume the memory around the kernel 1444 * image is also un-hotpluggable. So before SRAT is parsed, just 1445 * allocate memory near the kernel image to try the best to keep 1446 * the kernel away from hotpluggable memory. 1447 */ 1448 memblock_set_bottom_up(true); 1449 movable_node_enabled = true; 1450 #else 1451 pr_warn("movable_node option not supported\n"); 1452 #endif 1453 return 0; 1454 } 1455 early_param("movable_node", cmdline_parse_movable_node); 1456 1457 /* check which state of node_states will be changed when offline memory */ 1458 static void node_states_check_changes_offline(unsigned long nr_pages, 1459 struct zone *zone, struct memory_notify *arg) 1460 { 1461 struct pglist_data *pgdat = zone->zone_pgdat; 1462 unsigned long present_pages = 0; 1463 enum zone_type zt, zone_last = ZONE_NORMAL; 1464 1465 /* 1466 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] 1467 * contains nodes which have zones of 0...ZONE_NORMAL, 1468 * set zone_last to ZONE_NORMAL. 1469 * 1470 * If we don't have HIGHMEM nor movable node, 1471 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of 1472 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. 1473 */ 1474 if (N_MEMORY == N_NORMAL_MEMORY) 1475 zone_last = ZONE_MOVABLE; 1476 1477 /* 1478 * check whether node_states[N_NORMAL_MEMORY] will be changed. 1479 * If the memory to be offline is in a zone of 0...zone_last, 1480 * and it is the last present memory, 0...zone_last will 1481 * become empty after offline , thus we can determind we will 1482 * need to clear the node from node_states[N_NORMAL_MEMORY]. 1483 */ 1484 for (zt = 0; zt <= zone_last; zt++) 1485 present_pages += pgdat->node_zones[zt].present_pages; 1486 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) 1487 arg->status_change_nid_normal = zone_to_nid(zone); 1488 else 1489 arg->status_change_nid_normal = -1; 1490 1491 #ifdef CONFIG_HIGHMEM 1492 /* 1493 * If we have movable node, node_states[N_HIGH_MEMORY] 1494 * contains nodes which have zones of 0...ZONE_HIGHMEM, 1495 * set zone_last to ZONE_HIGHMEM. 1496 * 1497 * If we don't have movable node, node_states[N_NORMAL_MEMORY] 1498 * contains nodes which have zones of 0...ZONE_MOVABLE, 1499 * set zone_last to ZONE_MOVABLE. 1500 */ 1501 zone_last = ZONE_HIGHMEM; 1502 if (N_MEMORY == N_HIGH_MEMORY) 1503 zone_last = ZONE_MOVABLE; 1504 1505 for (; zt <= zone_last; zt++) 1506 present_pages += pgdat->node_zones[zt].present_pages; 1507 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) 1508 arg->status_change_nid_high = zone_to_nid(zone); 1509 else 1510 arg->status_change_nid_high = -1; 1511 #else 1512 arg->status_change_nid_high = arg->status_change_nid_normal; 1513 #endif 1514 1515 /* 1516 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE 1517 */ 1518 zone_last = ZONE_MOVABLE; 1519 1520 /* 1521 * check whether node_states[N_HIGH_MEMORY] will be changed 1522 * If we try to offline the last present @nr_pages from the node, 1523 * we can determind we will need to clear the node from 1524 * node_states[N_HIGH_MEMORY]. 1525 */ 1526 for (; zt <= zone_last; zt++) 1527 present_pages += pgdat->node_zones[zt].present_pages; 1528 if (nr_pages >= present_pages) 1529 arg->status_change_nid = zone_to_nid(zone); 1530 else 1531 arg->status_change_nid = -1; 1532 } 1533 1534 static void node_states_clear_node(int node, struct memory_notify *arg) 1535 { 1536 if (arg->status_change_nid_normal >= 0) 1537 node_clear_state(node, N_NORMAL_MEMORY); 1538 1539 if ((N_MEMORY != N_NORMAL_MEMORY) && 1540 (arg->status_change_nid_high >= 0)) 1541 node_clear_state(node, N_HIGH_MEMORY); 1542 1543 if ((N_MEMORY != N_HIGH_MEMORY) && 1544 (arg->status_change_nid >= 0)) 1545 node_clear_state(node, N_MEMORY); 1546 } 1547 1548 static int __ref __offline_pages(unsigned long start_pfn, 1549 unsigned long end_pfn, unsigned long timeout) 1550 { 1551 unsigned long pfn, nr_pages, expire; 1552 long offlined_pages; 1553 int ret, drain, retry_max, node; 1554 unsigned long flags; 1555 struct zone *zone; 1556 struct memory_notify arg; 1557 1558 /* at least, alignment against pageblock is necessary */ 1559 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) 1560 return -EINVAL; 1561 if (!IS_ALIGNED(end_pfn, pageblock_nr_pages)) 1562 return -EINVAL; 1563 /* This makes hotplug much easier...and readable. 1564 we assume this for now. .*/ 1565 if (!test_pages_in_a_zone(start_pfn, end_pfn)) 1566 return -EINVAL; 1567 1568 lock_memory_hotplug(); 1569 1570 zone = page_zone(pfn_to_page(start_pfn)); 1571 node = zone_to_nid(zone); 1572 nr_pages = end_pfn - start_pfn; 1573 1574 ret = -EINVAL; 1575 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) 1576 goto out; 1577 1578 /* set above range as isolated */ 1579 ret = start_isolate_page_range(start_pfn, end_pfn, 1580 MIGRATE_MOVABLE, true); 1581 if (ret) 1582 goto out; 1583 1584 arg.start_pfn = start_pfn; 1585 arg.nr_pages = nr_pages; 1586 node_states_check_changes_offline(nr_pages, zone, &arg); 1587 1588 ret = memory_notify(MEM_GOING_OFFLINE, &arg); 1589 ret = notifier_to_errno(ret); 1590 if (ret) 1591 goto failed_removal; 1592 1593 pfn = start_pfn; 1594 expire = jiffies + timeout; 1595 drain = 0; 1596 retry_max = 5; 1597 repeat: 1598 /* start memory hot removal */ 1599 ret = -EAGAIN; 1600 if (time_after(jiffies, expire)) 1601 goto failed_removal; 1602 ret = -EINTR; 1603 if (signal_pending(current)) 1604 goto failed_removal; 1605 ret = 0; 1606 if (drain) { 1607 lru_add_drain_all(); 1608 cond_resched(); 1609 drain_all_pages(); 1610 } 1611 1612 pfn = scan_movable_pages(start_pfn, end_pfn); 1613 if (pfn) { /* We have movable pages */ 1614 ret = do_migrate_range(pfn, end_pfn); 1615 if (!ret) { 1616 drain = 1; 1617 goto repeat; 1618 } else { 1619 if (ret < 0) 1620 if (--retry_max == 0) 1621 goto failed_removal; 1622 yield(); 1623 drain = 1; 1624 goto repeat; 1625 } 1626 } 1627 /* drain all zone's lru pagevec, this is asynchronous... */ 1628 lru_add_drain_all(); 1629 yield(); 1630 /* drain pcp pages, this is synchronous. */ 1631 drain_all_pages(); 1632 /* 1633 * dissolve free hugepages in the memory block before doing offlining 1634 * actually in order to make hugetlbfs's object counting consistent. 1635 */ 1636 dissolve_free_huge_pages(start_pfn, end_pfn); 1637 /* check again */ 1638 offlined_pages = check_pages_isolated(start_pfn, end_pfn); 1639 if (offlined_pages < 0) { 1640 ret = -EBUSY; 1641 goto failed_removal; 1642 } 1643 printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); 1644 /* Ok, all of our target is isolated. 1645 We cannot do rollback at this point. */ 1646 offline_isolated_pages(start_pfn, end_pfn); 1647 /* reset pagetype flags and makes migrate type to be MOVABLE */ 1648 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1649 /* removal success */ 1650 adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); 1651 zone->present_pages -= offlined_pages; 1652 1653 pgdat_resize_lock(zone->zone_pgdat, &flags); 1654 zone->zone_pgdat->node_present_pages -= offlined_pages; 1655 pgdat_resize_unlock(zone->zone_pgdat, &flags); 1656 1657 init_per_zone_wmark_min(); 1658 1659 if (!populated_zone(zone)) { 1660 zone_pcp_reset(zone); 1661 mutex_lock(&zonelists_mutex); 1662 build_all_zonelists(NULL, NULL); 1663 mutex_unlock(&zonelists_mutex); 1664 } else 1665 zone_pcp_update(zone); 1666 1667 node_states_clear_node(node, &arg); 1668 if (arg.status_change_nid >= 0) 1669 kswapd_stop(node); 1670 1671 vm_total_pages = nr_free_pagecache_pages(); 1672 writeback_set_ratelimit(); 1673 1674 memory_notify(MEM_OFFLINE, &arg); 1675 unlock_memory_hotplug(); 1676 return 0; 1677 1678 failed_removal: 1679 printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n", 1680 (unsigned long long) start_pfn << PAGE_SHIFT, 1681 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); 1682 memory_notify(MEM_CANCEL_OFFLINE, &arg); 1683 /* pushback to free area */ 1684 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1685 1686 out: 1687 unlock_memory_hotplug(); 1688 return ret; 1689 } 1690 1691 int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 1692 { 1693 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 1694 } 1695 #endif /* CONFIG_MEMORY_HOTREMOVE */ 1696 1697 /** 1698 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) 1699 * @start_pfn: start pfn of the memory range 1700 * @end_pfn: end pfn of the memory range 1701 * @arg: argument passed to func 1702 * @func: callback for each memory section walked 1703 * 1704 * This function walks through all present mem sections in range 1705 * [start_pfn, end_pfn) and call func on each mem section. 1706 * 1707 * Returns the return value of func. 1708 */ 1709 int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, 1710 void *arg, int (*func)(struct memory_block *, void *)) 1711 { 1712 struct memory_block *mem = NULL; 1713 struct mem_section *section; 1714 unsigned long pfn, section_nr; 1715 int ret; 1716 1717 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 1718 section_nr = pfn_to_section_nr(pfn); 1719 if (!present_section_nr(section_nr)) 1720 continue; 1721 1722 section = __nr_to_section(section_nr); 1723 /* same memblock? */ 1724 if (mem) 1725 if ((section_nr >= mem->start_section_nr) && 1726 (section_nr <= mem->end_section_nr)) 1727 continue; 1728 1729 mem = find_memory_block_hinted(section, mem); 1730 if (!mem) 1731 continue; 1732 1733 ret = func(mem, arg); 1734 if (ret) { 1735 kobject_put(&mem->dev.kobj); 1736 return ret; 1737 } 1738 } 1739 1740 if (mem) 1741 kobject_put(&mem->dev.kobj); 1742 1743 return 0; 1744 } 1745 1746 #ifdef CONFIG_MEMORY_HOTREMOVE 1747 static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) 1748 { 1749 int ret = !is_memblock_offlined(mem); 1750 1751 if (unlikely(ret)) { 1752 phys_addr_t beginpa, endpa; 1753 1754 beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); 1755 endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; 1756 pr_warn("removing memory fails, because memory " 1757 "[%pa-%pa] is onlined\n", 1758 &beginpa, &endpa); 1759 } 1760 1761 return ret; 1762 } 1763 1764 static int check_cpu_on_node(pg_data_t *pgdat) 1765 { 1766 int cpu; 1767 1768 for_each_present_cpu(cpu) { 1769 if (cpu_to_node(cpu) == pgdat->node_id) 1770 /* 1771 * the cpu on this node isn't removed, and we can't 1772 * offline this node. 1773 */ 1774 return -EBUSY; 1775 } 1776 1777 return 0; 1778 } 1779 1780 static void unmap_cpu_on_node(pg_data_t *pgdat) 1781 { 1782 #ifdef CONFIG_ACPI_NUMA 1783 int cpu; 1784 1785 for_each_possible_cpu(cpu) 1786 if (cpu_to_node(cpu) == pgdat->node_id) 1787 numa_clear_node(cpu); 1788 #endif 1789 } 1790 1791 static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) 1792 { 1793 int ret; 1794 1795 ret = check_cpu_on_node(pgdat); 1796 if (ret) 1797 return ret; 1798 1799 /* 1800 * the node will be offlined when we come here, so we can clear 1801 * the cpu_to_node() now. 1802 */ 1803 1804 unmap_cpu_on_node(pgdat); 1805 return 0; 1806 } 1807 1808 /** 1809 * try_offline_node 1810 * 1811 * Offline a node if all memory sections and cpus of the node are removed. 1812 * 1813 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 1814 * and online/offline operations before this call. 1815 */ 1816 void try_offline_node(int nid) 1817 { 1818 pg_data_t *pgdat = NODE_DATA(nid); 1819 unsigned long start_pfn = pgdat->node_start_pfn; 1820 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; 1821 unsigned long pfn; 1822 struct page *pgdat_page = virt_to_page(pgdat); 1823 int i; 1824 1825 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 1826 unsigned long section_nr = pfn_to_section_nr(pfn); 1827 1828 if (!present_section_nr(section_nr)) 1829 continue; 1830 1831 if (pfn_to_nid(pfn) != nid) 1832 continue; 1833 1834 /* 1835 * some memory sections of this node are not removed, and we 1836 * can't offline node now. 1837 */ 1838 return; 1839 } 1840 1841 if (check_and_unmap_cpu_on_node(pgdat)) 1842 return; 1843 1844 /* 1845 * all memory/cpu of this node are removed, we can offline this 1846 * node now. 1847 */ 1848 node_set_offline(nid); 1849 unregister_one_node(nid); 1850 1851 if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page)) 1852 /* node data is allocated from boot memory */ 1853 return; 1854 1855 /* free waittable in each zone */ 1856 for (i = 0; i < MAX_NR_ZONES; i++) { 1857 struct zone *zone = pgdat->node_zones + i; 1858 1859 /* 1860 * wait_table may be allocated from boot memory, 1861 * here only free if it's allocated by vmalloc. 1862 */ 1863 if (is_vmalloc_addr(zone->wait_table)) 1864 vfree(zone->wait_table); 1865 } 1866 1867 /* 1868 * Since there is no way to guarentee the address of pgdat/zone is not 1869 * on stack of any kernel threads or used by other kernel objects 1870 * without reference counting or other symchronizing method, do not 1871 * reset node_data and free pgdat here. Just reset it to 0 and reuse 1872 * the memory when the node is online again. 1873 */ 1874 memset(pgdat, 0, sizeof(*pgdat)); 1875 } 1876 EXPORT_SYMBOL(try_offline_node); 1877 1878 /** 1879 * remove_memory 1880 * 1881 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 1882 * and online/offline operations before this call, as required by 1883 * try_offline_node(). 1884 */ 1885 void __ref remove_memory(int nid, u64 start, u64 size) 1886 { 1887 int ret; 1888 1889 BUG_ON(check_hotplug_memory_range(start, size)); 1890 1891 lock_memory_hotplug(); 1892 1893 /* 1894 * All memory blocks must be offlined before removing memory. Check 1895 * whether all memory blocks in question are offline and trigger a BUG() 1896 * if this is not the case. 1897 */ 1898 ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, 1899 check_memblock_offlined_cb); 1900 if (ret) { 1901 unlock_memory_hotplug(); 1902 BUG(); 1903 } 1904 1905 /* remove memmap entry */ 1906 firmware_map_remove(start, start + size, "System RAM"); 1907 1908 arch_remove_memory(start, size); 1909 1910 try_offline_node(nid); 1911 1912 unlock_memory_hotplug(); 1913 } 1914 EXPORT_SYMBOL_GPL(remove_memory); 1915 #endif /* CONFIG_MEMORY_HOTREMOVE */ 1916