/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memremap.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/compaction.h>

#include <asm/tlbflush.h>

#include "internal.h"

/*
 * online_page_callback contains a pointer to the current page onlining
 * function.  Initially it is generic_online_page().  If required, it can be
 * changed by calling set_online_page_callback() to register a callback and
 * restore_online_page_callback() to restore the generic callback.
 */

static void generic_online_page(struct page *page);

static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);
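/*
 * Reader/writer protocol for the structure below: get_online_mems() and
 * put_online_mems() take and drop a reference under mem_hotplug.lock, while
 * mem_hotplug_begin() sleeps until the refcount reaches zero and then keeps
 * the mutex held until mem_hotplug_done().
 */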
/* The same as the cpu_hotplug lock, but for memory hotplug. */
static struct {
	struct task_struct *active_writer;
	struct mutex lock; /* Synchronizes accesses to refcount, */
	/*
	 * Also blocks the new readers during
	 * an ongoing mem hotplug operation.
	 */
	int refcount;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map dep_map;
#endif
} mem_hotplug = {
	.active_writer = NULL,
	.lock = __MUTEX_INITIALIZER(mem_hotplug.lock),
	.refcount = 0,
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	.dep_map = {.name = "mem_hotplug.lock" },
#endif
};

/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */
#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map)
#define memhp_lock_acquire()      lock_map_acquire(&mem_hotplug.dep_map)
#define memhp_lock_release()      lock_map_release(&mem_hotplug.dep_map)

#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
bool memhp_auto_online;
#else
bool memhp_auto_online = true;
#endif
EXPORT_SYMBOL_GPL(memhp_auto_online);

static int __init setup_memhp_default_state(char *str)
{
	if (!strcmp(str, "online"))
		memhp_auto_online = true;
	else if (!strcmp(str, "offline"))
		memhp_auto_online = false;

	return 1;
}
__setup("memhp_default_state=", setup_memhp_default_state);

void get_online_mems(void)
{
	might_sleep();
	if (mem_hotplug.active_writer == current)
		return;
	memhp_lock_acquire_read();
	mutex_lock(&mem_hotplug.lock);
	mem_hotplug.refcount++;
	mutex_unlock(&mem_hotplug.lock);
}

void put_online_mems(void)
{
	if (mem_hotplug.active_writer == current)
		return;
	mutex_lock(&mem_hotplug.lock);

	if (WARN_ON(!mem_hotplug.refcount))
		mem_hotplug.refcount++; /* try to fix things up */

	if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer))
		wake_up_process(mem_hotplug.active_writer);
	mutex_unlock(&mem_hotplug.lock);
	memhp_lock_release();
}

void mem_hotplug_begin(void)
{
	mem_hotplug.active_writer = current;

	memhp_lock_acquire();
	for (;;) {
		mutex_lock(&mem_hotplug.lock);
		if (likely(!mem_hotplug.refcount))
			break;
		__set_current_state(TASK_UNINTERRUPTIBLE);
		mutex_unlock(&mem_hotplug.lock);
		schedule();
	}
}

void mem_hotplug_done(void)
{
	mem_hotplug.active_writer = NULL;
	mutex_unlock(&mem_hotplug.lock);
	memhp_lock_release();
}

/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;

	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
	if (!res)
		return ERR_PTR(-ENOMEM);

	res->name = "System RAM";
	res->start = start;
	res->end = start + size - 1;
	res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
	if (request_resource(&iomem_resource, res) < 0) {
		pr_debug("System RAM resource %pR cannot be added\n", res);
		kfree(res);
		return ERR_PTR(-EEXIST);
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
void get_page_bootmem(unsigned long info, struct page *page,
		      unsigned long type)
{
	page->lru.next = (struct list_head *) type;
	SetPagePrivate(page);
	set_page_private(page, info);
	page_ref_inc(page);
}
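/*
 * Drop the bootmem reference taken by get_page_bootmem().  When the last
 * reference goes away, the page is returned to the page allocator as an
 * ordinary free page.
 */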
void put_page_bootmem(struct page *page)
{
	unsigned long type;

	type = (unsigned long) page->lru.next;
	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);

	if (page_ref_dec_return(page) == 1) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		INIT_LIST_HEAD(&page->lru);
		free_reserved_page(page);
	}
}

#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	/* Get section's memmap address */
	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	/*
	 * Get page for the memmap's phys address
	 * XXX: need more consideration for sparse_vmemmap...
	 */
	page = virt_to_page(memmap);
	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

	/* remember memmap's page */
	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, SECTION_INFO);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
#else /* CONFIG_SPARSEMEM_VMEMMAP */
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	if (!pfn_valid(start_pfn))
		return;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
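/*
 * Record bootmem info for everything that backs this node: the pglist_data
 * itself, any zone wait tables allocated at boot, and the memmap and usemap
 * of every section that belongs to the node.
 */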
void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
{
	unsigned long i, pfn, end_pfn, nr_pages;
	int node = pgdat->node_id;
	struct page *page;
	struct zone *zone;

	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
	page = virt_to_page(pgdat);

	for (i = 0; i < nr_pages; i++, page++)
		get_page_bootmem(node, page, NODE_INFO);

	zone = &pgdat->node_zones[0];
	for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
		if (zone_is_initialized(zone)) {
			nr_pages = zone->wait_table_hash_nr_entries
				* sizeof(wait_queue_head_t);
			nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
			page = virt_to_page(zone->wait_table);

			for (i = 0; i < nr_pages; i++, page++)
				get_page_bootmem(node, page, NODE_INFO);
		}
	}

	pfn = pgdat->node_start_pfn;
	end_pfn = pgdat_end_pfn(pgdat);

	/* register section info */
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		/*
		 * Some platforms can assign the same pfn to multiple nodes -
		 * on node0 as well as nodeN.  To avoid registering a pfn
		 * against multiple nodes, check that this pfn does not
		 * already reside on some other node.
		 */
		if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
			register_page_bootmem_info_section(pfn);
	}
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */

static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn,
				     unsigned long end_pfn)
{
	unsigned long old_zone_end_pfn;

	zone_span_writelock(zone);

	old_zone_end_pfn = zone_end_pfn(zone);
	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
				zone->zone_start_pfn;

	zone_span_writeunlock(zone);
}

static void resize_zone(struct zone *zone, unsigned long start_pfn,
			unsigned long end_pfn)
{
	zone_span_writelock(zone);

	if (end_pfn - start_pfn) {
		zone->zone_start_pfn = start_pfn;
		zone->spanned_pages = end_pfn - start_pfn;
	} else {
		/*
		 * Keep this consistent with free_area_init_core():
		 * if spanned_pages == 0, keep zone_start_pfn == 0 as well.
		 */
		zone->zone_start_pfn = 0;
		zone->spanned_pages = 0;
	}

	zone_span_writeunlock(zone);
}

static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
			unsigned long end_pfn)
{
	enum zone_type zid = zone_idx(zone);
	int nid = zone->zone_pgdat->node_id;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn++)
		set_page_links(pfn_to_page(pfn), zid, nid, pfn);
}

/*
 * Can fail with -ENOMEM from allocating a wait table with vmalloc() or
 * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic().
 */
static int __ref ensure_zone_is_initialized(struct zone *zone,
			unsigned long start_pfn, unsigned long num_pages)
{
	if (!zone_is_initialized(zone))
		return init_currently_empty_zone(zone, start_pfn, num_pages);

	return 0;
}

static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z1_start_pfn;

	ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are higher than @z2 */
	if (end_pfn > zone_end_pfn(z2))
		goto out_fail;
	/* the moved-out part must be at the leftmost of @z2 */
	if (start_pfn > z2->zone_start_pfn)
		goto out_fail;
	/* must be included in / overlap with @z2 */
	if (end_pfn <= z2->zone_start_pfn)
		goto out_fail;

	/* use start_pfn for z1's start_pfn if z1 is empty */
	if (!zone_is_empty(z1))
		z1_start_pfn = z1->zone_start_pfn;
	else
		z1_start_pfn = start_pfn;

	resize_zone(z1, z1_start_pfn, end_pfn);
	resize_zone(z2, end_pfn, zone_end_pfn(z2));

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z1, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}
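/*
 * Mirror of move_pfn_range_left(): move [start_pfn, end_pfn) out of the tail
 * of @z1 and into the following zone @z2.
 */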
static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z2_end_pfn;

	ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are lower than @z1 */
	if (z1->zone_start_pfn > start_pfn)
		goto out_fail;
	/* the moved-out part must be at the rightmost of @z1 */
	if (zone_end_pfn(z1) > end_pfn)
		goto out_fail;
	/* must be included in / overlap with @z1 */
	if (start_pfn >= zone_end_pfn(z1))
		goto out_fail;

	/* use end_pfn for z2's end_pfn if z2 is empty */
	if (!zone_is_empty(z2))
		z2_end_pfn = zone_end_pfn(z2);
	else
		z2_end_pfn = end_pfn;

	resize_zone(z1, z1->zone_start_pfn, start_pfn);
	resize_zone(z2, start_pfn, z2_end_pfn);

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z2, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
				      unsigned long end_pfn)
{
	unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);

	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
					pgdat->node_start_pfn;
}

static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int nid = pgdat->node_id;
	int zone_type;
	unsigned long flags, pfn;
	int ret;

	zone_type = zone - pgdat->node_zones;
	ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
	if (ret)
		return ret;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
			phys_start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
	memmap_init_zone(nr_pages, nid, zone_type,
			 phys_start_pfn, MEMMAP_HOTPLUG);

	/* online_page_range is called later and expects pages reserved */
	for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) {
		if (!pfn_valid(pfn))
			continue;

		SetPageReserved(pfn_to_page(pfn));
	}
	return 0;
}
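/*
 * Hot-add a single memory section: allocate its memmap, grow the zone and
 * node spans to cover it, and register it with the memory sysfs layer.
 * Returns -EEXIST if the section is already present.
 */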
static int __meminit __add_section(int nid, struct zone *zone,
				   unsigned long phys_start_pfn)
{
	int ret;

	if (pfn_valid(phys_start_pfn))
		return -EEXIST;

	ret = sparse_add_one_section(zone, phys_start_pfn);
	if (ret < 0)
		return ret;

	ret = __add_zone(zone, phys_start_pfn);
	if (ret < 0)
		return ret;

	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}

/*
 * Reasonably generic function for adding memory.  It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
		      unsigned long nr_pages)
{
	unsigned long i;
	int err = 0;
	int start_sec, end_sec;
	struct vmem_altmap *altmap;

	clear_zone_contiguous(zone);

	/* when initializing the mem_map, align the hot-added range to sections */
	start_sec = pfn_to_section_nr(phys_start_pfn);
	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

	altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn));
	if (altmap) {
		/*
		 * Validate altmap is within bounds of the total request
		 */
		if (altmap->base_pfn != phys_start_pfn
				|| vmem_altmap_offset(altmap) > nr_pages) {
			pr_warn_once("memory add fail, invalid altmap\n");
			err = -EINVAL;
			goto out;
		}
		altmap->alloc = 0;
	}

	for (i = start_sec; i <= end_sec; i++) {
		err = __add_section(nid, zone, section_nr_to_pfn(i));

		/*
		 * -EEXIST is finally dealt with by the ioresource collision
		 * check, see add_memory() => register_memory_resource().
		 * A warning will be printed if there is a collision.
		 */
		if (err && (err != -EEXIST))
			break;
		err = 0;
	}
	vmemmap_populate_print_last();
out:
	set_zone_contiguous(zone);
	return err;
}
EXPORT_SYMBOL_GPL(__add_pages);

#ifdef CONFIG_MEMORY_HOTREMOVE
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static int find_smallest_section_pfn(int nid, struct zone *zone,
				     unsigned long start_pfn,
				     unsigned long end_pfn)
{
	struct mem_section *ms;

	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(start_pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(start_pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
			continue;

		return start_pfn;
	}

	return 0;
}

/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
static int find_biggest_section_pfn(int nid, struct zone *zone,
				    unsigned long start_pfn,
				    unsigned long end_pfn)
{
	struct mem_section *ms;
	unsigned long pfn;

	/* pfn is the end pfn of a memory section. */
	pfn = end_pfn - 1;
	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(pfn)))
			continue;

		return pfn;
	}

	return 0;
}
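/*
 * Counterpart of grow_zone_span() for hot-remove: if the section being
 * removed sits at either end of the zone, pull the zone boundary in to the
 * next valid section; if no valid section remains, mark the zone empty.
 */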
static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
			     unsigned long end_pfn)
{
	unsigned long zone_start_pfn = zone->zone_start_pfn;
	unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
	unsigned long zone_end_pfn = z;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = zone_to_nid(zone);

	zone_span_writelock(zone);
	if (zone_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the zone, we need
		 * to shrink zone->zone_start_pfn and zone->spanned_pages.
		 * In this case, find the second smallest valid mem_section
		 * and use it to shrink the zone.
		 */
		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
						zone_end_pfn);
		if (pfn) {
			zone->zone_start_pfn = pfn;
			zone->spanned_pages = zone_end_pfn - pfn;
		}
	} else if (zone_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the zone, we only
		 * need to shrink zone->spanned_pages.
		 * In this case, find the second biggest valid mem_section
		 * and use it to shrink the zone.
		 */
		pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
					       start_pfn);
		if (pfn)
			zone->spanned_pages = pfn - zone_start_pfn + 1;
	}

	/*
	 * If the section is neither the biggest nor the smallest mem_section
	 * in the zone, removing it only creates a hole and the zone span does
	 * not change.  But the zone may now consist of nothing but holes, so
	 * check whether it still contains any valid section.
	 */
	pfn = zone_start_pfn;
	for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (page_zone(pfn_to_page(pfn)) != zone)
			continue;

		/* skip the section that is being removed */
		if (start_pfn == pfn)
			continue;

		/* we found a valid section, nothing more to do */
		zone_span_writeunlock(zone);
		return;
	}

	/* The zone has no valid section */
	zone->zone_start_pfn = 0;
	zone->spanned_pages = 0;
	zone_span_writeunlock(zone);
}
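/*
 * Same idea as shrink_zone_span(), but for the node span: adjust
 * node_start_pfn/node_spanned_pages when a section at either end of the
 * node is removed, and mark the node empty when no valid section is left.
 */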
static void shrink_pgdat_span(struct pglist_data *pgdat,
			      unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
	unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
	unsigned long pgdat_end_pfn = p;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = pgdat->node_id;

	if (pgdat_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the pgdat, we need
		 * to shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
		 * In this case, find the second smallest valid mem_section
		 * and use it to shrink the pgdat.
		 */
		pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
						pgdat_end_pfn);
		if (pfn) {
			pgdat->node_start_pfn = pfn;
			pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
		}
	} else if (pgdat_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the pgdat, we only
		 * need to shrink pgdat->node_spanned_pages.
		 * In this case, find the second biggest valid mem_section
		 * and use it to shrink the pgdat.
		 */
		pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
					       start_pfn);
		if (pfn)
			pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
	}

	/*
	 * If the section is neither the biggest nor the smallest mem_section
	 * in the pgdat, removing it only creates a hole and the pgdat span
	 * does not change.  But the pgdat may now consist of nothing but
	 * holes, so check whether it still contains any valid section.
	 */
	pfn = pgdat_start_pfn;
	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/* skip the section that is being removed */
		if (start_pfn == pfn)
			continue;

		/* we found a valid section, nothing more to do */
		return;
	}

	/* The pgdat has no valid section */
	pgdat->node_start_pfn = 0;
	pgdat->node_spanned_pages = 0;
}

static void __remove_zone(struct zone *zone, unsigned long start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int zone_type;
	unsigned long flags;

	zone_type = zone - pgdat->node_zones;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
}
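/*
 * Tear down a single memory section: unregister it from the memory sysfs
 * layer, shrink the zone/node spans that covered it and free its memmap.
 */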
static int __remove_section(struct zone *zone, struct mem_section *ms,
			    unsigned long map_offset)
{
	unsigned long start_pfn;
	int scn_nr;
	int ret = -EINVAL;

	if (!valid_section(ms))
		return ret;

	ret = unregister_memory_section(ms);
	if (ret)
		return ret;

	scn_nr = __section_nr(ms);
	start_pfn = section_nr_to_pfn(scn_nr);
	__remove_zone(zone, start_pfn);

	sparse_remove_one_section(zone, ms, map_offset);
	return 0;
}

/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing.  Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
		   unsigned long nr_pages)
{
	unsigned long i;
	unsigned long map_offset = 0;
	int sections_to_remove, ret = 0;

	/* In the ZONE_DEVICE case device driver owns the memory region */
	if (is_dev_zone(zone)) {
		struct page *page = pfn_to_page(phys_start_pfn);
		struct vmem_altmap *altmap;

		altmap = to_vmem_altmap((unsigned long) page);
		if (altmap)
			map_offset = vmem_altmap_offset(altmap);
	} else {
		resource_size_t start, size;

		start = phys_start_pfn << PAGE_SHIFT;
		size = nr_pages * PAGE_SIZE;

		ret = release_mem_region_adjustable(&iomem_resource, start,
						    size);
		if (ret) {
			resource_size_t endres = start + size - 1;

			pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
				&start, &endres, ret);
		}
	}

	clear_zone_contiguous(zone);

	/*
	 * We can only remove entire sections
	 */
	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
	BUG_ON(nr_pages % PAGES_PER_SECTION);

	sections_to_remove = nr_pages / PAGES_PER_SECTION;
	for (i = 0; i < sections_to_remove; i++) {
		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;

		ret = __remove_section(zone, __pfn_to_section(pfn), map_offset);
		map_offset = 0;
		if (ret)
			break;
	}

	set_zone_contiguous(zone);

	return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);
#endif /* CONFIG_MEMORY_HOTREMOVE */

int set_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == generic_online_page) {
		online_page_callback = callback;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(set_online_page_callback);

int restore_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == callback) {
		online_page_callback = generic_online_page;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);

void __online_page_set_limits(struct page *page)
{
}
EXPORT_SYMBOL_GPL(__online_page_set_limits);

void __online_page_increment_counters(struct page *page)
{
	adjust_managed_page_count(page, 1);
}
EXPORT_SYMBOL_GPL(__online_page_increment_counters);

void __online_page_free(struct page *page)
{
	__free_reserved_page(page);
}
EXPORT_SYMBOL_GPL(__online_page_free);

static void generic_online_page(struct page *page)
{
	__online_page_set_limits(page);
	__online_page_increment_counters(page);
	__online_page_free(page);
}
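/*
 * walk_system_ram_range() callback: hand every page in the range to the
 * registered online_page_callback and accumulate the number of onlined
 * pages in *arg.
 */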
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			      void *arg)
{
	unsigned long i;
	unsigned long onlined_pages = *(unsigned long *)arg;
	struct page *page;

	if (PageReserved(pfn_to_page(start_pfn)))
		for (i = 0; i < nr_pages; i++) {
			page = pfn_to_page(start_pfn + i);
			(*online_page_callback)(page);
			onlined_pages++;
		}
	*(unsigned long *)arg = onlined_pages;
	return 0;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
 * normal memory.
 */
static bool can_online_high_movable(struct zone *zone)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure every online node has NORMAL memory */
static bool can_online_high_movable(struct zone *zone)
{
	return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which state of node_states will be changed when online memory */
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);
	enum zone_type zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * so set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, so set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * If the memory to be onlined is in a zone of 0...zone_last, and
	 * the zones of 0...zone_last don't have memory before onlining, we
	 * will need to set the node in node_states[N_NORMAL_MEMORY] after
	 * the memory is onlined.
	 */
	if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * so set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * so set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * If the node doesn't have memory before onlining, we will need to
	 * set the node in node_states[N_MEMORY] after the memory is onlined.
	 */
	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
	else
		arg->status_change_nid = -1;
}

static void node_states_set_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_set_state(node, N_HIGH_MEMORY);

	node_set_state(node, N_MEMORY);
}
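/*
 * Onlining sequence implemented below: notify MEM_GOING_ONLINE, rebuild the
 * zonelists if the zone was previously unpopulated, walk the range handing
 * each page to the online callback, fix up present page counters, set the
 * node states and start kswapd/kcompactd, then notify MEM_ONLINE (or
 * MEM_CANCEL_ONLINE on failure).
 */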
/* Must be protected by mem_hotplug_begin() */
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
	unsigned long flags;
	unsigned long onlined_pages = 0;
	struct zone *zone;
	int need_zonelists_rebuild = 0;
	int nid;
	int ret;
	struct memory_notify arg;

	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
	 * memory_block->state_mutex.
	 */
	zone = page_zone(pfn_to_page(pfn));

	if ((zone_idx(zone) > ZONE_NORMAL ||
	    online_type == MMOP_ONLINE_MOVABLE) &&
	    !can_online_high_movable(zone))
		return -EINVAL;

	if (online_type == MMOP_ONLINE_KERNEL &&
	    zone_idx(zone) == ZONE_MOVABLE) {
		if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
			return -EINVAL;
	}
	if (online_type == MMOP_ONLINE_MOVABLE &&
	    zone_idx(zone) == ZONE_MOVABLE - 1) {
		if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
			return -EINVAL;
	}

	/* The code above may have changed the zone of the pfn range */
	zone = page_zone(pfn_to_page(pfn));

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_online(nr_pages, zone, &arg);

	nid = zone_to_nid(zone);

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_addition;

	/*
	 * If this zone is not populated, then it is not in zonelist.
	 * This means the page allocator ignores this zone.
	 * So, zonelist must be updated after online.
	 */
	mutex_lock(&zonelists_mutex);
	if (!populated_zone(zone)) {
		need_zonelists_rebuild = 1;
		build_all_zonelists(NULL, zone);
	}

	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
		online_pages_range);
	if (ret) {
		if (need_zonelists_rebuild)
			zone_pcp_reset(zone);
		mutex_unlock(&zonelists_mutex);
		goto failed_addition;
	}

	zone->present_pages += onlined_pages;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	zone->zone_pgdat->node_present_pages += onlined_pages;
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	if (onlined_pages) {
		node_states_set_node(nid, &arg);
		if (need_zonelists_rebuild)
			build_all_zonelists(NULL, NULL);
		else
			zone_pcp_update(zone);
	}

	mutex_unlock(&zonelists_mutex);

	init_per_zone_wmark_min();

	if (onlined_pages) {
		kswapd_run(nid);
		kcompactd_run(nid);
	}

	vm_total_pages = nr_free_pagecache_pages();

	writeback_set_ratelimit();

	if (onlined_pages)
		memory_notify(MEM_ONLINE, &arg);
	return 0;

failed_addition:
	pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
		 (unsigned long long) pfn << PAGE_SHIFT,
		 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
	memory_notify(MEM_CANCEL_ONLINE, &arg);
	return ret;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

static void reset_node_present_pages(pg_data_t *pgdat)
{
	struct zone *z;

	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
		z->present_pages = 0;

	pgdat->node_present_pages = 0;
}
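/*
 * Allocate (or reuse) the pglist_data for a node that is being brought up by
 * memory hot-add: initialize its zones as empty, build zonelists for it and
 * clear the managed/present page counters so that onlining starts from a
 * clean state.
 */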
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
	struct pglist_data *pgdat;
	unsigned long zones_size[MAX_NR_ZONES] = {0};
	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	unsigned long start_pfn = PFN_DOWN(start);

	pgdat = NODE_DATA(nid);
	if (!pgdat) {
		pgdat = arch_alloc_nodedata(nid);
		if (!pgdat)
			return NULL;

		arch_refresh_nodedata(nid, pgdat);
	} else {
		/* Reset the nr_zones and classzone_idx to 0 before reuse */
		pgdat->nr_zones = 0;
		pgdat->classzone_idx = 0;
	}

	/* we can use NODE_DATA(nid) from here */

	/* init node's zones as empty zones, we don't have any present pages.*/
	free_area_init_node(nid, zones_size, start_pfn, zholes_size);

	/*
	 * The node we allocated has no zone fallback lists.  To avoid
	 * accessing a not-initialized zonelist, build one here.
	 */
	mutex_lock(&zonelists_mutex);
	build_all_zonelists(pgdat, NULL);
	mutex_unlock(&zonelists_mutex);

	/*
	 * zone->managed_pages is set to an approximate value in
	 * free_area_init_core(), which would cause
	 * /sys/device/system/node/nodeX/meminfo to report wrong data.
	 * So reset it to 0 before any memory is onlined.
	 */
	reset_node_managed_pages(pgdat);

	/*
	 * When memory is hot-added, all the memory is in offline state.  So
	 * clear all zones' present_pages because they will be updated in
	 * online_pages() and offline_pages().
	 */
	reset_node_present_pages(pgdat);

	return pgdat;
}

static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
	arch_refresh_nodedata(nid, NULL);
	arch_free_nodedata(pgdat);
}


/**
 * try_online_node - online a node if offlined
 *
 * called by cpu_up() to online a node without onlined memory.
 */
int try_online_node(int nid)
{
	pg_data_t *pgdat;
	int ret;

	if (node_online(nid))
		return 0;

	mem_hotplug_begin();
	pgdat = hotadd_new_pgdat(nid, 0);
	if (!pgdat) {
		pr_err("Cannot online node %d due to NULL pgdat\n", nid);
		ret = -ENOMEM;
		goto out;
	}
	node_set_online(nid);
	ret = register_one_node(nid);
	BUG_ON(ret);

	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(NULL, NULL);
		mutex_unlock(&zonelists_mutex);
	}

out:
	mem_hotplug_done();
	return ret;
}

static int check_hotplug_memory_range(u64 start, u64 size)
{
	u64 start_pfn = PFN_DOWN(start);
	u64 nr_pages = size >> PAGE_SHIFT;

	/* Memory range must be aligned with section */
	if ((start_pfn & ~PAGE_SECTION_MASK) ||
	    (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) {
		pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n",
		       (unsigned long long)start,
		       (unsigned long long)size);
		return -EINVAL;
	}

	return 0;
}
/*
 * If the movable zone has already been set up, newly added memory should be
 * checked: if its address is higher than the movable zone, it should be
 * added as movable.  Without this check, the movable zone may overlap with
 * another zone.
 */
static int should_add_memory_movable(int nid, u64 start, u64 size)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	pg_data_t *pgdat = NODE_DATA(nid);
	struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE;

	if (zone_is_empty(movable_zone))
		return 0;

	if (movable_zone->zone_start_pfn <= start_pfn)
		return 1;

	return 0;
}

int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
		bool for_device)
{
#ifdef CONFIG_ZONE_DEVICE
	if (for_device)
		return ZONE_DEVICE;
#endif
	if (should_add_memory_movable(nid, start, size))
		return ZONE_MOVABLE;

	return zone_default;
}

static int online_memory_block(struct memory_block *mem, void *arg)
{
	return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
}
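/*
 * Hot-add the memory described by @res to node @nid: record the range in
 * memblock, allocate a pgdat if the node was offline, let the architecture
 * create the direct mapping and memmap, register a firmware map entry and,
 * if @online is set, online the new memory blocks right away.
 */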
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
int __ref add_memory_resource(int nid, struct resource *res, bool online)
{
	u64 start, size;
	pg_data_t *pgdat = NULL;
	bool new_pgdat;
	bool new_node;
	int ret;

	start = res->start;
	size = resource_size(res);

	ret = check_hotplug_memory_range(start, size);
	if (ret)
		return ret;

	{	/* Stupid hack to suppress address-never-null warning */
		void *p = NODE_DATA(nid);
		new_pgdat = !p;
	}

	mem_hotplug_begin();

	/*
	 * Add new range to memblock so that when hotadd_new_pgdat() is called
	 * to allocate new pgdat, get_pfn_range_for_nid() will be able to find
	 * this new range and calculate total pages correctly.  The range will
	 * be removed at hot-remove time.
	 */
	memblock_add_node(start, size, nid);

	new_node = !node_online(nid);
	if (new_node) {
		pgdat = hotadd_new_pgdat(nid, start);
		ret = -ENOMEM;
		if (!pgdat)
			goto error;
	}

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size, false);
	if (ret < 0)
		goto error;

	/* we online node here. we can't roll back from here. */
	node_set_online(nid);

	if (new_node) {
		ret = register_one_node(nid);
		/*
		 * If the sysfs file of the new node can't be created, cpus
		 * on the node can't be hot-added.  There is no rollback way
		 * now, so check it with BUG_ON() to catch it reluctantly.
		 */
		BUG_ON(ret);
	}

	/* create new memmap entry */
	firmware_map_add_hotplug(start, start + size, "System RAM");

	/* online pages if requested */
	if (online)
		walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
				  NULL, online_memory_block);

	goto out;

error:
	/* rollback pgdat allocation and others */
	if (new_pgdat)
		rollback_node_hotadd(nid, pgdat);
	memblock_remove(start, size);

out:
	mem_hotplug_done();
	return ret;
}
EXPORT_SYMBOL_GPL(add_memory_resource);

int __ref add_memory(int nid, u64 start, u64 size)
{
	struct resource *res;
	int ret;

	res = register_memory_resource(start, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = add_memory_resource(nid, res, memhp_auto_online);
	if (ret < 0)
		release_memory_resource(res);
	return ret;
}
EXPORT_SYMBOL_GPL(add_memory);

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
 * set and the size of the free page is given by page_order().  Using this,
 * the function determines if the pageblock contains only free pages.
 * Due to buddy constraints, a free page at least the size of a pageblock will
 * be located at the start of the pageblock.
 */
static inline int pageblock_free(struct page *page)
{
	return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
	/* Ensure the starting page is pageblock-aligned */
	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));

	/* If the entire pageblock is free, move to the end of free page */
	if (pageblock_free(page)) {
		int order;
		/* be careful. we don't have locks, page_order can be changed.*/
		order = page_order(page);
		if ((order < MAX_ORDER) && (order >= pageblock_order))
			return page + (1 << order);
	}

	return page + pageblock_nr_pages;
}

/* Checks if this range of memory is likely to be hot-removable. */
bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
	struct page *page = pfn_to_page(start_pfn);
	struct page *end_page = page + nr_pages;

	/* Check the starting page of each pageblock within the range */
	for (; page < end_page; page = next_active_pageblock(page)) {
		if (!is_pageblock_removable_nolock(page))
			return false;
		cond_resched();
	}

	/* All pageblocks in the memory block are likely to be hot-removable */
	return true;
}
/*
 * Confirm that all pages in a range [start, end) belong to the same zone.
 */
int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn, sec_end_pfn;
	struct zone *zone = NULL;
	struct page *page;
	int i;

	for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn);
	     pfn < end_pfn;
	     pfn = sec_end_pfn + 1, sec_end_pfn += PAGES_PER_SECTION) {
		/* Make sure the memory section is present first */
		if (!present_section_nr(pfn_to_section_nr(pfn)))
			continue;
		for (; pfn < sec_end_pfn && pfn < end_pfn;
		     pfn += MAX_ORDER_NR_PAGES) {
			i = 0;
			/* This is just a CONFIG_HOLES_IN_ZONE check.*/
			while ((i < MAX_ORDER_NR_PAGES) &&
				!pfn_valid_within(pfn + i))
				i++;
			if (i == MAX_ORDER_NR_PAGES)
				continue;
			page = pfn_to_page(pfn + i);
			if (zone && page_zone(page) != zone)
				return 0;
			zone = page_zone(page);
		}
	}
	return 1;
}

/*
 * Scan the pfn range [start,end) to find movable/migratable pages (LRU pages
 * and hugepages).  We scan by pfn because it's much easier than scanning over
 * a linked list.  This function returns the pfn of the first found movable
 * page if one is found, otherwise 0.
 */
static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
{
	unsigned long pfn;
	struct page *page;

	for (pfn = start; pfn < end; pfn++) {
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageLRU(page))
				return pfn;
			if (PageHuge(page)) {
				if (page_huge_active(page))
					return pfn;
				else
					pfn = round_up(pfn + 1,
						1 << compound_order(page)) - 1;
			}
		}
	}
	return 0;
}

#define NR_OFFLINE_AT_ONCE_PAGES	(256)
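/*
 * Isolate the LRU pages and active hugepages in [start_pfn, end_pfn) and
 * migrate them out of the range, handling at most NR_OFFLINE_AT_ONCE_PAGES
 * pages per call.  Pages that cannot be isolated make the call fail with
 * -EBUSY so that the caller can retry.
 */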
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page;
	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
	int not_managed = 0;
	int ret = 0;
	LIST_HEAD(source);

	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);

		if (PageHuge(page)) {
			struct page *head = compound_head(page);
			pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
			if (compound_order(head) > PFN_SECTION_SHIFT) {
				ret = -EBUSY;
				break;
			}
			if (isolate_huge_page(page, &source))
				move_pages -= 1 << compound_order(head);
			continue;
		}

		if (!get_page_unless_zero(page))
			continue;
		/*
		 * We can skip free pages.  And we can only deal with pages
		 * on the LRU.
		 */
		ret = isolate_lru_page(page);
		if (!ret) { /* Success */
			put_page(page);
			list_add_tail(&page->lru, &source);
			move_pages--;
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));
		} else {
#ifdef CONFIG_DEBUG_VM
			pr_alert("removing pfn %lx from LRU failed\n", pfn);
			dump_page(page, "failed to remove from LRU");
#endif
			put_page(page);
			/*
			 * Because we don't hold the big zone->lock, we have
			 * to check this again here.
			 */
			if (page_count(page)) {
				not_managed++;
				ret = -EBUSY;
				break;
			}
		}
	}
	if (!list_empty(&source)) {
		if (not_managed) {
			putback_movable_pages(&source);
			goto out;
		}

		/*
		 * alloc_migrate_target should be improooooved!!
		 * migrate_pages returns # of failed pages.
		 */
		ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
		if (ret)
			putback_movable_pages(&source);
	}
out:
	return ret;
}

/*
 * remove from free_area[] and mark all as Reserved.
 */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
			void *data)
{
	__offline_isolated_pages(start, start + nr_pages);
	return 0;
}

static void
offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
				offline_isolated_pages_cb);
}

/*
 * Check that all pages in the range, recorded as a memory resource, are
 * isolated.
 */
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
			void *data)
{
	int ret;
	long offlined = *(long *)data;

	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
	offlined = nr_pages;
	if (!ret)
		*(long *)data += offlined;
	return ret;
}

static long
check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
	long offlined = 0;
	int ret;

	ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
			check_pages_isolated_cb);
	if (ret < 0)
		offlined = (long)ret;
	return offlined;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
 * normal memory.
 */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure the node has NORMAL memory if it is still online */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt;

	for (zt = 0; zt <= ZONE_NORMAL; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	if (present_pages > nr_pages)
		return true;

	present_pages = 0;
	for (; zt <= ZONE_MOVABLE; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	/*
	 * we can't offline the last normal memory until all
	 * higher memory is offlined.
	 */
	return present_pages == 0;
}
#endif /* CONFIG_MOVABLE_NODE */
static int __init cmdline_parse_movable_node(char *p)
{
#ifdef CONFIG_MOVABLE_NODE
	/*
	 * Memory used by the kernel cannot be hot-removed because Linux
	 * cannot migrate the kernel pages.  When memory hotplug is
	 * enabled, we should prevent memblock from allocating memory
	 * for the kernel.
	 *
	 * ACPI SRAT records all hotpluggable memory ranges.  But before
	 * SRAT is parsed, we don't know about it.
	 *
	 * The kernel image is loaded into memory very early; we cannot
	 * prevent that.  So on a NUMA system, we mark any node the kernel
	 * resides on as un-hotpluggable.
	 *
	 * Since on modern servers one node can easily have tens of
	 * gigabytes of memory, we can assume the memory around the kernel
	 * image is also un-hotpluggable.  So before SRAT is parsed, just
	 * allocate memory near the kernel image to try our best to keep
	 * the kernel away from hotpluggable memory.
	 */
	memblock_set_bottom_up(true);
	movable_node_enabled = true;
#else
	pr_warn("movable_node option not supported\n");
#endif
	return 0;
}
early_param("movable_node", cmdline_parse_movable_node);
/* check which state of node_states will be changed when offline memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
		struct zone *zone, struct memory_notify *arg)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt, zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * so set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, so set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
	 * If the memory to be offlined is in a zone of 0...zone_last,
	 * and it is the last present memory there, 0...zone_last will
	 * become empty after the offlining, so we will need to clear
	 * the node from node_states[N_NORMAL_MEMORY].
	 */
	for (zt = 0; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_normal = zone_to_nid(zone);
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * so set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * so set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_high = zone_to_nid(zone);
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
	 */
	zone_last = ZONE_MOVABLE;

	/*
	 * Check whether node_states[N_HIGH_MEMORY] will be changed.
	 * If we try to offline the last present @nr_pages from the node,
	 * we will need to clear the node from node_states[N_HIGH_MEMORY].
	 */
	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (nr_pages >= present_pages)
		arg->status_change_nid = zone_to_nid(zone);
	else
		arg->status_change_nid = -1;
}

static void node_states_clear_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_clear_state(node, N_NORMAL_MEMORY);

	if ((N_MEMORY != N_NORMAL_MEMORY) &&
	    (arg->status_change_nid_high >= 0))
		node_clear_state(node, N_HIGH_MEMORY);

	if ((N_MEMORY != N_HIGH_MEMORY) &&
	    (arg->status_change_nid >= 0))
		node_clear_state(node, N_MEMORY);
}
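/*
 * Offlining sequence implemented below: isolate the pageblocks, notify
 * MEM_GOING_OFFLINE, repeatedly migrate movable pages out of the range until
 * it is empty (or the timeout expires), dissolve free hugepages, take the
 * isolated pages off the buddy lists, fix up the zone/node counters and
 * finally notify MEM_OFFLINE (or MEM_CANCEL_OFFLINE on failure).
 */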
static int __ref __offline_pages(unsigned long start_pfn,
		  unsigned long end_pfn, unsigned long timeout)
{
	unsigned long pfn, nr_pages, expire;
	long offlined_pages;
	int ret, drain, retry_max, node;
	unsigned long flags;
	struct zone *zone;
	struct memory_notify arg;

	/* at least, alignment against pageblock is necessary */
	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
		return -EINVAL;
	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
		return -EINVAL;
	/*
	 * This makes hotplug much easier and more readable; for now we
	 * assume the whole range lies within a single zone.
	 */
	if (!test_pages_in_a_zone(start_pfn, end_pfn))
		return -EINVAL;

	zone = page_zone(pfn_to_page(start_pfn));
	node = zone_to_nid(zone);
	nr_pages = end_pfn - start_pfn;

	if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
		return -EINVAL;

	/* set above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn,
				       MIGRATE_MOVABLE, true);
	if (ret)
		return ret;

	arg.start_pfn = start_pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_offline(nr_pages, zone, &arg);

	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_removal;

	pfn = start_pfn;
	expire = jiffies + timeout;
	drain = 0;
	retry_max = 5;
repeat:
	/* start memory hot removal */
	ret = -EAGAIN;
	if (time_after(jiffies, expire))
		goto failed_removal;
	ret = -EINTR;
	if (signal_pending(current))
		goto failed_removal;
	ret = 0;
	if (drain) {
		lru_add_drain_all();
		cond_resched();
		drain_all_pages(zone);
	}

	pfn = scan_movable_pages(start_pfn, end_pfn);
	if (pfn) { /* We have movable pages */
		ret = do_migrate_range(pfn, end_pfn);
		if (!ret) {
			drain = 1;
			goto repeat;
		} else {
			if (ret < 0)
				if (--retry_max == 0)
					goto failed_removal;
			yield();
			drain = 1;
			goto repeat;
		}
	}
	/* drain all zone's lru pagevec, this is asynchronous... */
	lru_add_drain_all();
	yield();
	/* drain pcp pages, this is synchronous. */
	drain_all_pages(zone);
	/*
	 * dissolve free hugepages in the memory block before doing offlining
	 * actually in order to make hugetlbfs's object counting consistent.
	 */
	dissolve_free_huge_pages(start_pfn, end_pfn);
	/* check again */
	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
	if (offlined_pages < 0) {
		ret = -EBUSY;
		goto failed_removal;
	}
	pr_info("Offlined Pages %ld\n", offlined_pages);
	/* Ok, all of our target is isolated.
	   We cannot do rollback at this point. */
	offline_isolated_pages(start_pfn, end_pfn);
	/* reset pagetype flags and makes migrate type to be MOVABLE */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	/* removal success */
	adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
	zone->present_pages -= offlined_pages;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	zone->zone_pgdat->node_present_pages -= offlined_pages;
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	init_per_zone_wmark_min();

	if (!populated_zone(zone)) {
		zone_pcp_reset(zone);
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(NULL, NULL);
		mutex_unlock(&zonelists_mutex);
	} else
		zone_pcp_update(zone);

	node_states_clear_node(node, &arg);
	if (arg.status_change_nid >= 0) {
		kswapd_stop(node);
		kcompactd_stop(node);
	}

	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();

	memory_notify(MEM_OFFLINE, &arg);
	return 0;

failed_removal:
	pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n",
		 (unsigned long long) start_pfn << PAGE_SHIFT,
		 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
	memory_notify(MEM_CANCEL_OFFLINE, &arg);
	/* pushback to free area */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	return ret;
}

/* Must be protected by mem_hotplug_begin() */
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
/**
 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
 * @start_pfn: start pfn of the memory range
 * @end_pfn: end pfn of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory section walked
 *
 * This function walks through all present mem sections in range
 * [start_pfn, end_pfn) and call func on each mem section.
 *
 * Returns the return value of func.
 */
int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
		void *arg, int (*func)(struct memory_block *, void *))
{
	struct memory_block *mem = NULL;
	struct mem_section *section;
	unsigned long pfn, section_nr;
	int ret;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		section_nr = pfn_to_section_nr(pfn);
		if (!present_section_nr(section_nr))
			continue;

		section = __nr_to_section(section_nr);
		/* same memblock? */
		if (mem)
			if ((section_nr >= mem->start_section_nr) &&
			    (section_nr <= mem->end_section_nr))
				continue;

		mem = find_memory_block_hinted(section, mem);
		if (!mem)
			continue;

		ret = func(mem, arg);
		if (ret) {
			kobject_put(&mem->dev.kobj);
			return ret;
		}
	}

	if (mem)
		kobject_put(&mem->dev.kobj);

	return 0;
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
	int ret = !is_memblock_offlined(mem);

	if (unlikely(ret)) {
		phys_addr_t beginpa, endpa;

		beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
		endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
		pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
			&beginpa, &endpa);
	}

	return ret;
}

static int check_cpu_on_node(pg_data_t *pgdat)
{
	int cpu;

	for_each_present_cpu(cpu) {
		if (cpu_to_node(cpu) == pgdat->node_id)
			/*
			 * A cpu on this node has not been removed yet,
			 * so we can't offline this node.
			 */
			return -EBUSY;
	}

	return 0;
}

static void unmap_cpu_on_node(pg_data_t *pgdat)
{
#ifdef CONFIG_ACPI_NUMA
	int cpu;

	for_each_possible_cpu(cpu)
		if (cpu_to_node(cpu) == pgdat->node_id)
			numa_clear_node(cpu);
#endif
}

static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
{
	int ret;

	ret = check_cpu_on_node(pgdat);
	if (ret)
		return ret;

	/*
	 * The node is going to be offlined when we get here, so we can
	 * clear cpu_to_node() now.
	 */

	unmap_cpu_on_node(pgdat);
	return 0;
}

/**
 * try_offline_node
 *
 * Offline a node if all memory sections and cpus of the node are removed.
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call.
 */
void try_offline_node(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	unsigned long start_pfn = pgdat->node_start_pfn;
	unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
	unsigned long pfn;
	int i;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);

		if (!present_section_nr(section_nr))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/*
		 * Some memory sections of this node have not been removed,
		 * so we can't offline the node now.
		 */
		return;
	}

	if (check_and_unmap_cpu_on_node(pgdat))
		return;

	/*
	 * All memory/cpus of this node have been removed; we can offline
	 * the node now.
	 */
	node_set_offline(nid);
	unregister_one_node(nid);

	/* free waittable in each zone */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		struct zone *zone = pgdat->node_zones + i;

		/*
		 * wait_table may be allocated from boot memory,
		 * so only free it here if it was allocated by vmalloc.
		 */
		if (is_vmalloc_addr(zone->wait_table)) {
			vfree(zone->wait_table);
			zone->wait_table = NULL;
		}
	}
}
EXPORT_SYMBOL(try_offline_node);

/**
 * remove_memory
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call, as required by
 * try_offline_node().
 */
void __ref remove_memory(int nid, u64 start, u64 size)
{
	int ret;

	BUG_ON(check_hotplug_memory_range(start, size));

	mem_hotplug_begin();

	/*
	 * All memory blocks must be offlined before removing memory.  Check
	 * whether all memory blocks in question are offline and trigger a BUG()
	 * if this is not the case.
	 */
	ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
				check_memblock_offlined_cb);
	if (ret)
		BUG();

	/* remove memmap entry */
	firmware_map_remove(start, start + size, "System RAM");
	memblock_free(start, size);
	memblock_remove(start, size);

	arch_remove_memory(start, size);

	try_offline_node(nid);

	mem_hotplug_done();
}
EXPORT_SYMBOL_GPL(remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */