/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memremap.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/compaction.h>

#include <asm/tlbflush.h>

#include "internal.h"

/*
 * online_page_callback contains a pointer to the current page onlining
 * function. Initially it is generic_online_page(). If required, it can be
 * changed by calling set_online_page_callback() for callback registration
 * and restore_online_page_callback() to restore the generic callback.
 */

static void generic_online_page(struct page *page);

static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);

/* The same as the cpu_hotplug lock, but for memory hotplug. */
static struct {
	struct task_struct *active_writer;
	struct mutex lock; /* Synchronizes accesses to refcount, */
				/*
				 * Also blocks the new readers during
				 * an ongoing mem hotplug operation.
				 */
	int refcount;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map dep_map;
#endif
} mem_hotplug = {
	.active_writer = NULL,
	.lock = __MUTEX_INITIALIZER(mem_hotplug.lock),
	.refcount = 0,
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	.dep_map = {.name = "mem_hotplug.lock" },
#endif
};

/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */
#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map)
#define memhp_lock_acquire()      lock_map_acquire(&mem_hotplug.dep_map)
#define memhp_lock_release()      lock_map_release(&mem_hotplug.dep_map)

#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
bool memhp_auto_online;
#else
bool memhp_auto_online = true;
#endif
EXPORT_SYMBOL_GPL(memhp_auto_online);

static int __init setup_memhp_default_state(char *str)
{
	if (!strcmp(str, "online"))
		memhp_auto_online = true;
	else if (!strcmp(str, "offline"))
		memhp_auto_online = false;

	return 1;
}
__setup("memhp_default_state=", setup_memhp_default_state);

void get_online_mems(void)
{
	might_sleep();
	if (mem_hotplug.active_writer == current)
		return;
	memhp_lock_acquire_read();
	mutex_lock(&mem_hotplug.lock);
	mem_hotplug.refcount++;
	mutex_unlock(&mem_hotplug.lock);

}

void put_online_mems(void)
{
	if (mem_hotplug.active_writer == current)
		return;
	mutex_lock(&mem_hotplug.lock);

	if (WARN_ON(!mem_hotplug.refcount))
		mem_hotplug.refcount++; /* try to fix things up */

	if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer))
		wake_up_process(mem_hotplug.active_writer);
	mutex_unlock(&mem_hotplug.lock);
	memhp_lock_release();

}
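/*
 * Example usage (illustrative sketch, not code taken from a caller): a reader
 * that must not race with memory hot-add/remove brackets its work with the
 * reference-counted lock above, while hotplug operations themselves use
 * mem_hotplug_begin()/mem_hotplug_done():
 *
 *	get_online_mems();
 *	... walk sections / inspect struct pages ...
 *	put_online_mems();
 *
 * Readers may run concurrently and may nest inside the writer (the
 * active_writer check); a writer in mem_hotplug_begin() sleeps until the
 * reader refcount drops to zero.
 */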
/* Serializes write accesses to mem_hotplug.active_writer. */
static DEFINE_MUTEX(memory_add_remove_lock);

void mem_hotplug_begin(void)
{
	mutex_lock(&memory_add_remove_lock);

	mem_hotplug.active_writer = current;

	memhp_lock_acquire();
	for (;;) {
		mutex_lock(&mem_hotplug.lock);
		if (likely(!mem_hotplug.refcount))
			break;
		__set_current_state(TASK_UNINTERRUPTIBLE);
		mutex_unlock(&mem_hotplug.lock);
		schedule();
	}
}

void mem_hotplug_done(void)
{
	mem_hotplug.active_writer = NULL;
	mutex_unlock(&mem_hotplug.lock);
	memhp_lock_release();
	mutex_unlock(&memory_add_remove_lock);
}

/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;
	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
	if (!res)
		return ERR_PTR(-ENOMEM);

	res->name = "System RAM";
	res->start = start;
	res->end = start + size - 1;
	res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
	if (request_resource(&iomem_resource, res) < 0) {
		pr_debug("System RAM resource %pR cannot be added\n", res);
		kfree(res);
		return ERR_PTR(-EEXIST);
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
	return;
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
void get_page_bootmem(unsigned long info, struct page *page,
		      unsigned long type)
{
	page->freelist = (void *)type;
	SetPagePrivate(page);
	set_page_private(page, info);
	page_ref_inc(page);
}

void put_page_bootmem(struct page *page)
{
	unsigned long type;

	type = (unsigned long) page->freelist;
	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);

	if (page_ref_dec_return(page) == 1) {
		page->freelist = NULL;
		ClearPagePrivate(page);
		set_page_private(page, 0);
		INIT_LIST_HEAD(&page->lru);
		free_reserved_page(page);
	}
}

#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	/* Get section's memmap address */
	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	/*
	 * Get page for the memmap's phys address
	 * XXX: need more consideration for sparse_vmemmap...
	 */
	page = virt_to_page(memmap);
	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

	/* remember memmap's page */
	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, SECTION_INFO);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);

}
#else /* CONFIG_SPARSEMEM_VMEMMAP */
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	if (!pfn_valid(start_pfn))
		return;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
{
	unsigned long i, pfn, end_pfn, nr_pages;
	int node = pgdat->node_id;
	struct page *page;

	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
	page = virt_to_page(pgdat);

	for (i = 0; i < nr_pages; i++, page++)
		get_page_bootmem(node, page, NODE_INFO);

	pfn = pgdat->node_start_pfn;
	end_pfn = pgdat_end_pfn(pgdat);

	/* register section info */
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		/*
		 * Some platforms can assign the same pfn to multiple nodes - on
		 * node0 as well as nodeN.  To avoid registering a pfn against
		 * multiple nodes, we check that this pfn does not already
		 * reside on some other node.
		 */
		if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
			register_page_bootmem_info_section(pfn);
	}
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */

static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn,
				     unsigned long end_pfn)
{
	unsigned long old_zone_end_pfn;

	zone_span_writelock(zone);

	old_zone_end_pfn = zone_end_pfn(zone);
	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
				zone->zone_start_pfn;

	zone_span_writeunlock(zone);
}

static void resize_zone(struct zone *zone, unsigned long start_pfn,
		unsigned long end_pfn)
{
	zone_span_writelock(zone);

	if (end_pfn - start_pfn) {
		zone->zone_start_pfn = start_pfn;
		zone->spanned_pages = end_pfn - start_pfn;
	} else {
		/*
		 * Keep this consistent with free_area_init_core():
		 * if spanned_pages == 0, then keep start_pfn == 0.
		 */
		zone->zone_start_pfn = 0;
		zone->spanned_pages = 0;
	}

	zone_span_writeunlock(zone);
}

static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
		unsigned long end_pfn)
{
	enum zone_type zid = zone_idx(zone);
	int nid = zone->zone_pgdat->node_id;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn++)
		set_page_links(pfn_to_page(pfn), zid, nid, pfn);
}

/*
 * Can fail with -ENOMEM from allocating a wait table with vmalloc() or
 * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic().
 */
static int __ref ensure_zone_is_initialized(struct zone *zone,
			unsigned long start_pfn, unsigned long num_pages)
{
	if (!zone_is_initialized(zone))
		return init_currently_empty_zone(zone, start_pfn, num_pages);

	return 0;
}

static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z1_start_pfn;

	ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are higher than @z2 */
	if (end_pfn > zone_end_pfn(z2))
		goto out_fail;
	/* the moved-out part must be at the leftmost of @z2 */
	if (start_pfn > z2->zone_start_pfn)
		goto out_fail;
	/* must include/overlap */
	if (end_pfn <= z2->zone_start_pfn)
		goto out_fail;

	/* use start_pfn for z1's start_pfn if z1 is empty */
	if (!zone_is_empty(z1))
		z1_start_pfn = z1->zone_start_pfn;
	else
		z1_start_pfn = start_pfn;

	resize_zone(z1, z1_start_pfn, end_pfn);
	resize_zone(z2, end_pfn, zone_end_pfn(z2));

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z1, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z2_end_pfn;

	ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are lower than @z1 */
	if (z1->zone_start_pfn > start_pfn)
		goto out_fail;
	/* the moved-out part must be at the rightmost of @z1 */
	if (zone_end_pfn(z1) > end_pfn)
		goto out_fail;
	/* must include/overlap */
	if (start_pfn >= zone_end_pfn(z1))
		goto out_fail;

	/* use end_pfn for z2's end_pfn if z2 is empty */
	if (!zone_is_empty(z2))
		z2_end_pfn = zone_end_pfn(z2);
	else
		z2_end_pfn = end_pfn;

	resize_zone(z1, z1->zone_start_pfn, start_pfn);
	resize_zone(z2, start_pfn, z2_end_pfn);

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z2, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static struct zone * __meminit move_pfn_range(int zone_shift,
		unsigned long start_pfn, unsigned long end_pfn)
{
	struct zone *zone = page_zone(pfn_to_page(start_pfn));
	int ret = 0;

	if (zone_shift < 0)
		ret = move_pfn_range_left(zone + zone_shift, zone,
					  start_pfn, end_pfn);
	else if (zone_shift)
		ret = move_pfn_range_right(zone, zone + zone_shift,
					   start_pfn, end_pfn);

	if (ret)
		return NULL;

	return zone + zone_shift;
}

static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
				      unsigned long end_pfn)
{
	unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);

	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
					pgdat->node_start_pfn;
}

static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int nid = pgdat->node_id;
	int zone_type;
	unsigned long flags, pfn;
	int ret;

	zone_type = zone - pgdat->node_zones;
	ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
	if (ret)
		return ret;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
			phys_start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
	memmap_init_zone(nr_pages, nid, zone_type,
			 phys_start_pfn, MEMMAP_HOTPLUG);

	/* online_pages_range() is called later and expects pages reserved */
	for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) {
		if (!pfn_valid(pfn))
			continue;

		SetPageReserved(pfn_to_page(pfn));
	}
	return 0;
}

static int __meminit __add_section(int nid, struct zone *zone,
				   unsigned long phys_start_pfn)
{
	int ret;

	if (pfn_valid(phys_start_pfn))
		return -EEXIST;

	ret = sparse_add_one_section(zone, phys_start_pfn);

	if (ret < 0)
		return ret;

	ret = __add_zone(zone, phys_start_pfn);

	if (ret < 0)
		return ret;

	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}
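/*
 * Illustrative sketch (an assumption, not architecture code from this file):
 * an arch's arch_add_memory() is expected to pick the target zone and then
 * hand the section-aligned range to __add_pages() below, roughly:
 *
 *	struct pglist_data *pgdat = NODE_DATA(nid);
 *	struct zone *zone = pgdat->node_zones +
 *		zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
 *
 *	return __add_pages(nid, zone, start >> PAGE_SHIFT, size >> PAGE_SHIFT);
 */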
/*
 * Reasonably generic function for adding memory.  It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
			unsigned long nr_pages)
{
	unsigned long i;
	int err = 0;
	int start_sec, end_sec;
	struct vmem_altmap *altmap;

	clear_zone_contiguous(zone);

	/* when initializing the mem_map, align the hot-added range to sections */
	start_sec = pfn_to_section_nr(phys_start_pfn);
	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

	altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn));
	if (altmap) {
		/*
		 * Validate altmap is within bounds of the total request
		 */
		if (altmap->base_pfn != phys_start_pfn
				|| vmem_altmap_offset(altmap) > nr_pages) {
			pr_warn_once("memory add fail, invalid altmap\n");
			err = -EINVAL;
			goto out;
		}
		altmap->alloc = 0;
	}

	for (i = start_sec; i <= end_sec; i++) {
		err = __add_section(nid, zone, section_nr_to_pfn(i));

		/*
		 * EEXIST is finally dealt with by the ioresource collision
		 * check; see add_memory() => register_memory_resource().
		 * A warning will be printed if there is a collision.
		 */
		if (err && (err != -EEXIST))
			break;
		err = 0;
	}
	vmemmap_populate_print_last();
out:
	set_zone_contiguous(zone);
	return err;
}
EXPORT_SYMBOL_GPL(__add_pages);

#ifdef CONFIG_MEMORY_HOTREMOVE
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static int find_smallest_section_pfn(int nid, struct zone *zone,
				     unsigned long start_pfn,
				     unsigned long end_pfn)
{
	struct mem_section *ms;

	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(start_pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(start_pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
			continue;

		return start_pfn;
	}

	return 0;
}

/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
static int find_biggest_section_pfn(int nid, struct zone *zone,
				    unsigned long start_pfn,
				    unsigned long end_pfn)
{
	struct mem_section *ms;
	unsigned long pfn;

	/* pfn is the end pfn of a memory section. */
	pfn = end_pfn - 1;
	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(pfn)))
			continue;

		return pfn;
	}

	return 0;
}

static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
			     unsigned long end_pfn)
{
	unsigned long zone_start_pfn = zone->zone_start_pfn;
	unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
	unsigned long zone_end_pfn = z;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = zone_to_nid(zone);

	zone_span_writelock(zone);
	if (zone_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the zone, we need
		 * to shrink zone->zone_start_pfn and zone->spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
						zone_end_pfn);
		if (pfn) {
			zone->zone_start_pfn = pfn;
			zone->spanned_pages = zone_end_pfn - pfn;
		}
	} else if (zone_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the zone, we need
		 * to shrink zone->spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
					       start_pfn);
		if (pfn)
			zone->spanned_pages = pfn - zone_start_pfn + 1;
	}

	/*
	 * If the section is neither the biggest nor the smallest mem_section
	 * in the zone, it only creates a hole in the zone. So in this case,
	 * we need not change the zone. But perhaps the zone now contains only
	 * holes, so check whether the zone still has any valid section.
	 */
	pfn = zone_start_pfn;
	for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (page_zone(pfn_to_page(pfn)) != zone)
			continue;

		/* If the section is the current section, continue the loop */
		if (start_pfn == pfn)
			continue;

		/* If we find a valid section, we have nothing to do */
		zone_span_writeunlock(zone);
		return;
	}

	/* The zone has no valid section */
	zone->zone_start_pfn = 0;
	zone->spanned_pages = 0;
	zone_span_writeunlock(zone);
}

static void shrink_pgdat_span(struct pglist_data *pgdat,
			      unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
	unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
	unsigned long pgdat_end_pfn = p;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = pgdat->node_id;

	if (pgdat_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the pgdat, we need
		 * to shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the pgdat.
		 */
		pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
						pgdat_end_pfn);
		if (pfn) {
			pgdat->node_start_pfn = pfn;
			pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
		}
	} else if (pgdat_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the pgdat, we need
		 * to shrink pgdat->node_spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the pgdat.
		 */
		pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
					       start_pfn);
		if (pfn)
			pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
	}

	/*
	 * If the section is neither the biggest nor the smallest mem_section
	 * in the pgdat, it only creates a hole in the pgdat. So in this case,
	 * we need not change the pgdat. But perhaps the pgdat now contains
	 * only holes, so check whether the pgdat still has any valid section.
	 */
	pfn = pgdat_start_pfn;
	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/* If the section is the current section, continue the loop */
		if (start_pfn == pfn)
			continue;

		/* If we find a valid section, we have nothing to do */
		return;
	}

	/* The pgdat has no valid section */
	pgdat->node_start_pfn = 0;
	pgdat->node_spanned_pages = 0;
}

static void __remove_zone(struct zone *zone, unsigned long start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int zone_type;
	unsigned long flags;

	zone_type = zone - pgdat->node_zones;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
}

static int __remove_section(struct zone *zone, struct mem_section *ms,
		unsigned long map_offset)
{
	unsigned long start_pfn;
	int scn_nr;
	int ret = -EINVAL;

	if (!valid_section(ms))
		return ret;

	ret = unregister_memory_section(ms);
	if (ret)
		return ret;

	scn_nr = __section_nr(ms);
	start_pfn = section_nr_to_pfn(scn_nr);
	__remove_zone(zone, start_pfn);

	sparse_remove_one_section(zone, ms, map_offset);
	return 0;
}
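/*
 * Illustrative sketch (an assumption, not architecture code from this file):
 * __remove_pages() below is the removal counterpart of __add_pages() and is
 * likewise meant to be called from an architecture's arch_remove_memory(),
 * roughly:
 *
 *	struct zone *zone = page_zone(pfn_to_page(start_pfn));
 *
 *	return __remove_pages(zone, start_pfn, size >> PAGE_SHIFT);
 *
 * Only the first removed section is passed a non-zero map_offset; it accounts
 * for memmap pages that were allocated from the removed range itself
 * (ZONE_DEVICE altmap).
 */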
/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
		 unsigned long nr_pages)
{
	unsigned long i;
	unsigned long map_offset = 0;
	int sections_to_remove, ret = 0;

	/* In the ZONE_DEVICE case the device driver owns the memory region */
	if (is_dev_zone(zone)) {
		struct page *page = pfn_to_page(phys_start_pfn);
		struct vmem_altmap *altmap;

		altmap = to_vmem_altmap((unsigned long) page);
		if (altmap)
			map_offset = vmem_altmap_offset(altmap);
	} else {
		resource_size_t start, size;

		start = phys_start_pfn << PAGE_SHIFT;
		size = nr_pages * PAGE_SIZE;

		ret = release_mem_region_adjustable(&iomem_resource, start,
					size);
		if (ret) {
			resource_size_t endres = start + size - 1;

			pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
					&start, &endres, ret);
		}
	}

	clear_zone_contiguous(zone);

	/*
	 * We can only remove entire sections
	 */
	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
	BUG_ON(nr_pages % PAGES_PER_SECTION);

	sections_to_remove = nr_pages / PAGES_PER_SECTION;
	for (i = 0; i < sections_to_remove; i++) {
		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;

		ret = __remove_section(zone, __pfn_to_section(pfn), map_offset);
		map_offset = 0;
		if (ret)
			break;
	}

	set_zone_contiguous(zone);

	return ret;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

int set_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == generic_online_page) {
		online_page_callback = callback;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(set_online_page_callback);

int restore_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == callback) {
		online_page_callback = generic_online_page;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);

void __online_page_set_limits(struct page *page)
{
}
EXPORT_SYMBOL_GPL(__online_page_set_limits);

void __online_page_increment_counters(struct page *page)
{
	adjust_managed_page_count(page, 1);
}
EXPORT_SYMBOL_GPL(__online_page_increment_counters);

void __online_page_free(struct page *page)
{
	__free_reserved_page(page);
}
EXPORT_SYMBOL_GPL(__online_page_free);

static void generic_online_page(struct page *page)
{
	__online_page_set_limits(page);
	__online_page_increment_counters(page);
	__online_page_free(page);
}

static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			void *arg)
{
	unsigned long i;
	unsigned long onlined_pages = *(unsigned long *)arg;
	struct page *page;
	if (PageReserved(pfn_to_page(start_pfn)))
		for (i = 0; i < nr_pages; i++) {
			page = pfn_to_page(start_pfn + i);
			(*online_page_callback)(page);
			onlined_pages++;
		}
	*(unsigned long *)arg = onlined_pages;
	return 0;
}
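/*
 * Note (descriptive, added for clarity): for every page of a "System RAM"
 * range being onlined, online_pages_range() above invokes the currently
 * registered online_page_callback. The default, generic_online_page(),
 * simply performs
 *
 *	__online_page_set_limits(page);
 *	__online_page_increment_counters(page);
 *	__online_page_free(page);
 *
 * i.e. it bumps the managed-page accounting and hands the formerly reserved
 * page to the page allocator. A driver (for example a memory balloon) can
 * substitute its own callback with set_online_page_callback() and later
 * restore the default with restore_online_page_callback().
 */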
#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
 * normal memory.
 */
static bool can_online_high_movable(struct zone *zone)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure every online node has NORMAL memory */
static bool can_online_high_movable(struct zone *zone)
{
	return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which states in node_states will be changed when onlining memory */
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);
	enum zone_type zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * If the memory to be onlined is in a zone of 0...zone_last, and
	 * the zones of 0...zone_last don't have memory before onlining, we
	 * will need to set the node in node_states[N_NORMAL_MEMORY] after
	 * the memory is onlined.
	 */
	if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * If the node doesn't have memory before onlining, we will need to
	 * set the node in node_states[N_MEMORY] after the memory
	 * is onlined.
	 */
	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
	else
		arg->status_change_nid = -1;
}

static void node_states_set_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_set_state(node, N_HIGH_MEMORY);

	node_set_state(node, N_MEMORY);
}

bool zone_can_shift(unsigned long pfn, unsigned long nr_pages,
		   enum zone_type target, int *zone_shift)
{
	struct zone *zone = page_zone(pfn_to_page(pfn));
	enum zone_type idx = zone_idx(zone);
	int i;

	*zone_shift = 0;

	if (idx < target) {
		/* pages must be at end of current zone */
		if (pfn + nr_pages != zone_end_pfn(zone))
			return false;

		/* no zones in use between current zone and target */
		for (i = idx + 1; i < target; i++)
			if (zone_is_initialized(zone - idx + i))
				return false;
	}

	if (target < idx) {
		/* pages must be at beginning of current zone */
		if (pfn != zone->zone_start_pfn)
			return false;

		/* no zones in use between current zone and target */
		for (i = target + 1; i < idx; i++)
			if (zone_is_initialized(zone - idx + i))
				return false;
	}

	*zone_shift = target - idx;
	return true;
}

/* Must be protected by mem_hotplug_begin() */
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
	unsigned long flags;
	unsigned long onlined_pages = 0;
	struct zone *zone;
	int need_zonelists_rebuild = 0;
	int nid;
	int ret;
	struct memory_notify arg;
	int zone_shift = 0;

	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
	 * memory_block->state_mutex.
	 */
	zone = page_zone(pfn_to_page(pfn));

	if ((zone_idx(zone) > ZONE_NORMAL ||
	    online_type == MMOP_ONLINE_MOVABLE) &&
	    !can_online_high_movable(zone))
		return -EINVAL;

	if (online_type == MMOP_ONLINE_KERNEL) {
		if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift))
			return -EINVAL;
	} else if (online_type == MMOP_ONLINE_MOVABLE) {
		if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift))
			return -EINVAL;
	}

	zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages);
	if (!zone)
		return -EINVAL;

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_online(nr_pages, zone, &arg);

	nid = zone_to_nid(zone);

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_addition;

	/*
	 * If this zone is not populated, then it is not in the zonelist.
	 * This means the page allocator ignores this zone.
	 * So, the zonelist must be updated after onlining.
	 */
	mutex_lock(&zonelists_mutex);
	if (!populated_zone(zone)) {
		need_zonelists_rebuild = 1;
		build_all_zonelists(NULL, zone);
	}

	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
		online_pages_range);
	if (ret) {
		if (need_zonelists_rebuild)
			zone_pcp_reset(zone);
		mutex_unlock(&zonelists_mutex);
		goto failed_addition;
	}

	zone->present_pages += onlined_pages;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	zone->zone_pgdat->node_present_pages += onlined_pages;
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	if (onlined_pages) {
		node_states_set_node(nid, &arg);
		if (need_zonelists_rebuild)
			build_all_zonelists(NULL, NULL);
		else
			zone_pcp_update(zone);
	}

	mutex_unlock(&zonelists_mutex);

	init_per_zone_wmark_min();

	if (onlined_pages) {
		kswapd_run(nid);
		kcompactd_run(nid);
	}

	vm_total_pages = nr_free_pagecache_pages();

	writeback_set_ratelimit();

	if (onlined_pages)
		memory_notify(MEM_ONLINE, &arg);
	return 0;

failed_addition:
	pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
		 (unsigned long long) pfn << PAGE_SHIFT,
		 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
	memory_notify(MEM_CANCEL_ONLINE, &arg);
	return ret;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

static void reset_node_present_pages(pg_data_t *pgdat)
{
	struct zone *z;

	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
		z->present_pages = 0;

	pgdat->node_present_pages = 0;
}

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
	struct pglist_data *pgdat;
	unsigned long zones_size[MAX_NR_ZONES] = {0};
	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	unsigned long start_pfn = PFN_DOWN(start);

	pgdat = NODE_DATA(nid);
	if (!pgdat) {
		pgdat = arch_alloc_nodedata(nid);
		if (!pgdat)
			return NULL;

		arch_refresh_nodedata(nid, pgdat);
	} else {
		/* Reset the nr_zones, order and classzone_idx before reuse */
		pgdat->nr_zones = 0;
		pgdat->kswapd_order = 0;
		pgdat->kswapd_classzone_idx = 0;
	}

	/* we can use NODE_DATA(nid) from here */

	/* init node's zones as empty zones, we don't have any present pages */
	free_area_init_node(nid, zones_size, start_pfn, zholes_size);
	pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);

	/*
	 * The node we allocated has no zone fallback lists. To avoid
	 * accessing a not-yet-initialized zonelist, build one here.
	 */
	mutex_lock(&zonelists_mutex);
	build_all_zonelists(pgdat, NULL);
	mutex_unlock(&zonelists_mutex);

	/*
	 * zone->managed_pages is set to an approximate value in
	 * free_area_init_core(), which would cause
	 * /sys/device/system/node/nodeX/meminfo to report wrong data.
	 * So reset it to 0 before any memory is onlined.
	 */
	reset_node_managed_pages(pgdat);

	/*
	 * When memory is hot-added, all the memory is in offline state. So
	 * clear all zones' present_pages because they will be updated in
	 * online_pages() and offline_pages().
	 */
	reset_node_present_pages(pgdat);

	return pgdat;
}

static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
	arch_refresh_nodedata(nid, NULL);
	free_percpu(pgdat->per_cpu_nodestats);
	arch_free_nodedata(pgdat);
	return;
}


/**
 * try_online_node - online a node if offlined
 *
 * called by cpu_up() to online a node without onlined memory.
 */
int try_online_node(int nid)
{
	pg_data_t *pgdat;
	int ret;

	if (node_online(nid))
		return 0;

	mem_hotplug_begin();
	pgdat = hotadd_new_pgdat(nid, 0);
	if (!pgdat) {
		pr_err("Cannot online node %d due to NULL pgdat\n", nid);
		ret = -ENOMEM;
		goto out;
	}
	node_set_online(nid);
	ret = register_one_node(nid);
	BUG_ON(ret);

	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(NULL, NULL);
		mutex_unlock(&zonelists_mutex);
	}

out:
	mem_hotplug_done();
	return ret;
}

static int check_hotplug_memory_range(u64 start, u64 size)
{
	u64 start_pfn = PFN_DOWN(start);
	u64 nr_pages = size >> PAGE_SHIFT;

	/* Memory range must be aligned with section */
	if ((start_pfn & ~PAGE_SECTION_MASK) ||
	    (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) {
		pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n",
				(unsigned long long)start,
				(unsigned long long)size);
		return -EINVAL;
	}

	return 0;
}

/*
 * If the movable zone has already been set up, newly added memory should be
 * checked. If its address is higher than the movable zone, it should be
 * added as movable. Without this check, the movable zone may overlap with
 * another zone.
 */
static int should_add_memory_movable(int nid, u64 start, u64 size)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	pg_data_t *pgdat = NODE_DATA(nid);
	struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE;

	if (zone_is_empty(movable_zone))
		return 0;

	if (movable_zone->zone_start_pfn <= start_pfn)
		return 1;

	return 0;
}

int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
		bool for_device)
{
#ifdef CONFIG_ZONE_DEVICE
	if (for_device)
		return ZONE_DEVICE;
#endif
	if (should_add_memory_movable(nid, start, size))
		return ZONE_MOVABLE;

	return zone_default;
}

static int online_memory_block(struct memory_block *mem, void *arg)
{
	return device_online(&mem->dev);
}

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
int __ref add_memory_resource(int nid, struct resource *res, bool online)
{
	u64 start, size;
	pg_data_t *pgdat = NULL;
	bool new_pgdat;
	bool new_node;
	int ret;

	start = res->start;
	size = resource_size(res);

	ret = check_hotplug_memory_range(start, size);
	if (ret)
		return ret;

	{	/* Stupid hack to suppress address-never-null warning */
		void *p = NODE_DATA(nid);
		new_pgdat = !p;
	}

	mem_hotplug_begin();

	/*
	 * Add new range to memblock so that when hotadd_new_pgdat() is called
	 * to allocate new pgdat, get_pfn_range_for_nid() will be able to find
	 * this new range and calculate total pages correctly.  The range will
	 * be removed at hot-remove time.
	 */
	memblock_add_node(start, size, nid);

	new_node = !node_online(nid);
	if (new_node) {
		pgdat = hotadd_new_pgdat(nid, start);
		ret = -ENOMEM;
		if (!pgdat)
			goto error;
	}

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size, false);

	if (ret < 0)
		goto error;

	/* we online node here. we can't roll back from here. */
	node_set_online(nid);

	if (new_node) {
		ret = register_one_node(nid);
		/*
		 * If the sysfs file of the new node can't be created, CPUs on
		 * the node can't be hot-added. There is no rollback way now,
		 * so check it with BUG_ON() to catch it reluctantly.
		 */
		BUG_ON(ret);
	}

	/* create new memmap entry */
	firmware_map_add_hotplug(start, start + size, "System RAM");

	/* online pages if requested */
	if (online)
		walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
				  NULL, online_memory_block);

	goto out;

error:
	/* rollback pgdat allocation and others */
	if (new_pgdat)
		rollback_node_hotadd(nid, pgdat);
	memblock_remove(start, size);

out:
	mem_hotplug_done();
	return ret;
}
EXPORT_SYMBOL_GPL(add_memory_resource);

int __ref add_memory(int nid, u64 start, u64 size)
{
	struct resource *res;
	int ret;

	res = register_memory_resource(start, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = add_memory_resource(nid, res, memhp_auto_online);
	if (ret < 0)
		release_memory_resource(res);
	return ret;
}
EXPORT_SYMBOL_GPL(add_memory);

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
 * set and the size of the free page is given by page_order(). Using this,
 * the function determines if the pageblock contains only free pages.
 * Due to buddy constraints, a free page at least the size of a pageblock will
 * be located at the start of the pageblock.
 */
static inline int pageblock_free(struct page *page)
{
	return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
	/* Ensure the starting page is pageblock-aligned */
	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));

	/* If the entire pageblock is free, move to the end of free page */
	if (pageblock_free(page)) {
		int order;
		/* be careful. we don't have locks, page_order can be changed.*/
		order = page_order(page);
		if ((order < MAX_ORDER) && (order >= pageblock_order))
			return page + (1 << order);
	}

	return page + pageblock_nr_pages;
}
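/*
 * Note (descriptive, added for clarity): is_mem_section_removable() below
 * only samples the first page of each pageblock; next_active_pageblock()
 * above lets it jump over an entirely free buddy chunk in one step rather
 * than advancing pageblock by pageblock. The answer is therefore a heuristic
 * ("likely to be hot-removable"), not a guarantee; offline_pages() can still
 * fail later.
 */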
/* Checks if this range of memory is likely to be hot-removable. */
bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
	struct page *page = pfn_to_page(start_pfn);
	struct page *end_page = page + nr_pages;

	/* Check the starting page of each pageblock within the range */
	for (; page < end_page; page = next_active_pageblock(page)) {
		if (!is_pageblock_removable_nolock(page))
			return false;
		cond_resched();
	}

	/* All pageblocks in the memory block are likely to be hot-removable */
	return true;
}

/*
 * Confirm all pages in a range [start, end) belong to the same zone.
 * When true, return its valid [start, end).
 */
int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
			 unsigned long *valid_start, unsigned long *valid_end)
{
	unsigned long pfn, sec_end_pfn;
	unsigned long start, end;
	struct zone *zone = NULL;
	struct page *page;
	int i;
	for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
	     pfn < end_pfn;
	     pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
		/* Make sure the memory section is present first */
		if (!present_section_nr(pfn_to_section_nr(pfn)))
			continue;
		for (; pfn < sec_end_pfn && pfn < end_pfn;
		     pfn += MAX_ORDER_NR_PAGES) {
			i = 0;
			/* This is just a CONFIG_HOLES_IN_ZONE check.*/
			while ((i < MAX_ORDER_NR_PAGES) &&
				!pfn_valid_within(pfn + i))
				i++;
			if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
				continue;
			page = pfn_to_page(pfn + i);
			if (zone && page_zone(page) != zone)
				return 0;
			if (!zone)
				start = pfn + i;
			zone = page_zone(page);
			end = pfn + MAX_ORDER_NR_PAGES;
		}
	}

	if (zone) {
		*valid_start = start;
		*valid_end = min(end, end_pfn);
		return 1;
	} else {
		return 0;
	}
}

/*
 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
 * non-lru movable pages and hugepages). We scan the pfn range because it's
 * much easier than scanning over the linked lists. This function returns the
 * pfn of the first found movable page if one is found, otherwise 0.
 */
static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
{
	unsigned long pfn;
	struct page *page;
	for (pfn = start; pfn < end; pfn++) {
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageLRU(page))
				return pfn;
			if (__PageMovable(page))
				return pfn;
			if (PageHuge(page)) {
				if (page_huge_active(page))
					return pfn;
				else
					pfn = round_up(pfn + 1,
						1 << compound_order(page)) - 1;
			}
		}
	}
	return 0;
}

static struct page *new_node_page(struct page *page, unsigned long private,
		int **result)
{
	gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
	int nid = page_to_nid(page);
	nodemask_t nmask = node_states[N_MEMORY];
	struct page *new_page = NULL;

	/*
	 * TODO: allocate a destination hugepage from the nearest neighbor
	 * node, in accordance with the memory policy of the user process if
	 * possible. For now, as a simple work-around, we use the next node
	 * for the destination.
	 */
	if (PageHuge(page))
		return alloc_huge_page_node(page_hstate(compound_head(page)),
					next_node_in(nid, nmask));

	node_clear(nid, nmask);

	if (PageHighMem(page)
	    || (zone_idx(page_zone(page)) == ZONE_MOVABLE))
		gfp_mask |= __GFP_HIGHMEM;

	if (!nodes_empty(nmask))
		new_page = __alloc_pages_nodemask(gfp_mask, 0,
					node_zonelist(nid, gfp_mask), &nmask);
	if (!new_page)
		new_page = __alloc_pages(gfp_mask, 0,
					node_zonelist(nid, gfp_mask));

	return new_page;
}

#define NR_OFFLINE_AT_ONCE_PAGES	(256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page;
	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
	int not_managed = 0;
	int ret = 0;
	LIST_HEAD(source);

	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);

		if (PageHuge(page)) {
			struct page *head = compound_head(page);
			pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
			if (compound_order(head) > PFN_SECTION_SHIFT) {
				ret = -EBUSY;
				break;
			}
			if (isolate_huge_page(page, &source))
				move_pages -= 1 << compound_order(head);
			continue;
		}

		if (!get_page_unless_zero(page))
			continue;
		/*
		 * We can skip free pages. And we can deal with pages on
		 * LRU and non-lru movable pages.
		 */
		if (PageLRU(page))
			ret = isolate_lru_page(page);
		else
			ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
		if (!ret) { /* Success */
			put_page(page);
			list_add_tail(&page->lru, &source);
			move_pages--;
			if (!__PageMovable(page))
				inc_node_page_state(page, NR_ISOLATED_ANON +
						    page_is_file_cache(page));

		} else {
#ifdef CONFIG_DEBUG_VM
			pr_alert("failed to isolate pfn %lx\n", pfn);
			dump_page(page, "isolation failed");
#endif
			put_page(page);
			/*
			 * Because we don't hold the big zone->lock across the
			 * scan, we should check this again here.
			 */
			if (page_count(page)) {
				not_managed++;
				ret = -EBUSY;
				break;
			}
		}
	}
	if (!list_empty(&source)) {
		if (not_managed) {
			putback_movable_pages(&source);
			goto out;
		}

		/* Allocate a new page from the nearest neighbor node */
		ret = migrate_pages(&source, new_node_page, NULL, 0,
					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
		if (ret)
			putback_movable_pages(&source);
	}
out:
	return ret;
}

/*
 * remove from free_area[] and mark all as Reserved.
 */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
			void *data)
{
	__offline_isolated_pages(start, start + nr_pages);
	return 0;
}

static void
offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
				offline_isolated_pages_cb);
}
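/*
 * Note (descriptive, added for clarity): after migration, __offline_pages()
 * re-checks the range with check_pages_isolated() below before committing.
 * walk_system_ram_range() restricts the check to pfns backed by "System RAM"
 * resources, and the accumulated count becomes the number of pages reported
 * as offlined.
 */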
/*
 * Check that all pages in the range, recorded as a memory resource, are
 * isolated.
 */
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
			void *data)
{
	int ret;
	long offlined = *(long *)data;
	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
	offlined = nr_pages;
	if (!ret)
		*(long *)data += offlined;
	return ret;
}

static long
check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
	long offlined = 0;
	int ret;

	ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
			check_pages_isolated_cb);
	if (ret < 0)
		offlined = (long)ret;
	return offlined;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
 * normal memory.
 */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure the node has NORMAL memory if it is still online */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt;

	for (zt = 0; zt <= ZONE_NORMAL; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	if (present_pages > nr_pages)
		return true;

	present_pages = 0;
	for (; zt <= ZONE_MOVABLE; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	/*
	 * we can't offline the last normal memory until all
	 * higher memory is offlined.
	 */
	return present_pages == 0;
}
#endif /* CONFIG_MOVABLE_NODE */

static int __init cmdline_parse_movable_node(char *p)
{
#ifdef CONFIG_MOVABLE_NODE
	movable_node_enabled = true;
#else
	pr_warn("movable_node option not supported\n");
#endif
	return 0;
}
early_param("movable_node", cmdline_parse_movable_node);

/* check which states in node_states will be changed when offlining memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
		struct zone *zone, struct memory_notify *arg)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt, zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
	 * If the memory to be offlined is in a zone of 0...zone_last,
	 * and it is the last present memory, 0...zone_last will
	 * become empty after offlining, thus we can determine that we will
	 * need to clear the node from node_states[N_NORMAL_MEMORY].
	 */
	for (zt = 0; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_normal = zone_to_nid(zone);
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_high = zone_to_nid(zone);
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
	 */
	zone_last = ZONE_MOVABLE;

	/*
	 * Check whether node_states[N_HIGH_MEMORY] will be changed.
	 * If we try to offline the last present @nr_pages from the node,
	 * we can determine that we will need to clear the node from
	 * node_states[N_HIGH_MEMORY].
	 */
	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (nr_pages >= present_pages)
		arg->status_change_nid = zone_to_nid(zone);
	else
		arg->status_change_nid = -1;
}

static void node_states_clear_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_clear_state(node, N_NORMAL_MEMORY);

	if ((N_MEMORY != N_NORMAL_MEMORY) &&
	    (arg->status_change_nid_high >= 0))
		node_clear_state(node, N_HIGH_MEMORY);

	if ((N_MEMORY != N_HIGH_MEMORY) &&
	    (arg->status_change_nid >= 0))
		node_clear_state(node, N_MEMORY);
}

static int __ref __offline_pages(unsigned long start_pfn,
		  unsigned long end_pfn, unsigned long timeout)
{
	unsigned long pfn, nr_pages, expire;
	long offlined_pages;
	int ret, drain, retry_max, node;
	unsigned long flags;
	unsigned long valid_start, valid_end;
	struct zone *zone;
	struct memory_notify arg;

	/* at least, alignment against pageblock is necessary */
	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
		return -EINVAL;
	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
		return -EINVAL;
	/*
	 * This makes hotplug much easier...and readable.
	 * We assume this for now.
	 */
	if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end))
		return -EINVAL;

	zone = page_zone(pfn_to_page(valid_start));
	node = zone_to_nid(zone);
	nr_pages = end_pfn - start_pfn;

	if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
		return -EINVAL;

	/* set above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn,
				       MIGRATE_MOVABLE, true);
	if (ret)
		return ret;

	arg.start_pfn = start_pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_offline(nr_pages, zone, &arg);

	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_removal;

	pfn = start_pfn;
	expire = jiffies + timeout;
	drain = 0;
	retry_max = 5;
repeat:
	/* start memory hot removal */
	ret = -EAGAIN;
	if (time_after(jiffies, expire))
		goto failed_removal;
	ret = -EINTR;
	if (signal_pending(current))
		goto failed_removal;
	ret = 0;
	if (drain) {
		lru_add_drain_all();
		cond_resched();
		drain_all_pages(zone);
	}

	pfn = scan_movable_pages(start_pfn, end_pfn);
	if (pfn) { /* We have movable pages */
		ret = do_migrate_range(pfn, end_pfn);
		if (!ret) {
			drain = 1;
			goto repeat;
		} else {
			if (ret < 0)
				if (--retry_max == 0)
					goto failed_removal;
			yield();
			drain = 1;
			goto repeat;
		}
	}
	/* drain all zone's lru pagevec, this is asynchronous... */
	lru_add_drain_all();
	yield();
	/* drain pcp pages, this is synchronous. */
	drain_all_pages(zone);
	/*
	 * Dissolve free hugepages in the memory block before actually doing
	 * the offlining, in order to keep hugetlbfs's object counting
	 * consistent.
	 */
	ret = dissolve_free_huge_pages(start_pfn, end_pfn);
	if (ret)
		goto failed_removal;
	/* check again */
	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
	if (offlined_pages < 0) {
		ret = -EBUSY;
		goto failed_removal;
	}
	pr_info("Offlined Pages %ld\n", offlined_pages);
	/*
	 * Ok, all of our target is isolated.
	 * We cannot do rollback at this point.
	 */
	offline_isolated_pages(start_pfn, end_pfn);
	/* reset pagetype flags and make the migrate type MOVABLE */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	/* removal success */
	adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
	zone->present_pages -= offlined_pages;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	zone->zone_pgdat->node_present_pages -= offlined_pages;
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	init_per_zone_wmark_min();

	if (!populated_zone(zone)) {
		zone_pcp_reset(zone);
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(NULL, NULL);
		mutex_unlock(&zonelists_mutex);
	} else
		zone_pcp_update(zone);

	node_states_clear_node(node, &arg);
	if (arg.status_change_nid >= 0) {
		kswapd_stop(node);
		kcompactd_stop(node);
	}

	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();

	memory_notify(MEM_OFFLINE, &arg);
	return 0;

failed_removal:
	pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n",
		 (unsigned long long) start_pfn << PAGE_SHIFT,
		 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
	memory_notify(MEM_CANCEL_OFFLINE, &arg);
	/* pushback to free area */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	return ret;
}

/* Must be protected by mem_hotplug_begin() */
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

/**
 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
 * @start_pfn: start pfn of the memory range
 * @end_pfn: end pfn of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory section walked
 *
 * This function walks through all present mem sections in range
 * [start_pfn, end_pfn) and calls func on each mem section.
 *
 * Returns the return value of func.
 */
int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
		void *arg, int (*func)(struct memory_block *, void *))
{
	struct memory_block *mem = NULL;
	struct mem_section *section;
	unsigned long pfn, section_nr;
	int ret;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		section_nr = pfn_to_section_nr(pfn);
		if (!present_section_nr(section_nr))
			continue;

		section = __nr_to_section(section_nr);
		/* same memblock? */
		if (mem)
			if ((section_nr >= mem->start_section_nr) &&
			    (section_nr <= mem->end_section_nr))
				continue;

		mem = find_memory_block_hinted(section, mem);
		if (!mem)
			continue;

		ret = func(mem, arg);
		if (ret) {
			kobject_put(&mem->dev.kobj);
			return ret;
		}
	}

	if (mem)
		kobject_put(&mem->dev.kobj);

	return 0;
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
	int ret = !is_memblock_offlined(mem);

	if (unlikely(ret)) {
		phys_addr_t beginpa, endpa;

		beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
		endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
		pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
			&beginpa, &endpa);
	}

	return ret;
}

static int check_cpu_on_node(pg_data_t *pgdat)
{
	int cpu;

	for_each_present_cpu(cpu) {
		if (cpu_to_node(cpu) == pgdat->node_id)
			/*
			 * A cpu on this node isn't removed, so we can't
			 * offline this node.
			 */
			return -EBUSY;
	}

	return 0;
}

static void unmap_cpu_on_node(pg_data_t *pgdat)
{
#ifdef CONFIG_ACPI_NUMA
	int cpu;

	for_each_possible_cpu(cpu)
		if (cpu_to_node(cpu) == pgdat->node_id)
			numa_clear_node(cpu);
#endif
}

static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
{
	int ret;

	ret = check_cpu_on_node(pgdat);
	if (ret)
		return ret;

	/*
	 * The node will be offlined when we come here, so we can clear
	 * the cpu_to_node() mapping now.
	 */

	unmap_cpu_on_node(pgdat);
	return 0;
}

/**
 * try_offline_node
 *
 * Offline a node if all memory sections and cpus of the node are removed.
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call.
 */
void try_offline_node(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	unsigned long start_pfn = pgdat->node_start_pfn;
	unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);

		if (!present_section_nr(section_nr))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/*
		 * Some memory sections of this node are not removed, so we
		 * can't offline the node now.
		 */
		return;
	}

	if (check_and_unmap_cpu_on_node(pgdat))
		return;

	/*
	 * All memory/cpus of this node have been removed, so we can offline
	 * this node now.
	 */
	node_set_offline(nid);
	unregister_one_node(nid);
}
EXPORT_SYMBOL(try_offline_node);

/**
 * remove_memory
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call, as required by
 * try_offline_node().
 */
void __ref remove_memory(int nid, u64 start, u64 size)
{
	int ret;

	BUG_ON(check_hotplug_memory_range(start, size));

	mem_hotplug_begin();

	/*
	 * All memory blocks must be offlined before removing memory.  Check
	 * whether all memory blocks in question are offline and trigger a BUG()
	 * if this is not the case.
	 */
	ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
				check_memblock_offlined_cb);
	if (ret)
		BUG();

	/* remove memmap entry */
	firmware_map_remove(start, start + size, "System RAM");
	memblock_free(start, size);
	memblock_remove(start, size);

	arch_remove_memory(start, size);

	try_offline_node(nid);

	mem_hotplug_done();
}
EXPORT_SYMBOL_GPL(remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */