/*
 * linux/mm/memory_hotplug.c
 *
 * Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memremap.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/compaction.h>

#include <asm/tlbflush.h>

#include "internal.h"

/*
 * online_page_callback contains a pointer to the current page onlining
 * function. Initially it is generic_online_page(). If required, it can be
 * changed by calling set_online_page_callback() to register a callback and
 * restore_online_page_callback() to restore the generic callback.
 */

static void generic_online_page(struct page *page);

static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);

/* The same as the cpu_hotplug lock, but for memory hotplug. */
static struct {
	struct task_struct *active_writer;
	struct mutex lock; /* Synchronizes accesses to refcount, */
	/*
	 * Also blocks the new readers during
	 * an ongoing mem hotplug operation.
	 */
	int refcount;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map dep_map;
#endif
} mem_hotplug = {
	.active_writer = NULL,
	.lock = __MUTEX_INITIALIZER(mem_hotplug.lock),
	.refcount = 0,
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	.dep_map = {.name = "mem_hotplug.lock" },
#endif
};

/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */
#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map)
#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)

#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
bool memhp_auto_online;
#else
bool memhp_auto_online = true;
#endif
EXPORT_SYMBOL_GPL(memhp_auto_online);

static int __init setup_memhp_default_state(char *str)
{
	if (!strcmp(str, "online"))
		memhp_auto_online = true;
	else if (!strcmp(str, "offline"))
		memhp_auto_online = false;

	return 1;
}
__setup("memhp_default_state=", setup_memhp_default_state);

void get_online_mems(void)
{
	might_sleep();
	if (mem_hotplug.active_writer == current)
		return;
	memhp_lock_acquire_read();
	mutex_lock(&mem_hotplug.lock);
	mem_hotplug.refcount++;
	mutex_unlock(&mem_hotplug.lock);

}

void put_online_mems(void)
{
	if (mem_hotplug.active_writer == current)
		return;
	mutex_lock(&mem_hotplug.lock);

	if (WARN_ON(!mem_hotplug.refcount))
		mem_hotplug.refcount++; /* try to fix things up */

	if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer))
		wake_up_process(mem_hotplug.active_writer);
	mutex_unlock(&mem_hotplug.lock);
	memhp_lock_release();

}

void mem_hotplug_begin(void)
{
	assert_held_device_hotplug();

	mem_hotplug.active_writer = current;

	memhp_lock_acquire();
	for (;;) {
		mutex_lock(&mem_hotplug.lock);
		if (likely(!mem_hotplug.refcount))
			break;
		__set_current_state(TASK_UNINTERRUPTIBLE);
		mutex_unlock(&mem_hotplug.lock);
		schedule();
	}
}

void mem_hotplug_done(void)
{
	mem_hotplug.active_writer = NULL;
	mutex_unlock(&mem_hotplug.lock);
	memhp_lock_release();
}

/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;
	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
	if (!res)
		return ERR_PTR(-ENOMEM);

	res->name = "System RAM";
	res->start = start;
	res->end = start + size - 1;
	res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
	if (request_resource(&iomem_resource, res) < 0) {
		pr_debug("System RAM resource %pR cannot be added\n", res);
		kfree(res);
		return ERR_PTR(-EEXIST);
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
	return;
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
void get_page_bootmem(unsigned long info, struct page *page,
		      unsigned long type)
{
	page->freelist = (void *)type;
	SetPagePrivate(page);
	set_page_private(page, info);
	page_ref_inc(page);
}

void put_page_bootmem(struct page *page)
{
	unsigned long type;

	type = (unsigned long) page->freelist;
	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);

	if (page_ref_dec_return(page) == 1) {
		page->freelist = NULL;
		ClearPagePrivate(page);
		set_page_private(page, 0);
		INIT_LIST_HEAD(&page->lru);
		free_reserved_page(page);
	}
}

#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	/* Get section's memmap address */
	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	/*
	 * Get page for the memmap's phys address
	 * XXX: need more consideration for sparse_vmemmap...
	 */
	page = virt_to_page(memmap);
	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

	/* remember memmap's page */
	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, SECTION_INFO);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);

}
#else /* CONFIG_SPARSEMEM_VMEMMAP */
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	if (!pfn_valid(start_pfn))
		return;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
{
	unsigned long i, pfn, end_pfn, nr_pages;
	int node = pgdat->node_id;
	struct page *page;

	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
	page = virt_to_page(pgdat);

	for (i = 0; i < nr_pages; i++, page++)
		get_page_bootmem(node, page, NODE_INFO);

	pfn = pgdat->node_start_pfn;
	end_pfn = pgdat_end_pfn(pgdat);

	/* register section info */
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		/*
		 * Some platforms can assign the same pfn to multiple nodes - on
		 * node0 as well as nodeN. To avoid registering a pfn against
		 * multiple nodes we check that this pfn does not already
		 * reside in some other nodes.
		 */
		if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
			register_page_bootmem_info_section(pfn);
	}
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */

static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn,
				     unsigned long end_pfn)
{
	unsigned long old_zone_end_pfn;

	zone_span_writelock(zone);

	old_zone_end_pfn = zone_end_pfn(zone);
	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
				zone->zone_start_pfn;

	zone_span_writeunlock(zone);
}

static void resize_zone(struct zone *zone, unsigned long start_pfn,
			unsigned long end_pfn)
{
	zone_span_writelock(zone);

	if (end_pfn - start_pfn) {
		zone->zone_start_pfn = start_pfn;
		zone->spanned_pages = end_pfn - start_pfn;
	} else {
		/*
		 * Make it consistent with free_area_init_core():
		 * if spanned_pages == 0, then keep zone_start_pfn == 0.
		 */
		zone->zone_start_pfn = 0;
		zone->spanned_pages = 0;
	}

	zone_span_writeunlock(zone);
}

static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
			unsigned long end_pfn)
{
	enum zone_type zid = zone_idx(zone);
	int nid = zone->zone_pgdat->node_id;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn++)
		set_page_links(pfn_to_page(pfn), zid, nid, pfn);
}

/*
 * Can fail with -ENOMEM from allocating a wait table with vmalloc() or
 * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic().
 */
static int __ref ensure_zone_is_initialized(struct zone *zone,
			unsigned long start_pfn, unsigned long num_pages)
{
	if (!zone_is_initialized(zone))
		return init_currently_empty_zone(zone, start_pfn, num_pages);

	return 0;
}

static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z1_start_pfn;

	ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are higher than @z2 */
	if (end_pfn > zone_end_pfn(z2))
		goto out_fail;
	/* the moved-out part must be at the leftmost of @z2 */
	if (start_pfn > z2->zone_start_pfn)
		goto out_fail;
	/* must include/overlap */
	if (end_pfn <= z2->zone_start_pfn)
		goto out_fail;

	/* use start_pfn for z1's start_pfn if z1 is empty */
	if (!zone_is_empty(z1))
		z1_start_pfn = z1->zone_start_pfn;
	else
		z1_start_pfn = start_pfn;

	resize_zone(z1, z1_start_pfn, end_pfn);
	resize_zone(z2, end_pfn, zone_end_pfn(z2));

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z1, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z2_end_pfn;

	ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are lower than @z1 */
	if (z1->zone_start_pfn > start_pfn)
		goto out_fail;
	/* the moved-out part must be at the rightmost of @z1 */
	if (zone_end_pfn(z1) > end_pfn)
		goto out_fail;
	/* must include/overlap */
	if (start_pfn >= zone_end_pfn(z1))
		goto out_fail;

	/* use end_pfn for z2's end_pfn if z2 is empty */
	if (!zone_is_empty(z2))
		z2_end_pfn = zone_end_pfn(z2);
	else
		z2_end_pfn = end_pfn;

	resize_zone(z1, z1->zone_start_pfn, start_pfn);
	resize_zone(z2, start_pfn, z2_end_pfn);

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z2, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static struct zone * __meminit move_pfn_range(int zone_shift,
		unsigned long start_pfn, unsigned long end_pfn)
{
	struct zone *zone = page_zone(pfn_to_page(start_pfn));
	int ret = 0;

	if (zone_shift < 0)
		ret = move_pfn_range_left(zone + zone_shift, zone,
					  start_pfn, end_pfn);
	else if (zone_shift)
		ret = move_pfn_range_right(zone, zone + zone_shift,
					   start_pfn, end_pfn);

	if (ret)
		return NULL;

	return zone + zone_shift;
}

static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
				      unsigned long end_pfn)
{
	unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);

	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
					pgdat->node_start_pfn;
}

static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int nid = pgdat->node_id;
	int zone_type;
	unsigned long flags, pfn;
	int ret;

	zone_type = zone - pgdat->node_zones;
	ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
	if (ret)
		return ret;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
			phys_start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
	memmap_init_zone(nr_pages, nid, zone_type,
			 phys_start_pfn, MEMMAP_HOTPLUG);

	/* online_page_range is called later and expects pages reserved */
	for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) {
		if (!pfn_valid(pfn))
			continue;

		SetPageReserved(pfn_to_page(pfn));
	}
	return 0;
}

static int __meminit __add_section(int nid, struct zone *zone,
				   unsigned long phys_start_pfn)
{
	int ret;

	if (pfn_valid(phys_start_pfn))
		return -EEXIST;

	ret = sparse_add_one_section(zone, phys_start_pfn);

	if (ret < 0)
		return ret;

	ret = __add_zone(zone, phys_start_pfn);

	if (ret < 0)
		return ret;

	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}

/*
 * Reasonably generic function for adding memory. It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
			unsigned long nr_pages)
{
	unsigned long i;
	int err = 0;
	int start_sec, end_sec;
	struct vmem_altmap *altmap;

	clear_zone_contiguous(zone);

	/* during initialize mem_map, align hot-added range to section */
	start_sec = pfn_to_section_nr(phys_start_pfn);
	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

	altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn));
	if (altmap) {
		/*
		 * Validate altmap is within bounds of the total request
		 */
		if (altmap->base_pfn != phys_start_pfn
				|| vmem_altmap_offset(altmap) > nr_pages) {
			pr_warn_once("memory add fail, invalid altmap\n");
			err = -EINVAL;
			goto out;
		}
		altmap->alloc = 0;
	}

	for (i = start_sec; i <= end_sec; i++) {
		err = __add_section(nid, zone, section_nr_to_pfn(i));

		/*
		 * EEXIST is finally dealt with by the ioresource collision
		 * check, see add_memory() => register_memory_resource().
		 * A warning will be printed if there is a collision.
		 */
		if (err && (err != -EEXIST))
			break;
		err = 0;
	}
	vmemmap_populate_print_last();
out:
	set_zone_contiguous(zone);
	return err;
}
EXPORT_SYMBOL_GPL(__add_pages);

#ifdef CONFIG_MEMORY_HOTREMOVE
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static int find_smallest_section_pfn(int nid, struct zone *zone,
				     unsigned long start_pfn,
				     unsigned long end_pfn)
{
	struct mem_section *ms;

	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(start_pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(start_pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
			continue;

		return start_pfn;
	}

	return 0;
}

/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
static int find_biggest_section_pfn(int nid, struct zone *zone,
				    unsigned long start_pfn,
				    unsigned long end_pfn)
{
	struct mem_section *ms;
	unsigned long pfn;

	/* pfn is the end pfn of a memory section. */
	pfn = end_pfn - 1;
	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(pfn)))
			continue;

		return pfn;
	}

	return 0;
}

static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
			     unsigned long end_pfn)
{
	unsigned long zone_start_pfn = zone->zone_start_pfn;
	unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
	unsigned long zone_end_pfn = z;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = zone_to_nid(zone);

	zone_span_writelock(zone);
	if (zone_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the zone, it needs
		 * to shrink zone->zone_start_pfn and zone->spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
						zone_end_pfn);
		if (pfn) {
			zone->zone_start_pfn = pfn;
			zone->spanned_pages = zone_end_pfn - pfn;
		}
	} else if (zone_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the zone, it needs
		 * to shrink zone->spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
					       start_pfn);
		if (pfn)
			zone->spanned_pages = pfn - zone_start_pfn + 1;
	}

	/*
	 * If the section is not the biggest or smallest mem_section in the
	 * zone, it only creates a hole in the zone. So in this case, we need
	 * not change the zone. But perhaps the zone now contains only holes,
	 * so check whether it still has any valid section.
	 */
	pfn = zone_start_pfn;
	for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (page_zone(pfn_to_page(pfn)) != zone)
			continue;

		/* If the section is the current section, continue the loop */
		if (start_pfn == pfn)
			continue;

		/* If we find a valid section, we have nothing to do */
		zone_span_writeunlock(zone);
		return;
	}

	/* The zone has no valid section */
	zone->zone_start_pfn = 0;
	zone->spanned_pages = 0;
	zone_span_writeunlock(zone);
}

static void shrink_pgdat_span(struct pglist_data *pgdat,
			      unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
	unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
	unsigned long pgdat_end_pfn = p;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = pgdat->node_id;

	if (pgdat_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the pgdat, it needs
		 * to shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the pgdat.
		 */
		pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
						pgdat_end_pfn);
		if (pfn) {
			pgdat->node_start_pfn = pfn;
			pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
		}
	} else if (pgdat_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the pgdat, it needs
		 * to shrink pgdat->node_spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the pgdat.
		 */
		pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
					       start_pfn);
		if (pfn)
			pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
	}

	/*
	 * If the section is not the biggest or smallest mem_section in the
	 * pgdat, it only creates a hole in the pgdat. So in this case, we
	 * need not change the pgdat. But perhaps the pgdat now contains only
	 * holes, so check whether it still has any valid section.
	 */
	pfn = pgdat_start_pfn;
	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/* If the section is the current section, continue the loop */
		if (start_pfn == pfn)
			continue;

		/* If we find a valid section, we have nothing to do */
		return;
	}

	/* The pgdat has no valid section */
	pgdat->node_start_pfn = 0;
	pgdat->node_spanned_pages = 0;
}

static void __remove_zone(struct zone *zone, unsigned long start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int zone_type;
	unsigned long flags;

	zone_type = zone - pgdat->node_zones;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
}

static int __remove_section(struct zone *zone, struct mem_section *ms,
		unsigned long map_offset)
{
	unsigned long start_pfn;
	int scn_nr;
	int ret = -EINVAL;

	if (!valid_section(ms))
		return ret;

	ret = unregister_memory_section(ms);
	if (ret)
		return ret;

	scn_nr = __section_nr(ms);
	start_pfn = section_nr_to_pfn(scn_nr);
	__remove_zone(zone, start_pfn);

	sparse_remove_one_section(zone, ms, map_offset);
	return 0;
}

/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
		 unsigned long nr_pages)
{
	unsigned long i;
	unsigned long map_offset = 0;
	int sections_to_remove, ret = 0;

	/* In the ZONE_DEVICE case device driver owns the memory region */
	if (is_dev_zone(zone)) {
		struct page *page = pfn_to_page(phys_start_pfn);
		struct vmem_altmap *altmap;

		altmap = to_vmem_altmap((unsigned long) page);
		if (altmap)
			map_offset = vmem_altmap_offset(altmap);
	} else {
		resource_size_t start, size;

		start = phys_start_pfn << PAGE_SHIFT;
		size = nr_pages * PAGE_SIZE;

		ret = release_mem_region_adjustable(&iomem_resource, start,
					size);
		if (ret) {
			resource_size_t endres = start + size - 1;

			pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
					&start, &endres, ret);
		}
	}

	clear_zone_contiguous(zone);

	/*
	 * We can only remove entire sections
	 */
	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
	BUG_ON(nr_pages % PAGES_PER_SECTION);

	sections_to_remove = nr_pages / PAGES_PER_SECTION;
	for (i = 0; i < sections_to_remove; i++) {
		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;

		ret = __remove_section(zone, __pfn_to_section(pfn), map_offset);
		map_offset = 0;
		if (ret)
			break;
	}

	set_zone_contiguous(zone);

	return ret;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

int set_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == generic_online_page) {
		online_page_callback = callback;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(set_online_page_callback);

int restore_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == callback) {
		online_page_callback = generic_online_page;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);

void __online_page_set_limits(struct page *page)
{
}
EXPORT_SYMBOL_GPL(__online_page_set_limits);

void __online_page_increment_counters(struct page *page)
{
	adjust_managed_page_count(page, 1);
}
EXPORT_SYMBOL_GPL(__online_page_increment_counters);

void __online_page_free(struct page *page)
{
	__free_reserved_page(page);
}
EXPORT_SYMBOL_GPL(__online_page_free);

static void generic_online_page(struct page *page)
{
	__online_page_set_limits(page);
	__online_page_increment_counters(page);
	__online_page_free(page);
}

static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			void *arg)
{
	unsigned long i;
	unsigned long onlined_pages = *(unsigned long *)arg;
	struct page *page;
	if (PageReserved(pfn_to_page(start_pfn)))
		for (i = 0; i < nr_pages; i++) {
			page = pfn_to_page(start_pfn + i);
			(*online_page_callback)(page);
			onlined_pages++;
		}
	*(unsigned long *)arg = onlined_pages;
	return 0;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
 * normal memory.
 */
static bool can_online_high_movable(struct zone *zone)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure every online node has NORMAL memory */
static bool can_online_high_movable(struct zone *zone)
{
	return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which state of node_states will be changed when online memory */
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);
	enum zone_type zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * If the memory to be onlined is in a zone of 0...zone_last, and
	 * the zones of 0...zone_last don't have memory before onlining, we
	 * will need to set the node in node_states[N_NORMAL_MEMORY] after
	 * the memory is onlined.
	 */
	if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * If the node doesn't have memory before onlining, we will need to
	 * set the node in node_states[N_MEMORY] after the memory is onlined.
	 */
	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
	else
		arg->status_change_nid = -1;
}

static void node_states_set_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_set_state(node, N_HIGH_MEMORY);

	node_set_state(node, N_MEMORY);
}

bool zone_can_shift(unsigned long pfn, unsigned long nr_pages,
		   enum zone_type target, int *zone_shift)
{
	struct zone *zone = page_zone(pfn_to_page(pfn));
	enum zone_type idx = zone_idx(zone);
	int i;

	*zone_shift = 0;

	if (idx < target) {
		/* pages must be at end of current zone */
		if (pfn + nr_pages != zone_end_pfn(zone))
			return false;

		/* no zones in use between current zone and target */
		for (i = idx + 1; i < target; i++)
			if (zone_is_initialized(zone - idx + i))
				return false;
	}

	if (target < idx) {
		/* pages must be at beginning of current zone */
		if (pfn != zone->zone_start_pfn)
			return false;

		/* no zones in use between current zone and target */
		for (i = target + 1; i < idx; i++)
			if (zone_is_initialized(zone - idx + i))
				return false;
	}

	*zone_shift = target - idx;
	return true;
}

/* Must be protected by mem_hotplug_begin() */
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
	unsigned long flags;
	unsigned long onlined_pages = 0;
	struct zone *zone;
	int need_zonelists_rebuild = 0;
	int nid;
	int ret;
	struct memory_notify arg;
	int zone_shift = 0;

	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
	 * memory_block->state_mutex.
	 */
	zone = page_zone(pfn_to_page(pfn));

	if ((zone_idx(zone) > ZONE_NORMAL ||
	    online_type == MMOP_ONLINE_MOVABLE) &&
	    !can_online_high_movable(zone))
		return -EINVAL;

	if (online_type == MMOP_ONLINE_KERNEL) {
		if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift))
			return -EINVAL;
	} else if (online_type == MMOP_ONLINE_MOVABLE) {
		if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift))
			return -EINVAL;
	}

	zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages);
	if (!zone)
		return -EINVAL;

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_online(nr_pages, zone, &arg);

	nid = zone_to_nid(zone);

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_addition;

	/*
	 * If this zone is not populated, then it is not in zonelist.
	 * This means the page allocator ignores this zone.
	 * So, zonelist must be updated after online.
	 */
	mutex_lock(&zonelists_mutex);
	if (!populated_zone(zone)) {
		need_zonelists_rebuild = 1;
		build_all_zonelists(NULL, zone);
	}

	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
		online_pages_range);
	if (ret) {
		if (need_zonelists_rebuild)
			zone_pcp_reset(zone);
		mutex_unlock(&zonelists_mutex);
		goto failed_addition;
	}

	zone->present_pages += onlined_pages;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	zone->zone_pgdat->node_present_pages += onlined_pages;
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	if (onlined_pages) {
		node_states_set_node(nid, &arg);
		if (need_zonelists_rebuild)
			build_all_zonelists(NULL, NULL);
		else
			zone_pcp_update(zone);
	}

	mutex_unlock(&zonelists_mutex);

	init_per_zone_wmark_min();

	if (onlined_pages) {
		kswapd_run(nid);
		kcompactd_run(nid);
	}

	vm_total_pages = nr_free_pagecache_pages();

	writeback_set_ratelimit();

	if (onlined_pages)
		memory_notify(MEM_ONLINE, &arg);
	return 0;

failed_addition:
	pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
		 (unsigned long long) pfn << PAGE_SHIFT,
		 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
	memory_notify(MEM_CANCEL_ONLINE, &arg);
	return ret;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

static void reset_node_present_pages(pg_data_t *pgdat)
{
	struct zone *z;

	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
		z->present_pages = 0;

	pgdat->node_present_pages = 0;
}

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
	struct pglist_data *pgdat;
	unsigned long zones_size[MAX_NR_ZONES] = {0};
	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	unsigned long start_pfn = PFN_DOWN(start);

	pgdat = NODE_DATA(nid);
	if (!pgdat) {
		pgdat = arch_alloc_nodedata(nid);
		if (!pgdat)
			return NULL;

		arch_refresh_nodedata(nid, pgdat);
	} else {
		/* Reset the nr_zones, order and classzone_idx before reuse */
		pgdat->nr_zones = 0;
		pgdat->kswapd_order = 0;
		pgdat->kswapd_classzone_idx = 0;
	}

	/* we can use NODE_DATA(nid) from here */

	/* init node's zones as empty zones, we don't have any present pages.*/
	free_area_init_node(nid, zones_size, start_pfn, zholes_size);
	pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);

	/*
	 * The node we allocated has no zone fallback lists. To avoid
	 * accessing an uninitialized zonelist, build it here.
	 */
	mutex_lock(&zonelists_mutex);
	build_all_zonelists(pgdat, NULL);
	mutex_unlock(&zonelists_mutex);

	/*
	 * zone->managed_pages is set to an approximate value in
	 * free_area_init_core(), which will cause
	 * /sys/device/system/node/nodeX/meminfo to report wrong data.
	 * So reset it to 0 before any memory is onlined.
	 */
	reset_node_managed_pages(pgdat);

	/*
	 * When memory is hot-added, all the memory is in offline state. So
	 * clear all zones' present_pages because they will be updated in
	 * online_pages() and offline_pages().
	 */
	reset_node_present_pages(pgdat);

	return pgdat;
}

static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
	arch_refresh_nodedata(nid, NULL);
	free_percpu(pgdat->per_cpu_nodestats);
	arch_free_nodedata(pgdat);
	return;
}


/**
 * try_online_node - online a node if offlined
 *
 * called by cpu_up() to online a node without onlined memory.
 */
int try_online_node(int nid)
{
	pg_data_t *pgdat;
	int ret;

	if (node_online(nid))
		return 0;

	mem_hotplug_begin();
	pgdat = hotadd_new_pgdat(nid, 0);
	if (!pgdat) {
		pr_err("Cannot online node %d due to NULL pgdat\n", nid);
		ret = -ENOMEM;
		goto out;
	}
	node_set_online(nid);
	ret = register_one_node(nid);
	BUG_ON(ret);

	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(NULL, NULL);
		mutex_unlock(&zonelists_mutex);
	}

out:
	mem_hotplug_done();
	return ret;
}

static int check_hotplug_memory_range(u64 start, u64 size)
{
	u64 start_pfn = PFN_DOWN(start);
	u64 nr_pages = size >> PAGE_SHIFT;

	/* Memory range must be aligned with section */
	if ((start_pfn & ~PAGE_SECTION_MASK) ||
	    (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) {
		pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n",
		       (unsigned long long)start,
		       (unsigned long long)size);
		return -EINVAL;
	}

	return 0;
}

/*
 * If the movable zone has already been set up, newly added memory should be
 * checked: if its address is higher than the movable zone, it should be added
 * as movable. Without this check, the movable zone may overlap with another
 * zone.
 */
static int should_add_memory_movable(int nid, u64 start, u64 size)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	pg_data_t *pgdat = NODE_DATA(nid);
	struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE;

	if (zone_is_empty(movable_zone))
		return 0;

	if (movable_zone->zone_start_pfn <= start_pfn)
		return 1;

	return 0;
}

int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
		bool for_device)
{
#ifdef CONFIG_ZONE_DEVICE
	if (for_device)
		return ZONE_DEVICE;
#endif
	if (should_add_memory_movable(nid, start, size))
		return ZONE_MOVABLE;

	return zone_default;
}

static int online_memory_block(struct memory_block *mem, void *arg)
{
	return device_online(&mem->dev);
}

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
int __ref add_memory_resource(int nid, struct resource *res, bool online)
{
	u64 start, size;
	pg_data_t *pgdat = NULL;
	bool new_pgdat;
	bool new_node;
	int ret;

	start = res->start;
	size = resource_size(res);

	ret = check_hotplug_memory_range(start, size);
	if (ret)
		return ret;

	{	/* Stupid hack to suppress address-never-null warning */
		void *p = NODE_DATA(nid);
		new_pgdat = !p;
	}

	mem_hotplug_begin();

	/*
	 * Add new range to memblock so that when hotadd_new_pgdat() is called
	 * to allocate new pgdat, get_pfn_range_for_nid() will be able to find
	 * this new range and calculate total pages correctly. The range will
	 * be removed at hot-remove time.
	 */
	memblock_add_node(start, size, nid);

	new_node = !node_online(nid);
	if (new_node) {
		pgdat = hotadd_new_pgdat(nid, start);
		ret = -ENOMEM;
		if (!pgdat)
			goto error;
	}

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size, false);

	if (ret < 0)
		goto error;

	/* we online node here. we can't roll back from here. */
	node_set_online(nid);

	if (new_node) {
		ret = register_one_node(nid);
		/*
		 * If the sysfs file of the new node can't be created, cpus on
		 * the node can't be hot-added. There is no rollback way now,
		 * so check it with BUG_ON() to catch it reluctantly..
		 */
		BUG_ON(ret);
	}

	/* create new memmap entry */
	firmware_map_add_hotplug(start, start + size, "System RAM");

	/* online pages if requested */
	if (online)
		walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
				  NULL, online_memory_block);

	goto out;

error:
	/* rollback pgdat allocation and others */
	if (new_pgdat)
		rollback_node_hotadd(nid, pgdat);
	memblock_remove(start, size);

out:
	mem_hotplug_done();
	return ret;
}
EXPORT_SYMBOL_GPL(add_memory_resource);

int __ref add_memory(int nid, u64 start, u64 size)
{
	struct resource *res;
	int ret;

	res = register_memory_resource(start, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = add_memory_resource(nid, res, memhp_auto_online);
	if (ret < 0)
		release_memory_resource(res);
	return ret;
}
EXPORT_SYMBOL_GPL(add_memory);

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
 * set and the size of the free page is given by page_order(). Using this,
 * the function determines if the pageblock contains only free pages.
 * Due to buddy constraints, a free page at least the size of a pageblock will
 * be located at the start of the pageblock.
 */
static inline int pageblock_free(struct page *page)
{
	return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
	/* Ensure the starting page is pageblock-aligned */
	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));

	/* If the entire pageblock is free, move to the end of free page */
	if (pageblock_free(page)) {
		int order;
		/* be careful. we don't have locks, page_order can be changed.*/
		order = page_order(page);
		if ((order < MAX_ORDER) && (order >= pageblock_order))
			return page + (1 << order);
	}

	return page + pageblock_nr_pages;
}

/* Checks if this range of memory is likely to be hot-removable. */
bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
	struct page *page = pfn_to_page(start_pfn);
	struct page *end_page = page + nr_pages;

	/* Check the starting page of each pageblock within the range */
	for (; page < end_page; page = next_active_pageblock(page)) {
		if (!is_pageblock_removable_nolock(page))
			return false;
		cond_resched();
	}

	/* All pageblocks in the memory block are likely to be hot-removable */
	return true;
}

/*
 * Confirm all pages in a range [start, end) belong to the same zone.
 * When true, return its valid [start, end).
 */
int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
			 unsigned long *valid_start, unsigned long *valid_end)
{
	unsigned long pfn, sec_end_pfn;
	unsigned long start, end;
	struct zone *zone = NULL;
	struct page *page;
	int i;
	for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
	     pfn < end_pfn;
	     pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
		/* Make sure the memory section is present first */
		if (!present_section_nr(pfn_to_section_nr(pfn)))
			continue;
		for (; pfn < sec_end_pfn && pfn < end_pfn;
		     pfn += MAX_ORDER_NR_PAGES) {
			i = 0;
			/* This is just a CONFIG_HOLES_IN_ZONE check.*/
			while ((i < MAX_ORDER_NR_PAGES) &&
				!pfn_valid_within(pfn + i))
				i++;
			if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
				continue;
			page = pfn_to_page(pfn + i);
			if (zone && page_zone(page) != zone)
				return 0;
			if (!zone)
				start = pfn + i;
			zone = page_zone(page);
			end = pfn + MAX_ORDER_NR_PAGES;
		}
	}

	if (zone) {
		*valid_start = start;
		*valid_end = min(end, end_pfn);
		return 1;
	} else {
		return 0;
	}
}

/*
 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
 * non-lru movable pages and hugepages). We scan pfn because it's much
 * easier than scanning over linked list. This function returns the pfn
 * of the first found movable page if it's found, otherwise 0.
 */
static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
{
	unsigned long pfn;
	struct page *page;
	for (pfn = start; pfn < end; pfn++) {
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageLRU(page))
				return pfn;
			if (__PageMovable(page))
				return pfn;
			if (PageHuge(page)) {
				if (page_huge_active(page))
					return pfn;
				else
					pfn = round_up(pfn + 1,
						1 << compound_order(page)) - 1;
			}
		}
	}
	return 0;
}

static struct page *new_node_page(struct page *page, unsigned long private,
		int **result)
{
	gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
	int nid = page_to_nid(page);
	nodemask_t nmask = node_states[N_MEMORY];
	struct page *new_page = NULL;

	/*
	 * TODO: allocate a destination hugepage from a nearest neighbor node,
	 * in accordance with the memory policy of the user process if
	 * possible. For now, as a simple work-around, we use the next node
	 * for the destination.
	 */
	if (PageHuge(page))
		return alloc_huge_page_node(page_hstate(compound_head(page)),
					next_node_in(nid, nmask));

	node_clear(nid, nmask);

	if (PageHighMem(page)
	    || (zone_idx(page_zone(page)) == ZONE_MOVABLE))
		gfp_mask |= __GFP_HIGHMEM;

	if (!nodes_empty(nmask))
		new_page = __alloc_pages_nodemask(gfp_mask, 0,
					node_zonelist(nid, gfp_mask), &nmask);
	if (!new_page)
		new_page = __alloc_pages(gfp_mask, 0,
					node_zonelist(nid, gfp_mask));

	return new_page;
}

#define NR_OFFLINE_AT_ONCE_PAGES	(256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page;
	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
	int not_managed = 0;
	int ret = 0;
	LIST_HEAD(source);

	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);

		if (PageHuge(page)) {
			struct page *head = compound_head(page);
			pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
			if (compound_order(head) > PFN_SECTION_SHIFT) {
				ret = -EBUSY;
				break;
			}
			if (isolate_huge_page(page, &source))
				move_pages -= 1 << compound_order(head);
			continue;
		}

		if (!get_page_unless_zero(page))
			continue;
		/*
		 * We can skip free pages. And we can deal with pages on
		 * LRU and non-lru movable pages.
		 */
		if (PageLRU(page))
			ret = isolate_lru_page(page);
		else
			ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
		if (!ret) { /* Success */
			put_page(page);
			list_add_tail(&page->lru, &source);
			move_pages--;
			if (!__PageMovable(page))
				inc_node_page_state(page, NR_ISOLATED_ANON +
						    page_is_file_cache(page));

		} else {
#ifdef CONFIG_DEBUG_VM
			pr_alert("failed to isolate pfn %lx\n", pfn);
			dump_page(page, "isolation failed");
#endif
			put_page(page);
			/*
			 * Because we don't have a big zone->lock, we should
			 * check this again here.
			 */
			if (page_count(page)) {
				not_managed++;
				ret = -EBUSY;
				break;
			}
		}
	}
	if (!list_empty(&source)) {
		if (not_managed) {
			putback_movable_pages(&source);
			goto out;
		}

		/* Allocate a new page from the nearest neighbor node */
		ret = migrate_pages(&source, new_node_page, NULL, 0,
					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
		if (ret)
			putback_movable_pages(&source);
	}
out:
	return ret;
}

/*
 * remove from free_area[] and mark all as Reserved.
 */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
			void *data)
{
	__offline_isolated_pages(start, start + nr_pages);
	return 0;
}

static void
offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
				offline_isolated_pages_cb);
}

/*
 * Check that all pages in the range, recorded as a memory resource, are
 * isolated.
 */
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
			void *data)
{
	int ret;
	long offlined = *(long *)data;
	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
	offlined = nr_pages;
	if (!ret)
		*(long *)data += offlined;
	return ret;
}

static long
check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
	long offlined = 0;
	int ret;

	ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
			check_pages_isolated_cb);
	if (ret < 0)
		offlined = (long)ret;
	return offlined;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
 * normal memory.
 */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure the node has NORMAL memory if it is still online */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt;

	for (zt = 0; zt <= ZONE_NORMAL; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	if (present_pages > nr_pages)
		return true;

	present_pages = 0;
	for (; zt <= ZONE_MOVABLE; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	/*
	 * we can't offline the last normal memory until all
	 * higher memory is offlined.
	 */
	return present_pages == 0;
}
#endif /* CONFIG_MOVABLE_NODE */

static int __init cmdline_parse_movable_node(char *p)
{
#ifdef CONFIG_MOVABLE_NODE
	movable_node_enabled = true;
#else
	pr_warn("movable_node option not supported\n");
#endif
	return 0;
}
early_param("movable_node", cmdline_parse_movable_node);

/* check which state of node_states will be changed when offline memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
		struct zone *zone, struct memory_notify *arg)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt, zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
	 * If the memory to be offlined is in a zone of 0...zone_last,
	 * and it is the last present memory there, 0...zone_last will
	 * become empty after the offlining, thus we can determine that we
	 * will need to clear the node from node_states[N_NORMAL_MEMORY].
	 */
	for (zt = 0; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_normal = zone_to_nid(zone);
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_high = zone_to_nid(zone);
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
	 */
	zone_last = ZONE_MOVABLE;

	/*
	 * Check whether node_states[N_HIGH_MEMORY] will be changed.
	 * If we try to offline the last present @nr_pages from the node,
	 * we can determine that we will need to clear the node from
	 * node_states[N_HIGH_MEMORY].
	 */
	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (nr_pages >= present_pages)
		arg->status_change_nid = zone_to_nid(zone);
	else
		arg->status_change_nid = -1;
}

static void node_states_clear_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_clear_state(node, N_NORMAL_MEMORY);

	if ((N_MEMORY != N_NORMAL_MEMORY) &&
	    (arg->status_change_nid_high >= 0))
		node_clear_state(node, N_HIGH_MEMORY);

	if ((N_MEMORY != N_HIGH_MEMORY) &&
	    (arg->status_change_nid >= 0))
		node_clear_state(node, N_MEMORY);
}

static int __ref __offline_pages(unsigned long start_pfn,
		  unsigned long end_pfn, unsigned long timeout)
{
	unsigned long pfn, nr_pages, expire;
	long offlined_pages;
	int ret, drain, retry_max, node;
	unsigned long flags;
	unsigned long valid_start, valid_end;
	struct zone *zone;
	struct memory_notify arg;

	/* at least, alignment against pageblock is necessary */
	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
		return -EINVAL;
	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
		return -EINVAL;
	/*
	 * This makes hotplug much easier...and readable.
	 * We assume this for now.
	 */
	if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end))
		return -EINVAL;

	zone = page_zone(pfn_to_page(valid_start));
	node = zone_to_nid(zone);
	nr_pages = end_pfn - start_pfn;

	if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
		return -EINVAL;

	/* set above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn,
				       MIGRATE_MOVABLE, true);
	if (ret)
		return ret;

	arg.start_pfn = start_pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_offline(nr_pages, zone, &arg);

	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_removal;

	pfn = start_pfn;
	expire = jiffies + timeout;
	drain = 0;
	retry_max = 5;
repeat:
	/* start memory hot removal */
	ret = -EAGAIN;
	if (time_after(jiffies, expire))
		goto failed_removal;
	ret = -EINTR;
	if (signal_pending(current))
		goto failed_removal;
	ret = 0;
	if (drain) {
		lru_add_drain_all();
		cond_resched();
		drain_all_pages(zone);
	}

	pfn = scan_movable_pages(start_pfn, end_pfn);
	if (pfn) { /* We have movable pages */
		ret = do_migrate_range(pfn, end_pfn);
		if (!ret) {
			drain = 1;
			goto repeat;
		} else {
			if (ret < 0)
				if (--retry_max == 0)
					goto failed_removal;
			yield();
			drain = 1;
			goto repeat;
		}
	}
	/* drain all zone's lru pagevec, this is asynchronous... */
	lru_add_drain_all();
	yield();
	/* drain pcp pages, this is synchronous. */
	drain_all_pages(zone);
	/*
	 * dissolve free hugepages in the memory block before doing offlining
	 * actually in order to make hugetlbfs's object counting consistent.
	 */
	ret = dissolve_free_huge_pages(start_pfn, end_pfn);
	if (ret)
		goto failed_removal;
	/* check again */
	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
	if (offlined_pages < 0) {
		ret = -EBUSY;
		goto failed_removal;
	}
	pr_info("Offlined Pages %ld\n", offlined_pages);
	/*
	 * Ok, all of our target is isolated.
	 * We cannot do rollback at this point.
	 */
	offline_isolated_pages(start_pfn, end_pfn);
	/* reset pagetype flags and makes migrate type to be MOVABLE */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	/* removal success */
	adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
	zone->present_pages -= offlined_pages;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	zone->zone_pgdat->node_present_pages -= offlined_pages;
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	init_per_zone_wmark_min();

	if (!populated_zone(zone)) {
		zone_pcp_reset(zone);
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(NULL, NULL);
		mutex_unlock(&zonelists_mutex);
	} else
		zone_pcp_update(zone);

	node_states_clear_node(node, &arg);
	if (arg.status_change_nid >= 0) {
		kswapd_stop(node);
		kcompactd_stop(node);
	}

	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();

	memory_notify(MEM_OFFLINE, &arg);
	return 0;

failed_removal:
	pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n",
		 (unsigned long long) start_pfn << PAGE_SHIFT,
		 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
	memory_notify(MEM_CANCEL_OFFLINE, &arg);
	/* pushback to free area */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	return ret;
}

/* Must be protected by mem_hotplug_begin() */
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

/**
 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
 * @start_pfn: start pfn of the memory range
 * @end_pfn: end pfn of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory section walked
 *
 * This function walks through all present mem sections in range
 * [start_pfn, end_pfn) and call func on each mem section.
 *
 * Returns the return value of func.
 */
int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
		void *arg, int (*func)(struct memory_block *, void *))
{
	struct memory_block *mem = NULL;
	struct mem_section *section;
	unsigned long pfn, section_nr;
	int ret;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		section_nr = pfn_to_section_nr(pfn);
		if (!present_section_nr(section_nr))
			continue;

		section = __nr_to_section(section_nr);
		/* same memblock? */
		if (mem)
			if ((section_nr >= mem->start_section_nr) &&
			    (section_nr <= mem->end_section_nr))
				continue;

		mem = find_memory_block_hinted(section, mem);
		if (!mem)
			continue;

		ret = func(mem, arg);
		if (ret) {
			kobject_put(&mem->dev.kobj);
			return ret;
		}
	}

	if (mem)
		kobject_put(&mem->dev.kobj);

	return 0;
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
	int ret = !is_memblock_offlined(mem);

	if (unlikely(ret)) {
		phys_addr_t beginpa, endpa;

		beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
		endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
		pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
			&beginpa, &endpa);
	}

	return ret;
}

static int check_cpu_on_node(pg_data_t *pgdat)
{
	int cpu;

	for_each_present_cpu(cpu) {
		if (cpu_to_node(cpu) == pgdat->node_id)
			/*
			 * the cpu on this node isn't removed, and we can't
			 * offline this node.
			 */
			return -EBUSY;
	}

	return 0;
}

static void unmap_cpu_on_node(pg_data_t *pgdat)
{
#ifdef CONFIG_ACPI_NUMA
	int cpu;

	for_each_possible_cpu(cpu)
		if (cpu_to_node(cpu) == pgdat->node_id)
			numa_clear_node(cpu);
#endif
}

static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
{
	int ret;

	ret = check_cpu_on_node(pgdat);
	if (ret)
		return ret;

	/*
	 * the node will be offlined when we come here, so we can clear
	 * the cpu_to_node() now.
	 */

	unmap_cpu_on_node(pgdat);
	return 0;
}

/**
 * try_offline_node
 *
 * Offline a node if all memory sections and cpus of the node are removed.
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call.
 */
void try_offline_node(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	unsigned long start_pfn = pgdat->node_start_pfn;
	unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);

		if (!present_section_nr(section_nr))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/*
		 * some memory sections of this node are not removed, and we
		 * can't offline node now.
		 */
		return;
	}

	if (check_and_unmap_cpu_on_node(pgdat))
		return;

	/*
	 * all memory/cpu of this node are removed, we can offline this
	 * node now.
	 */
	node_set_offline(nid);
	unregister_one_node(nid);
}
EXPORT_SYMBOL(try_offline_node);

/**
 * remove_memory
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call, as required by
 * try_offline_node().
 */
void __ref remove_memory(int nid, u64 start, u64 size)
{
	int ret;

	BUG_ON(check_hotplug_memory_range(start, size));

	mem_hotplug_begin();

	/*
	 * All memory blocks must be offlined before removing memory. Check
	 * whether all memory blocks in question are offline and trigger a BUG()
	 * if this is not the case.
	 */
	ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
				check_memblock_offlined_cb);
	if (ret)
		BUG();

	/* remove memmap entry */
	firmware_map_remove(start, start + size, "System RAM");
	memblock_free(start, size);
	memblock_remove(start, size);

	arch_remove_memory(start, size);

	try_offline_node(nid);

	mem_hotplug_done();
}
EXPORT_SYMBOL_GPL(remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */
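
/*
 * Illustrative sketch (not part of the original file): how a driver might use
 * the callback-registration API described in the comment above
 * online_page_callback, in the style of balloon drivers. The example_* names
 * are hypothetical; only set_online_page_callback(),
 * restore_online_page_callback() and the exported __online_page_* helpers are
 * real symbols from this file. Kept under #if 0 so it is never compiled.
 */
#if 0
static void example_online_page(struct page *page)
{
	/*
	 * Account the hot-added page but hand it to the driver instead of
	 * freeing it straight to the buddy allocator.
	 */
	__online_page_set_limits(page);
	__online_page_increment_counters(page);
	example_driver_queue_page(page);	/* hypothetical driver helper */
}

static int __init example_driver_init(void)
{
	/* Fails with -EINVAL if another callback is already registered. */
	return set_online_page_callback(&example_online_page);
}

static void __exit example_driver_exit(void)
{
	/* Put generic_online_page() back in place. */
	restore_online_page_callback(&example_online_page);
}
#endif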