/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memremap.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/compaction.h>

#include <asm/tlbflush.h>

#include "internal.h"

/*
 * online_page_callback contains a pointer to the current page onlining
 * function.  Initially it is generic_online_page().  If required, it can be
 * changed by calling set_online_page_callback() to register a callback and
 * restore_online_page_callback() to restore the generic one.
 */

static void generic_online_page(struct page *page);

static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);
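
/*
 * Illustrative sketch, not part of the original file: a hypothetical
 * balloon-style driver could divert newly onlined pages to itself by
 * installing its own callback.  The example_* names below are made up;
 * only set_online_page_callback()/restore_online_page_callback() and the
 * __online_page_*() helpers are real interfaces of this file.
 */
static void example_online_page(struct page *page)
{
	/* mirrors generic_online_page(); a real driver might keep the page */
	__online_page_set_limits(page);
	__online_page_increment_counters(page);
	__online_page_free(page);
}

static int __maybe_unused example_claim_onlined_pages(void)
{
	int rc;

	rc = set_online_page_callback(&example_online_page);
	if (rc)
		return rc;	/* another callback is already registered */

	/* ... hot-added pages now reach example_online_page() ... */

	restore_online_page_callback(&example_online_page);
	return 0;
}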

/* The same as the cpu_hotplug lock, but for memory hotplug. */
static struct {
	struct task_struct *active_writer;
	struct mutex lock; /* Synchronizes accesses to refcount, */
	/*
	 * Also blocks the new readers during
	 * an ongoing mem hotplug operation.
	 */
	int refcount;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map dep_map;
#endif
} mem_hotplug = {
	.active_writer = NULL,
	.lock = __MUTEX_INITIALIZER(mem_hotplug.lock),
	.refcount = 0,
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	.dep_map = {.name = "mem_hotplug.lock" },
#endif
};

/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */
#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map)
#define memhp_lock_acquire()      lock_map_acquire(&mem_hotplug.dep_map)
#define memhp_lock_release()      lock_map_release(&mem_hotplug.dep_map)

#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
bool memhp_auto_online;
#else
bool memhp_auto_online = true;
#endif
EXPORT_SYMBOL_GPL(memhp_auto_online);

static int __init setup_memhp_default_state(char *str)
{
	if (!strcmp(str, "online"))
		memhp_auto_online = true;
	else if (!strcmp(str, "offline"))
		memhp_auto_online = false;

	return 1;
}
__setup("memhp_default_state=", setup_memhp_default_state);

void get_online_mems(void)
{
	might_sleep();
	if (mem_hotplug.active_writer == current)
		return;
	memhp_lock_acquire_read();
	mutex_lock(&mem_hotplug.lock);
	mem_hotplug.refcount++;
	mutex_unlock(&mem_hotplug.lock);

}

void put_online_mems(void)
{
	if (mem_hotplug.active_writer == current)
		return;
	mutex_lock(&mem_hotplug.lock);

	if (WARN_ON(!mem_hotplug.refcount))
		mem_hotplug.refcount++; /* try to fix things up */

	if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer))
		wake_up_process(mem_hotplug.active_writer);
	mutex_unlock(&mem_hotplug.lock);
	memhp_lock_release();

}

/* Serializes write accesses to mem_hotplug.active_writer.
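 *
 * Illustrative usage sketch (not from the original file): writers bracket a
 * whole hotplug operation with
 *
 *	mem_hotplug_begin();
 *	...add or remove memory...
 *	mem_hotplug_done();
 *
 * while readers that only need to keep hotplug away temporarily use
 *
 *	get_online_mems();
 *	...walk memory sections/zones...
 *	put_online_mems();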
*/ 129 static DEFINE_MUTEX(memory_add_remove_lock); 130 131 void mem_hotplug_begin(void) 132 { 133 mutex_lock(&memory_add_remove_lock); 134 135 mem_hotplug.active_writer = current; 136 137 memhp_lock_acquire(); 138 for (;;) { 139 mutex_lock(&mem_hotplug.lock); 140 if (likely(!mem_hotplug.refcount)) 141 break; 142 __set_current_state(TASK_UNINTERRUPTIBLE); 143 mutex_unlock(&mem_hotplug.lock); 144 schedule(); 145 } 146 } 147 148 void mem_hotplug_done(void) 149 { 150 mem_hotplug.active_writer = NULL; 151 mutex_unlock(&mem_hotplug.lock); 152 memhp_lock_release(); 153 mutex_unlock(&memory_add_remove_lock); 154 } 155 156 /* add this memory to iomem resource */ 157 static struct resource *register_memory_resource(u64 start, u64 size) 158 { 159 struct resource *res; 160 res = kzalloc(sizeof(struct resource), GFP_KERNEL); 161 if (!res) 162 return ERR_PTR(-ENOMEM); 163 164 res->name = "System RAM"; 165 res->start = start; 166 res->end = start + size - 1; 167 res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; 168 if (request_resource(&iomem_resource, res) < 0) { 169 pr_debug("System RAM resource %pR cannot be added\n", res); 170 kfree(res); 171 return ERR_PTR(-EEXIST); 172 } 173 return res; 174 } 175 176 static void release_memory_resource(struct resource *res) 177 { 178 if (!res) 179 return; 180 release_resource(res); 181 kfree(res); 182 return; 183 } 184 185 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 186 void get_page_bootmem(unsigned long info, struct page *page, 187 unsigned long type) 188 { 189 page->freelist = (void *)type; 190 SetPagePrivate(page); 191 set_page_private(page, info); 192 page_ref_inc(page); 193 } 194 195 void put_page_bootmem(struct page *page) 196 { 197 unsigned long type; 198 199 type = (unsigned long) page->freelist; 200 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 201 type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); 202 203 if (page_ref_dec_return(page) == 1) { 204 page->freelist = NULL; 205 ClearPagePrivate(page); 206 set_page_private(page, 0); 207 INIT_LIST_HEAD(&page->lru); 208 free_reserved_page(page); 209 } 210 } 211 212 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE 213 #ifndef CONFIG_SPARSEMEM_VMEMMAP 214 static void register_page_bootmem_info_section(unsigned long start_pfn) 215 { 216 unsigned long *usemap, mapsize, section_nr, i; 217 struct mem_section *ms; 218 struct page *page, *memmap; 219 220 section_nr = pfn_to_section_nr(start_pfn); 221 ms = __nr_to_section(section_nr); 222 223 /* Get section's memmap address */ 224 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); 225 226 /* 227 * Get page for the memmap's phys address 228 * XXX: need more consideration for sparse_vmemmap... 
229 */ 230 page = virt_to_page(memmap); 231 mapsize = sizeof(struct page) * PAGES_PER_SECTION; 232 mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; 233 234 /* remember memmap's page */ 235 for (i = 0; i < mapsize; i++, page++) 236 get_page_bootmem(section_nr, page, SECTION_INFO); 237 238 usemap = __nr_to_section(section_nr)->pageblock_flags; 239 page = virt_to_page(usemap); 240 241 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 242 243 for (i = 0; i < mapsize; i++, page++) 244 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 245 246 } 247 #else /* CONFIG_SPARSEMEM_VMEMMAP */ 248 static void register_page_bootmem_info_section(unsigned long start_pfn) 249 { 250 unsigned long *usemap, mapsize, section_nr, i; 251 struct mem_section *ms; 252 struct page *page, *memmap; 253 254 if (!pfn_valid(start_pfn)) 255 return; 256 257 section_nr = pfn_to_section_nr(start_pfn); 258 ms = __nr_to_section(section_nr); 259 260 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); 261 262 register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); 263 264 usemap = __nr_to_section(section_nr)->pageblock_flags; 265 page = virt_to_page(usemap); 266 267 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 268 269 for (i = 0; i < mapsize; i++, page++) 270 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 271 } 272 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 273 274 void __init register_page_bootmem_info_node(struct pglist_data *pgdat) 275 { 276 unsigned long i, pfn, end_pfn, nr_pages; 277 int node = pgdat->node_id; 278 struct page *page; 279 280 nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; 281 page = virt_to_page(pgdat); 282 283 for (i = 0; i < nr_pages; i++, page++) 284 get_page_bootmem(node, page, NODE_INFO); 285 286 pfn = pgdat->node_start_pfn; 287 end_pfn = pgdat_end_pfn(pgdat); 288 289 /* register section info */ 290 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 291 /* 292 * Some platforms can assign the same pfn to multiple nodes - on 293 * node0 as well as nodeN. To avoid registering a pfn against 294 * multiple nodes we check that this pfn does not already 295 * reside in some other nodes. 
296 */ 297 if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node)) 298 register_page_bootmem_info_section(pfn); 299 } 300 } 301 #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ 302 303 static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, 304 unsigned long end_pfn) 305 { 306 unsigned long old_zone_end_pfn; 307 308 zone_span_writelock(zone); 309 310 old_zone_end_pfn = zone_end_pfn(zone); 311 if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) 312 zone->zone_start_pfn = start_pfn; 313 314 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - 315 zone->zone_start_pfn; 316 317 zone_span_writeunlock(zone); 318 } 319 320 static void resize_zone(struct zone *zone, unsigned long start_pfn, 321 unsigned long end_pfn) 322 { 323 zone_span_writelock(zone); 324 325 if (end_pfn - start_pfn) { 326 zone->zone_start_pfn = start_pfn; 327 zone->spanned_pages = end_pfn - start_pfn; 328 } else { 329 /* 330 * make it consist as free_area_init_core(), 331 * if spanned_pages = 0, then keep start_pfn = 0 332 */ 333 zone->zone_start_pfn = 0; 334 zone->spanned_pages = 0; 335 } 336 337 zone_span_writeunlock(zone); 338 } 339 340 static void fix_zone_id(struct zone *zone, unsigned long start_pfn, 341 unsigned long end_pfn) 342 { 343 enum zone_type zid = zone_idx(zone); 344 int nid = zone->zone_pgdat->node_id; 345 unsigned long pfn; 346 347 for (pfn = start_pfn; pfn < end_pfn; pfn++) 348 set_page_links(pfn_to_page(pfn), zid, nid, pfn); 349 } 350 351 /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or 352 * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ 353 static int __ref ensure_zone_is_initialized(struct zone *zone, 354 unsigned long start_pfn, unsigned long num_pages) 355 { 356 if (!zone_is_initialized(zone)) 357 return init_currently_empty_zone(zone, start_pfn, num_pages); 358 359 return 0; 360 } 361 362 static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, 363 unsigned long start_pfn, unsigned long end_pfn) 364 { 365 int ret; 366 unsigned long flags; 367 unsigned long z1_start_pfn; 368 369 ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); 370 if (ret) 371 return ret; 372 373 pgdat_resize_lock(z1->zone_pgdat, &flags); 374 375 /* can't move pfns which are higher than @z2 */ 376 if (end_pfn > zone_end_pfn(z2)) 377 goto out_fail; 378 /* the move out part must be at the left most of @z2 */ 379 if (start_pfn > z2->zone_start_pfn) 380 goto out_fail; 381 /* must included/overlap */ 382 if (end_pfn <= z2->zone_start_pfn) 383 goto out_fail; 384 385 /* use start_pfn for z1's start_pfn if z1 is empty */ 386 if (!zone_is_empty(z1)) 387 z1_start_pfn = z1->zone_start_pfn; 388 else 389 z1_start_pfn = start_pfn; 390 391 resize_zone(z1, z1_start_pfn, end_pfn); 392 resize_zone(z2, end_pfn, zone_end_pfn(z2)); 393 394 pgdat_resize_unlock(z1->zone_pgdat, &flags); 395 396 fix_zone_id(z1, start_pfn, end_pfn); 397 398 return 0; 399 out_fail: 400 pgdat_resize_unlock(z1->zone_pgdat, &flags); 401 return -1; 402 } 403 404 static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, 405 unsigned long start_pfn, unsigned long end_pfn) 406 { 407 int ret; 408 unsigned long flags; 409 unsigned long z2_end_pfn; 410 411 ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); 412 if (ret) 413 return ret; 414 415 pgdat_resize_lock(z1->zone_pgdat, &flags); 416 417 /* can't move pfns which are lower than @z1 */ 418 if (z1->zone_start_pfn > start_pfn) 419 goto out_fail; 420 /* the move out part mast at the right most 
of @z1 */ 421 if (zone_end_pfn(z1) > end_pfn) 422 goto out_fail; 423 /* must included/overlap */ 424 if (start_pfn >= zone_end_pfn(z1)) 425 goto out_fail; 426 427 /* use end_pfn for z2's end_pfn if z2 is empty */ 428 if (!zone_is_empty(z2)) 429 z2_end_pfn = zone_end_pfn(z2); 430 else 431 z2_end_pfn = end_pfn; 432 433 resize_zone(z1, z1->zone_start_pfn, start_pfn); 434 resize_zone(z2, start_pfn, z2_end_pfn); 435 436 pgdat_resize_unlock(z1->zone_pgdat, &flags); 437 438 fix_zone_id(z2, start_pfn, end_pfn); 439 440 return 0; 441 out_fail: 442 pgdat_resize_unlock(z1->zone_pgdat, &flags); 443 return -1; 444 } 445 446 static struct zone * __meminit move_pfn_range(int zone_shift, 447 unsigned long start_pfn, unsigned long end_pfn) 448 { 449 struct zone *zone = page_zone(pfn_to_page(start_pfn)); 450 int ret = 0; 451 452 if (zone_shift < 0) 453 ret = move_pfn_range_left(zone + zone_shift, zone, 454 start_pfn, end_pfn); 455 else if (zone_shift) 456 ret = move_pfn_range_right(zone, zone + zone_shift, 457 start_pfn, end_pfn); 458 459 if (ret) 460 return NULL; 461 462 return zone + zone_shift; 463 } 464 465 static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, 466 unsigned long end_pfn) 467 { 468 unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); 469 470 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) 471 pgdat->node_start_pfn = start_pfn; 472 473 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - 474 pgdat->node_start_pfn; 475 } 476 477 static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) 478 { 479 struct pglist_data *pgdat = zone->zone_pgdat; 480 int nr_pages = PAGES_PER_SECTION; 481 int nid = pgdat->node_id; 482 int zone_type; 483 unsigned long flags, pfn; 484 int ret; 485 486 zone_type = zone - pgdat->node_zones; 487 ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); 488 if (ret) 489 return ret; 490 491 pgdat_resize_lock(zone->zone_pgdat, &flags); 492 grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); 493 grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, 494 phys_start_pfn + nr_pages); 495 pgdat_resize_unlock(zone->zone_pgdat, &flags); 496 memmap_init_zone(nr_pages, nid, zone_type, 497 phys_start_pfn, MEMMAP_HOTPLUG); 498 499 /* online_page_range is called later and expects pages reserved */ 500 for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) { 501 if (!pfn_valid(pfn)) 502 continue; 503 504 SetPageReserved(pfn_to_page(pfn)); 505 } 506 return 0; 507 } 508 509 static int __meminit __add_section(int nid, struct zone *zone, 510 unsigned long phys_start_pfn) 511 { 512 int ret; 513 514 if (pfn_valid(phys_start_pfn)) 515 return -EEXIST; 516 517 ret = sparse_add_one_section(zone, phys_start_pfn); 518 519 if (ret < 0) 520 return ret; 521 522 ret = __add_zone(zone, phys_start_pfn); 523 524 if (ret < 0) 525 return ret; 526 527 return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); 528 } 529 530 /* 531 * Reasonably generic function for adding memory. It is 532 * expected that archs that support memory hotplug will 533 * call this function after deciding the zone to which to 534 * add the new pages. 
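 *
 * Illustrative sketch (an assumption, not taken from any particular
 * architecture): an arch_add_memory() implementation typically does roughly
 *
 *	struct pglist_data *pgdat = NODE_DATA(nid);
 *	struct zone *zone = pgdat->node_zones +
 *		zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
 *
 *	return __add_pages(nid, zone, start >> PAGE_SHIFT, size >> PAGE_SHIFT);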
535 */ 536 int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, 537 unsigned long nr_pages) 538 { 539 unsigned long i; 540 int err = 0; 541 int start_sec, end_sec; 542 struct vmem_altmap *altmap; 543 544 clear_zone_contiguous(zone); 545 546 /* during initialize mem_map, align hot-added range to section */ 547 start_sec = pfn_to_section_nr(phys_start_pfn); 548 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); 549 550 altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn)); 551 if (altmap) { 552 /* 553 * Validate altmap is within bounds of the total request 554 */ 555 if (altmap->base_pfn != phys_start_pfn 556 || vmem_altmap_offset(altmap) > nr_pages) { 557 pr_warn_once("memory add fail, invalid altmap\n"); 558 err = -EINVAL; 559 goto out; 560 } 561 altmap->alloc = 0; 562 } 563 564 for (i = start_sec; i <= end_sec; i++) { 565 err = __add_section(nid, zone, section_nr_to_pfn(i)); 566 567 /* 568 * EEXIST is finally dealt with by ioresource collision 569 * check. see add_memory() => register_memory_resource() 570 * Warning will be printed if there is collision. 571 */ 572 if (err && (err != -EEXIST)) 573 break; 574 err = 0; 575 } 576 vmemmap_populate_print_last(); 577 out: 578 set_zone_contiguous(zone); 579 return err; 580 } 581 EXPORT_SYMBOL_GPL(__add_pages); 582 583 #ifdef CONFIG_MEMORY_HOTREMOVE 584 /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ 585 static int find_smallest_section_pfn(int nid, struct zone *zone, 586 unsigned long start_pfn, 587 unsigned long end_pfn) 588 { 589 struct mem_section *ms; 590 591 for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { 592 ms = __pfn_to_section(start_pfn); 593 594 if (unlikely(!valid_section(ms))) 595 continue; 596 597 if (unlikely(pfn_to_nid(start_pfn) != nid)) 598 continue; 599 600 if (zone && zone != page_zone(pfn_to_page(start_pfn))) 601 continue; 602 603 return start_pfn; 604 } 605 606 return 0; 607 } 608 609 /* find the biggest valid pfn in the range [start_pfn, end_pfn). */ 610 static int find_biggest_section_pfn(int nid, struct zone *zone, 611 unsigned long start_pfn, 612 unsigned long end_pfn) 613 { 614 struct mem_section *ms; 615 unsigned long pfn; 616 617 /* pfn is the end pfn of a memory section. */ 618 pfn = end_pfn - 1; 619 for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { 620 ms = __pfn_to_section(pfn); 621 622 if (unlikely(!valid_section(ms))) 623 continue; 624 625 if (unlikely(pfn_to_nid(pfn) != nid)) 626 continue; 627 628 if (zone && zone != page_zone(pfn_to_page(pfn))) 629 continue; 630 631 return pfn; 632 } 633 634 return 0; 635 } 636 637 static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, 638 unsigned long end_pfn) 639 { 640 unsigned long zone_start_pfn = zone->zone_start_pfn; 641 unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ 642 unsigned long zone_end_pfn = z; 643 unsigned long pfn; 644 struct mem_section *ms; 645 int nid = zone_to_nid(zone); 646 647 zone_span_writelock(zone); 648 if (zone_start_pfn == start_pfn) { 649 /* 650 * If the section is smallest section in the zone, it need 651 * shrink zone->zone_start_pfn and zone->zone_spanned_pages. 652 * In this case, we find second smallest valid mem_section 653 * for shrinking zone. 
654 */ 655 pfn = find_smallest_section_pfn(nid, zone, end_pfn, 656 zone_end_pfn); 657 if (pfn) { 658 zone->zone_start_pfn = pfn; 659 zone->spanned_pages = zone_end_pfn - pfn; 660 } 661 } else if (zone_end_pfn == end_pfn) { 662 /* 663 * If the section is biggest section in the zone, it need 664 * shrink zone->spanned_pages. 665 * In this case, we find second biggest valid mem_section for 666 * shrinking zone. 667 */ 668 pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn, 669 start_pfn); 670 if (pfn) 671 zone->spanned_pages = pfn - zone_start_pfn + 1; 672 } 673 674 /* 675 * The section is not biggest or smallest mem_section in the zone, it 676 * only creates a hole in the zone. So in this case, we need not 677 * change the zone. But perhaps, the zone has only hole data. Thus 678 * it check the zone has only hole or not. 679 */ 680 pfn = zone_start_pfn; 681 for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { 682 ms = __pfn_to_section(pfn); 683 684 if (unlikely(!valid_section(ms))) 685 continue; 686 687 if (page_zone(pfn_to_page(pfn)) != zone) 688 continue; 689 690 /* If the section is current section, it continues the loop */ 691 if (start_pfn == pfn) 692 continue; 693 694 /* If we find valid section, we have nothing to do */ 695 zone_span_writeunlock(zone); 696 return; 697 } 698 699 /* The zone has no valid section */ 700 zone->zone_start_pfn = 0; 701 zone->spanned_pages = 0; 702 zone_span_writeunlock(zone); 703 } 704 705 static void shrink_pgdat_span(struct pglist_data *pgdat, 706 unsigned long start_pfn, unsigned long end_pfn) 707 { 708 unsigned long pgdat_start_pfn = pgdat->node_start_pfn; 709 unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */ 710 unsigned long pgdat_end_pfn = p; 711 unsigned long pfn; 712 struct mem_section *ms; 713 int nid = pgdat->node_id; 714 715 if (pgdat_start_pfn == start_pfn) { 716 /* 717 * If the section is smallest section in the pgdat, it need 718 * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages. 719 * In this case, we find second smallest valid mem_section 720 * for shrinking zone. 721 */ 722 pfn = find_smallest_section_pfn(nid, NULL, end_pfn, 723 pgdat_end_pfn); 724 if (pfn) { 725 pgdat->node_start_pfn = pfn; 726 pgdat->node_spanned_pages = pgdat_end_pfn - pfn; 727 } 728 } else if (pgdat_end_pfn == end_pfn) { 729 /* 730 * If the section is biggest section in the pgdat, it need 731 * shrink pgdat->node_spanned_pages. 732 * In this case, we find second biggest valid mem_section for 733 * shrinking zone. 734 */ 735 pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn, 736 start_pfn); 737 if (pfn) 738 pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1; 739 } 740 741 /* 742 * If the section is not biggest or smallest mem_section in the pgdat, 743 * it only creates a hole in the pgdat. So in this case, we need not 744 * change the pgdat. 745 * But perhaps, the pgdat has only hole data. Thus it check the pgdat 746 * has only hole or not. 
 */
	pfn = pgdat_start_pfn;
	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/* If the section is the current section, continue the loop */
		if (start_pfn == pfn)
			continue;

		/* If we find a valid section, we have nothing to do */
		return;
	}

	/* The pgdat has no valid section */
	pgdat->node_start_pfn = 0;
	pgdat->node_spanned_pages = 0;
}

static void __remove_zone(struct zone *zone, unsigned long start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int zone_type;
	unsigned long flags;

	zone_type = zone - pgdat->node_zones;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
}

static int __remove_section(struct zone *zone, struct mem_section *ms,
		unsigned long map_offset)
{
	unsigned long start_pfn;
	int scn_nr;
	int ret = -EINVAL;

	if (!valid_section(ms))
		return ret;

	ret = unregister_memory_section(ms);
	if (ret)
		return ret;

	scn_nr = __section_nr(ms);
	start_pfn = section_nr_to_pfn(scn_nr);
	__remove_zone(zone, start_pfn);

	sparse_remove_one_section(zone, ms, map_offset);
	return 0;
}

/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
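 *
 * Illustrative sketch (an assumption, not taken from any particular
 * architecture): an arch_remove_memory() implementation ends up doing
 * roughly
 *
 *	struct zone *zone = page_zone(pfn_to_page(start >> PAGE_SHIFT));
 *
 *	return __remove_pages(zone, start >> PAGE_SHIFT, size >> PAGE_SHIFT);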
818 */ 819 int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, 820 unsigned long nr_pages) 821 { 822 unsigned long i; 823 unsigned long map_offset = 0; 824 int sections_to_remove, ret = 0; 825 826 /* In the ZONE_DEVICE case device driver owns the memory region */ 827 if (is_dev_zone(zone)) { 828 struct page *page = pfn_to_page(phys_start_pfn); 829 struct vmem_altmap *altmap; 830 831 altmap = to_vmem_altmap((unsigned long) page); 832 if (altmap) 833 map_offset = vmem_altmap_offset(altmap); 834 } else { 835 resource_size_t start, size; 836 837 start = phys_start_pfn << PAGE_SHIFT; 838 size = nr_pages * PAGE_SIZE; 839 840 ret = release_mem_region_adjustable(&iomem_resource, start, 841 size); 842 if (ret) { 843 resource_size_t endres = start + size - 1; 844 845 pr_warn("Unable to release resource <%pa-%pa> (%d)\n", 846 &start, &endres, ret); 847 } 848 } 849 850 clear_zone_contiguous(zone); 851 852 /* 853 * We can only remove entire sections 854 */ 855 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 856 BUG_ON(nr_pages % PAGES_PER_SECTION); 857 858 sections_to_remove = nr_pages / PAGES_PER_SECTION; 859 for (i = 0; i < sections_to_remove; i++) { 860 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 861 862 ret = __remove_section(zone, __pfn_to_section(pfn), map_offset); 863 map_offset = 0; 864 if (ret) 865 break; 866 } 867 868 set_zone_contiguous(zone); 869 870 return ret; 871 } 872 #endif /* CONFIG_MEMORY_HOTREMOVE */ 873 874 int set_online_page_callback(online_page_callback_t callback) 875 { 876 int rc = -EINVAL; 877 878 get_online_mems(); 879 mutex_lock(&online_page_callback_lock); 880 881 if (online_page_callback == generic_online_page) { 882 online_page_callback = callback; 883 rc = 0; 884 } 885 886 mutex_unlock(&online_page_callback_lock); 887 put_online_mems(); 888 889 return rc; 890 } 891 EXPORT_SYMBOL_GPL(set_online_page_callback); 892 893 int restore_online_page_callback(online_page_callback_t callback) 894 { 895 int rc = -EINVAL; 896 897 get_online_mems(); 898 mutex_lock(&online_page_callback_lock); 899 900 if (online_page_callback == callback) { 901 online_page_callback = generic_online_page; 902 rc = 0; 903 } 904 905 mutex_unlock(&online_page_callback_lock); 906 put_online_mems(); 907 908 return rc; 909 } 910 EXPORT_SYMBOL_GPL(restore_online_page_callback); 911 912 void __online_page_set_limits(struct page *page) 913 { 914 } 915 EXPORT_SYMBOL_GPL(__online_page_set_limits); 916 917 void __online_page_increment_counters(struct page *page) 918 { 919 adjust_managed_page_count(page, 1); 920 } 921 EXPORT_SYMBOL_GPL(__online_page_increment_counters); 922 923 void __online_page_free(struct page *page) 924 { 925 __free_reserved_page(page); 926 } 927 EXPORT_SYMBOL_GPL(__online_page_free); 928 929 static void generic_online_page(struct page *page) 930 { 931 __online_page_set_limits(page); 932 __online_page_increment_counters(page); 933 __online_page_free(page); 934 } 935 936 static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, 937 void *arg) 938 { 939 unsigned long i; 940 unsigned long onlined_pages = *(unsigned long *)arg; 941 struct page *page; 942 if (PageReserved(pfn_to_page(start_pfn))) 943 for (i = 0; i < nr_pages; i++) { 944 page = pfn_to_page(start_pfn + i); 945 (*online_page_callback)(page); 946 onlined_pages++; 947 } 948 *(unsigned long *)arg = onlined_pages; 949 return 0; 950 } 951 952 #ifdef CONFIG_MOVABLE_NODE 953 /* 954 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have 955 * normal memory. 
 */
static bool can_online_high_movable(struct zone *zone)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure every online node has NORMAL memory */
static bool can_online_high_movable(struct zone *zone)
{
	return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which state of node_states will be changed when onlining memory */
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);
	enum zone_type zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * If the memory to be onlined is in a zone of 0...zone_last, and
	 * the zones of 0...zone_last don't have memory before onlining, we
	 * will need to set the node in node_states[N_NORMAL_MEMORY] after
	 * the memory is onlined.
	 */
	if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * If the node doesn't have memory before onlining, we will need to
	 * set the node in node_states[N_MEMORY] after the memory
	 * is onlined.
1025 */ 1026 if (!node_state(nid, N_MEMORY)) 1027 arg->status_change_nid = nid; 1028 else 1029 arg->status_change_nid = -1; 1030 } 1031 1032 static void node_states_set_node(int node, struct memory_notify *arg) 1033 { 1034 if (arg->status_change_nid_normal >= 0) 1035 node_set_state(node, N_NORMAL_MEMORY); 1036 1037 if (arg->status_change_nid_high >= 0) 1038 node_set_state(node, N_HIGH_MEMORY); 1039 1040 node_set_state(node, N_MEMORY); 1041 } 1042 1043 bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, 1044 enum zone_type target, int *zone_shift) 1045 { 1046 struct zone *zone = page_zone(pfn_to_page(pfn)); 1047 enum zone_type idx = zone_idx(zone); 1048 int i; 1049 1050 *zone_shift = 0; 1051 1052 if (idx < target) { 1053 /* pages must be at end of current zone */ 1054 if (pfn + nr_pages != zone_end_pfn(zone)) 1055 return false; 1056 1057 /* no zones in use between current zone and target */ 1058 for (i = idx + 1; i < target; i++) 1059 if (zone_is_initialized(zone - idx + i)) 1060 return false; 1061 } 1062 1063 if (target < idx) { 1064 /* pages must be at beginning of current zone */ 1065 if (pfn != zone->zone_start_pfn) 1066 return false; 1067 1068 /* no zones in use between current zone and target */ 1069 for (i = target + 1; i < idx; i++) 1070 if (zone_is_initialized(zone - idx + i)) 1071 return false; 1072 } 1073 1074 *zone_shift = target - idx; 1075 return true; 1076 } 1077 1078 /* Must be protected by mem_hotplug_begin() */ 1079 int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) 1080 { 1081 unsigned long flags; 1082 unsigned long onlined_pages = 0; 1083 struct zone *zone; 1084 int need_zonelists_rebuild = 0; 1085 int nid; 1086 int ret; 1087 struct memory_notify arg; 1088 int zone_shift = 0; 1089 1090 /* 1091 * This doesn't need a lock to do pfn_to_page(). 1092 * The section can't be removed here because of the 1093 * memory_block->state_mutex. 1094 */ 1095 zone = page_zone(pfn_to_page(pfn)); 1096 1097 if ((zone_idx(zone) > ZONE_NORMAL || 1098 online_type == MMOP_ONLINE_MOVABLE) && 1099 !can_online_high_movable(zone)) 1100 return -EINVAL; 1101 1102 if (online_type == MMOP_ONLINE_KERNEL) { 1103 if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift)) 1104 return -EINVAL; 1105 } else if (online_type == MMOP_ONLINE_MOVABLE) { 1106 if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift)) 1107 return -EINVAL; 1108 } 1109 1110 zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages); 1111 if (!zone) 1112 return -EINVAL; 1113 1114 arg.start_pfn = pfn; 1115 arg.nr_pages = nr_pages; 1116 node_states_check_changes_online(nr_pages, zone, &arg); 1117 1118 nid = zone_to_nid(zone); 1119 1120 ret = memory_notify(MEM_GOING_ONLINE, &arg); 1121 ret = notifier_to_errno(ret); 1122 if (ret) 1123 goto failed_addition; 1124 1125 /* 1126 * If this zone is not populated, then it is not in zonelist. 1127 * This means the page allocator ignores this zone. 1128 * So, zonelist must be updated after online. 
1129 */ 1130 mutex_lock(&zonelists_mutex); 1131 if (!populated_zone(zone)) { 1132 need_zonelists_rebuild = 1; 1133 build_all_zonelists(NULL, zone); 1134 } 1135 1136 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 1137 online_pages_range); 1138 if (ret) { 1139 if (need_zonelists_rebuild) 1140 zone_pcp_reset(zone); 1141 mutex_unlock(&zonelists_mutex); 1142 goto failed_addition; 1143 } 1144 1145 zone->present_pages += onlined_pages; 1146 1147 pgdat_resize_lock(zone->zone_pgdat, &flags); 1148 zone->zone_pgdat->node_present_pages += onlined_pages; 1149 pgdat_resize_unlock(zone->zone_pgdat, &flags); 1150 1151 if (onlined_pages) { 1152 node_states_set_node(nid, &arg); 1153 if (need_zonelists_rebuild) 1154 build_all_zonelists(NULL, NULL); 1155 else 1156 zone_pcp_update(zone); 1157 } 1158 1159 mutex_unlock(&zonelists_mutex); 1160 1161 init_per_zone_wmark_min(); 1162 1163 if (onlined_pages) { 1164 kswapd_run(nid); 1165 kcompactd_run(nid); 1166 } 1167 1168 vm_total_pages = nr_free_pagecache_pages(); 1169 1170 writeback_set_ratelimit(); 1171 1172 if (onlined_pages) 1173 memory_notify(MEM_ONLINE, &arg); 1174 return 0; 1175 1176 failed_addition: 1177 pr_debug("online_pages [mem %#010llx-%#010llx] failed\n", 1178 (unsigned long long) pfn << PAGE_SHIFT, 1179 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); 1180 memory_notify(MEM_CANCEL_ONLINE, &arg); 1181 return ret; 1182 } 1183 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ 1184 1185 static void reset_node_present_pages(pg_data_t *pgdat) 1186 { 1187 struct zone *z; 1188 1189 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) 1190 z->present_pages = 0; 1191 1192 pgdat->node_present_pages = 0; 1193 } 1194 1195 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1196 static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) 1197 { 1198 struct pglist_data *pgdat; 1199 unsigned long zones_size[MAX_NR_ZONES] = {0}; 1200 unsigned long zholes_size[MAX_NR_ZONES] = {0}; 1201 unsigned long start_pfn = PFN_DOWN(start); 1202 1203 pgdat = NODE_DATA(nid); 1204 if (!pgdat) { 1205 pgdat = arch_alloc_nodedata(nid); 1206 if (!pgdat) 1207 return NULL; 1208 1209 arch_refresh_nodedata(nid, pgdat); 1210 } else { 1211 /* 1212 * Reset the nr_zones, order and classzone_idx before reuse. 1213 * Note that kswapd will init kswapd_classzone_idx properly 1214 * when it starts in the near future. 1215 */ 1216 pgdat->nr_zones = 0; 1217 pgdat->kswapd_order = 0; 1218 pgdat->kswapd_classzone_idx = 0; 1219 } 1220 1221 /* we can use NODE_DATA(nid) from here */ 1222 1223 /* init node's zones as empty zones, we don't have any present pages.*/ 1224 free_area_init_node(nid, zones_size, start_pfn, zholes_size); 1225 pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); 1226 1227 /* 1228 * The node we allocated has no zone fallback lists. For avoiding 1229 * to access not-initialized zonelist, build here. 1230 */ 1231 mutex_lock(&zonelists_mutex); 1232 build_all_zonelists(pgdat, NULL); 1233 mutex_unlock(&zonelists_mutex); 1234 1235 /* 1236 * zone->managed_pages is set to an approximate value in 1237 * free_area_init_core(), which will cause 1238 * /sys/device/system/node/nodeX/meminfo has wrong data. 1239 * So reset it to 0 before any memory is onlined. 1240 */ 1241 reset_node_managed_pages(pgdat); 1242 1243 /* 1244 * When memory is hot-added, all the memory is in offline state. So 1245 * clear all zones' present_pages because they will be updated in 1246 * online_pages() and offline_pages(). 
1247 */ 1248 reset_node_present_pages(pgdat); 1249 1250 return pgdat; 1251 } 1252 1253 static void rollback_node_hotadd(int nid, pg_data_t *pgdat) 1254 { 1255 arch_refresh_nodedata(nid, NULL); 1256 free_percpu(pgdat->per_cpu_nodestats); 1257 arch_free_nodedata(pgdat); 1258 return; 1259 } 1260 1261 1262 /** 1263 * try_online_node - online a node if offlined 1264 * 1265 * called by cpu_up() to online a node without onlined memory. 1266 */ 1267 int try_online_node(int nid) 1268 { 1269 pg_data_t *pgdat; 1270 int ret; 1271 1272 if (node_online(nid)) 1273 return 0; 1274 1275 mem_hotplug_begin(); 1276 pgdat = hotadd_new_pgdat(nid, 0); 1277 if (!pgdat) { 1278 pr_err("Cannot online node %d due to NULL pgdat\n", nid); 1279 ret = -ENOMEM; 1280 goto out; 1281 } 1282 node_set_online(nid); 1283 ret = register_one_node(nid); 1284 BUG_ON(ret); 1285 1286 if (pgdat->node_zonelists->_zonerefs->zone == NULL) { 1287 mutex_lock(&zonelists_mutex); 1288 build_all_zonelists(NULL, NULL); 1289 mutex_unlock(&zonelists_mutex); 1290 } 1291 1292 out: 1293 mem_hotplug_done(); 1294 return ret; 1295 } 1296 1297 static int check_hotplug_memory_range(u64 start, u64 size) 1298 { 1299 u64 start_pfn = PFN_DOWN(start); 1300 u64 nr_pages = size >> PAGE_SHIFT; 1301 1302 /* Memory range must be aligned with section */ 1303 if ((start_pfn & ~PAGE_SECTION_MASK) || 1304 (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) { 1305 pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n", 1306 (unsigned long long)start, 1307 (unsigned long long)size); 1308 return -EINVAL; 1309 } 1310 1311 return 0; 1312 } 1313 1314 /* 1315 * If movable zone has already been setup, newly added memory should be check. 1316 * If its address is higher than movable zone, it should be added as movable. 1317 * Without this check, movable zone may overlap with other zone. 1318 */ 1319 static int should_add_memory_movable(int nid, u64 start, u64 size) 1320 { 1321 unsigned long start_pfn = start >> PAGE_SHIFT; 1322 pg_data_t *pgdat = NODE_DATA(nid); 1323 struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; 1324 1325 if (zone_is_empty(movable_zone)) 1326 return 0; 1327 1328 if (movable_zone->zone_start_pfn <= start_pfn) 1329 return 1; 1330 1331 return 0; 1332 } 1333 1334 int zone_for_memory(int nid, u64 start, u64 size, int zone_default, 1335 bool for_device) 1336 { 1337 #ifdef CONFIG_ZONE_DEVICE 1338 if (for_device) 1339 return ZONE_DEVICE; 1340 #endif 1341 if (should_add_memory_movable(nid, start, size)) 1342 return ZONE_MOVABLE; 1343 1344 return zone_default; 1345 } 1346 1347 static int online_memory_block(struct memory_block *mem, void *arg) 1348 { 1349 return device_online(&mem->dev); 1350 } 1351 1352 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1353 int __ref add_memory_resource(int nid, struct resource *res, bool online) 1354 { 1355 u64 start, size; 1356 pg_data_t *pgdat = NULL; 1357 bool new_pgdat; 1358 bool new_node; 1359 int ret; 1360 1361 start = res->start; 1362 size = resource_size(res); 1363 1364 ret = check_hotplug_memory_range(start, size); 1365 if (ret) 1366 return ret; 1367 1368 { /* Stupid hack to suppress address-never-null warning */ 1369 void *p = NODE_DATA(nid); 1370 new_pgdat = !p; 1371 } 1372 1373 mem_hotplug_begin(); 1374 1375 /* 1376 * Add new range to memblock so that when hotadd_new_pgdat() is called 1377 * to allocate new pgdat, get_pfn_range_for_nid() will be able to find 1378 * this new range and calculate total pages correctly. The range will 1379 * be removed at hot-remove time. 
1380 */ 1381 memblock_add_node(start, size, nid); 1382 1383 new_node = !node_online(nid); 1384 if (new_node) { 1385 pgdat = hotadd_new_pgdat(nid, start); 1386 ret = -ENOMEM; 1387 if (!pgdat) 1388 goto error; 1389 } 1390 1391 /* call arch's memory hotadd */ 1392 ret = arch_add_memory(nid, start, size, false); 1393 1394 if (ret < 0) 1395 goto error; 1396 1397 /* we online node here. we can't roll back from here. */ 1398 node_set_online(nid); 1399 1400 if (new_node) { 1401 ret = register_one_node(nid); 1402 /* 1403 * If sysfs file of new node can't create, cpu on the node 1404 * can't be hot-added. There is no rollback way now. 1405 * So, check by BUG_ON() to catch it reluctantly.. 1406 */ 1407 BUG_ON(ret); 1408 } 1409 1410 /* create new memmap entry */ 1411 firmware_map_add_hotplug(start, start + size, "System RAM"); 1412 1413 /* online pages if requested */ 1414 if (online) 1415 walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), 1416 NULL, online_memory_block); 1417 1418 goto out; 1419 1420 error: 1421 /* rollback pgdat allocation and others */ 1422 if (new_pgdat) 1423 rollback_node_hotadd(nid, pgdat); 1424 memblock_remove(start, size); 1425 1426 out: 1427 mem_hotplug_done(); 1428 return ret; 1429 } 1430 EXPORT_SYMBOL_GPL(add_memory_resource); 1431 1432 int __ref add_memory(int nid, u64 start, u64 size) 1433 { 1434 struct resource *res; 1435 int ret; 1436 1437 res = register_memory_resource(start, size); 1438 if (IS_ERR(res)) 1439 return PTR_ERR(res); 1440 1441 ret = add_memory_resource(nid, res, memhp_auto_online); 1442 if (ret < 0) 1443 release_memory_resource(res); 1444 return ret; 1445 } 1446 EXPORT_SYMBOL_GPL(add_memory); 1447 1448 #ifdef CONFIG_MEMORY_HOTREMOVE 1449 /* 1450 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy 1451 * set and the size of the free page is given by page_order(). Using this, 1452 * the function determines if the pageblock contains only free pages. 1453 * Due to buddy contraints, a free page at least the size of a pageblock will 1454 * be located at the start of the pageblock 1455 */ 1456 static inline int pageblock_free(struct page *page) 1457 { 1458 return PageBuddy(page) && page_order(page) >= pageblock_order; 1459 } 1460 1461 /* Return the start of the next active pageblock after a given page */ 1462 static struct page *next_active_pageblock(struct page *page) 1463 { 1464 /* Ensure the starting page is pageblock-aligned */ 1465 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); 1466 1467 /* If the entire pageblock is free, move to the end of free page */ 1468 if (pageblock_free(page)) { 1469 int order; 1470 /* be careful. we don't have locks, page_order can be changed.*/ 1471 order = page_order(page); 1472 if ((order < MAX_ORDER) && (order >= pageblock_order)) 1473 return page + (1 << order); 1474 } 1475 1476 return page + pageblock_nr_pages; 1477 } 1478 1479 /* Checks if this range of memory is likely to be hot-removable. 
*/ 1480 bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) 1481 { 1482 struct page *page = pfn_to_page(start_pfn); 1483 struct page *end_page = page + nr_pages; 1484 1485 /* Check the starting page of each pageblock within the range */ 1486 for (; page < end_page; page = next_active_pageblock(page)) { 1487 if (!is_pageblock_removable_nolock(page)) 1488 return false; 1489 cond_resched(); 1490 } 1491 1492 /* All pageblocks in the memory block are likely to be hot-removable */ 1493 return true; 1494 } 1495 1496 /* 1497 * Confirm all pages in a range [start, end) belong to the same zone. 1498 * When true, return its valid [start, end). 1499 */ 1500 int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, 1501 unsigned long *valid_start, unsigned long *valid_end) 1502 { 1503 unsigned long pfn, sec_end_pfn; 1504 unsigned long start, end; 1505 struct zone *zone = NULL; 1506 struct page *page; 1507 int i; 1508 for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1); 1509 pfn < end_pfn; 1510 pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) { 1511 /* Make sure the memory section is present first */ 1512 if (!present_section_nr(pfn_to_section_nr(pfn))) 1513 continue; 1514 for (; pfn < sec_end_pfn && pfn < end_pfn; 1515 pfn += MAX_ORDER_NR_PAGES) { 1516 i = 0; 1517 /* This is just a CONFIG_HOLES_IN_ZONE check.*/ 1518 while ((i < MAX_ORDER_NR_PAGES) && 1519 !pfn_valid_within(pfn + i)) 1520 i++; 1521 if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn) 1522 continue; 1523 page = pfn_to_page(pfn + i); 1524 if (zone && page_zone(page) != zone) 1525 return 0; 1526 if (!zone) 1527 start = pfn + i; 1528 zone = page_zone(page); 1529 end = pfn + MAX_ORDER_NR_PAGES; 1530 } 1531 } 1532 1533 if (zone) { 1534 *valid_start = start; 1535 *valid_end = min(end, end_pfn); 1536 return 1; 1537 } else { 1538 return 0; 1539 } 1540 } 1541 1542 /* 1543 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, 1544 * non-lru movable pages and hugepages). We scan pfn because it's much 1545 * easier than scanning over linked list. This function returns the pfn 1546 * of the first found movable page if it's found, otherwise 0. 1547 */ 1548 static unsigned long scan_movable_pages(unsigned long start, unsigned long end) 1549 { 1550 unsigned long pfn; 1551 struct page *page; 1552 for (pfn = start; pfn < end; pfn++) { 1553 if (pfn_valid(pfn)) { 1554 page = pfn_to_page(pfn); 1555 if (PageLRU(page)) 1556 return pfn; 1557 if (__PageMovable(page)) 1558 return pfn; 1559 if (PageHuge(page)) { 1560 if (page_huge_active(page)) 1561 return pfn; 1562 else 1563 pfn = round_up(pfn + 1, 1564 1 << compound_order(page)) - 1; 1565 } 1566 } 1567 } 1568 return 0; 1569 } 1570 1571 static struct page *new_node_page(struct page *page, unsigned long private, 1572 int **result) 1573 { 1574 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; 1575 int nid = page_to_nid(page); 1576 nodemask_t nmask = node_states[N_MEMORY]; 1577 struct page *new_page = NULL; 1578 1579 /* 1580 * TODO: allocate a destination hugepage from a nearest neighbor node, 1581 * accordance with memory policy of the user process if possible. For 1582 * now as a simple work-around, we use the next node for destination. 
1583 */ 1584 if (PageHuge(page)) 1585 return alloc_huge_page_node(page_hstate(compound_head(page)), 1586 next_node_in(nid, nmask)); 1587 1588 node_clear(nid, nmask); 1589 1590 if (PageHighMem(page) 1591 || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) 1592 gfp_mask |= __GFP_HIGHMEM; 1593 1594 if (!nodes_empty(nmask)) 1595 new_page = __alloc_pages_nodemask(gfp_mask, 0, 1596 node_zonelist(nid, gfp_mask), &nmask); 1597 if (!new_page) 1598 new_page = __alloc_pages(gfp_mask, 0, 1599 node_zonelist(nid, gfp_mask)); 1600 1601 return new_page; 1602 } 1603 1604 #define NR_OFFLINE_AT_ONCE_PAGES (256) 1605 static int 1606 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 1607 { 1608 unsigned long pfn; 1609 struct page *page; 1610 int move_pages = NR_OFFLINE_AT_ONCE_PAGES; 1611 int not_managed = 0; 1612 int ret = 0; 1613 LIST_HEAD(source); 1614 1615 for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { 1616 if (!pfn_valid(pfn)) 1617 continue; 1618 page = pfn_to_page(pfn); 1619 1620 if (PageHuge(page)) { 1621 struct page *head = compound_head(page); 1622 pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; 1623 if (compound_order(head) > PFN_SECTION_SHIFT) { 1624 ret = -EBUSY; 1625 break; 1626 } 1627 if (isolate_huge_page(page, &source)) 1628 move_pages -= 1 << compound_order(head); 1629 continue; 1630 } 1631 1632 if (!get_page_unless_zero(page)) 1633 continue; 1634 /* 1635 * We can skip free pages. And we can deal with pages on 1636 * LRU and non-lru movable pages. 1637 */ 1638 if (PageLRU(page)) 1639 ret = isolate_lru_page(page); 1640 else 1641 ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); 1642 if (!ret) { /* Success */ 1643 put_page(page); 1644 list_add_tail(&page->lru, &source); 1645 move_pages--; 1646 if (!__PageMovable(page)) 1647 inc_node_page_state(page, NR_ISOLATED_ANON + 1648 page_is_file_cache(page)); 1649 1650 } else { 1651 #ifdef CONFIG_DEBUG_VM 1652 pr_alert("failed to isolate pfn %lx\n", pfn); 1653 dump_page(page, "isolation failed"); 1654 #endif 1655 put_page(page); 1656 /* Because we don't have big zone->lock. we should 1657 check this again here. */ 1658 if (page_count(page)) { 1659 not_managed++; 1660 ret = -EBUSY; 1661 break; 1662 } 1663 } 1664 } 1665 if (!list_empty(&source)) { 1666 if (not_managed) { 1667 putback_movable_pages(&source); 1668 goto out; 1669 } 1670 1671 /* Allocate a new page from the nearest neighbor node */ 1672 ret = migrate_pages(&source, new_node_page, NULL, 0, 1673 MIGRATE_SYNC, MR_MEMORY_HOTPLUG); 1674 if (ret) 1675 putback_movable_pages(&source); 1676 } 1677 out: 1678 return ret; 1679 } 1680 1681 /* 1682 * remove from free_area[] and mark all as Reserved. 1683 */ 1684 static int 1685 offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, 1686 void *data) 1687 { 1688 __offline_isolated_pages(start, start + nr_pages); 1689 return 0; 1690 } 1691 1692 static void 1693 offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 1694 { 1695 walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, 1696 offline_isolated_pages_cb); 1697 } 1698 1699 /* 1700 * Check all pages in range, recoreded as memory resource, are isolated. 
1701 */ 1702 static int 1703 check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, 1704 void *data) 1705 { 1706 int ret; 1707 long offlined = *(long *)data; 1708 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); 1709 offlined = nr_pages; 1710 if (!ret) 1711 *(long *)data += offlined; 1712 return ret; 1713 } 1714 1715 static long 1716 check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 1717 { 1718 long offlined = 0; 1719 int ret; 1720 1721 ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, 1722 check_pages_isolated_cb); 1723 if (ret < 0) 1724 offlined = (long)ret; 1725 return offlined; 1726 } 1727 1728 #ifdef CONFIG_MOVABLE_NODE 1729 /* 1730 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have 1731 * normal memory. 1732 */ 1733 static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) 1734 { 1735 return true; 1736 } 1737 #else /* CONFIG_MOVABLE_NODE */ 1738 /* ensure the node has NORMAL memory if it is still online */ 1739 static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) 1740 { 1741 struct pglist_data *pgdat = zone->zone_pgdat; 1742 unsigned long present_pages = 0; 1743 enum zone_type zt; 1744 1745 for (zt = 0; zt <= ZONE_NORMAL; zt++) 1746 present_pages += pgdat->node_zones[zt].present_pages; 1747 1748 if (present_pages > nr_pages) 1749 return true; 1750 1751 present_pages = 0; 1752 for (; zt <= ZONE_MOVABLE; zt++) 1753 present_pages += pgdat->node_zones[zt].present_pages; 1754 1755 /* 1756 * we can't offline the last normal memory until all 1757 * higher memory is offlined. 1758 */ 1759 return present_pages == 0; 1760 } 1761 #endif /* CONFIG_MOVABLE_NODE */ 1762 1763 static int __init cmdline_parse_movable_node(char *p) 1764 { 1765 #ifdef CONFIG_MOVABLE_NODE 1766 movable_node_enabled = true; 1767 #else 1768 pr_warn("movable_node option not supported\n"); 1769 #endif 1770 return 0; 1771 } 1772 early_param("movable_node", cmdline_parse_movable_node); 1773 1774 /* check which state of node_states will be changed when offline memory */ 1775 static void node_states_check_changes_offline(unsigned long nr_pages, 1776 struct zone *zone, struct memory_notify *arg) 1777 { 1778 struct pglist_data *pgdat = zone->zone_pgdat; 1779 unsigned long present_pages = 0; 1780 enum zone_type zt, zone_last = ZONE_NORMAL; 1781 1782 /* 1783 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] 1784 * contains nodes which have zones of 0...ZONE_NORMAL, 1785 * set zone_last to ZONE_NORMAL. 1786 * 1787 * If we don't have HIGHMEM nor movable node, 1788 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of 1789 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. 1790 */ 1791 if (N_MEMORY == N_NORMAL_MEMORY) 1792 zone_last = ZONE_MOVABLE; 1793 1794 /* 1795 * check whether node_states[N_NORMAL_MEMORY] will be changed. 1796 * If the memory to be offline is in a zone of 0...zone_last, 1797 * and it is the last present memory, 0...zone_last will 1798 * become empty after offline , thus we can determind we will 1799 * need to clear the node from node_states[N_NORMAL_MEMORY]. 
 */
	for (zt = 0; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_normal = zone_to_nid(zone);
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_high = zone_to_nid(zone);
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
	 */
	zone_last = ZONE_MOVABLE;

	/*
	 * Check whether node_states[N_HIGH_MEMORY] will be changed.
	 * If we try to offline the last present @nr_pages from the node,
	 * we can determine that we will need to clear the node from
	 * node_states[N_HIGH_MEMORY].
	 */
	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (nr_pages >= present_pages)
		arg->status_change_nid = zone_to_nid(zone);
	else
		arg->status_change_nid = -1;
}

static void node_states_clear_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_clear_state(node, N_NORMAL_MEMORY);

	if ((N_MEMORY != N_NORMAL_MEMORY) &&
	    (arg->status_change_nid_high >= 0))
		node_clear_state(node, N_HIGH_MEMORY);

	if ((N_MEMORY != N_HIGH_MEMORY) &&
	    (arg->status_change_nid >= 0))
		node_clear_state(node, N_MEMORY);
}

static int __ref __offline_pages(unsigned long start_pfn,
		  unsigned long end_pfn, unsigned long timeout)
{
	unsigned long pfn, nr_pages, expire;
	long offlined_pages;
	int ret, drain, retry_max, node;
	unsigned long flags;
	unsigned long valid_start, valid_end;
	struct zone *zone;
	struct memory_notify arg;

	/* at least, alignment against pageblock is necessary */
	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
		return -EINVAL;
	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
		return -EINVAL;
	/* This makes hotplug much easier...and readable.
	   we assume this for now.
.*/ 1883 if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) 1884 return -EINVAL; 1885 1886 zone = page_zone(pfn_to_page(valid_start)); 1887 node = zone_to_nid(zone); 1888 nr_pages = end_pfn - start_pfn; 1889 1890 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) 1891 return -EINVAL; 1892 1893 /* set above range as isolated */ 1894 ret = start_isolate_page_range(start_pfn, end_pfn, 1895 MIGRATE_MOVABLE, true); 1896 if (ret) 1897 return ret; 1898 1899 arg.start_pfn = start_pfn; 1900 arg.nr_pages = nr_pages; 1901 node_states_check_changes_offline(nr_pages, zone, &arg); 1902 1903 ret = memory_notify(MEM_GOING_OFFLINE, &arg); 1904 ret = notifier_to_errno(ret); 1905 if (ret) 1906 goto failed_removal; 1907 1908 pfn = start_pfn; 1909 expire = jiffies + timeout; 1910 drain = 0; 1911 retry_max = 5; 1912 repeat: 1913 /* start memory hot removal */ 1914 ret = -EAGAIN; 1915 if (time_after(jiffies, expire)) 1916 goto failed_removal; 1917 ret = -EINTR; 1918 if (signal_pending(current)) 1919 goto failed_removal; 1920 ret = 0; 1921 if (drain) { 1922 lru_add_drain_all(); 1923 cond_resched(); 1924 drain_all_pages(zone); 1925 } 1926 1927 pfn = scan_movable_pages(start_pfn, end_pfn); 1928 if (pfn) { /* We have movable pages */ 1929 ret = do_migrate_range(pfn, end_pfn); 1930 if (!ret) { 1931 drain = 1; 1932 goto repeat; 1933 } else { 1934 if (ret < 0) 1935 if (--retry_max == 0) 1936 goto failed_removal; 1937 yield(); 1938 drain = 1; 1939 goto repeat; 1940 } 1941 } 1942 /* drain all zone's lru pagevec, this is asynchronous... */ 1943 lru_add_drain_all(); 1944 yield(); 1945 /* drain pcp pages, this is synchronous. */ 1946 drain_all_pages(zone); 1947 /* 1948 * dissolve free hugepages in the memory block before doing offlining 1949 * actually in order to make hugetlbfs's object counting consistent. 1950 */ 1951 ret = dissolve_free_huge_pages(start_pfn, end_pfn); 1952 if (ret) 1953 goto failed_removal; 1954 /* check again */ 1955 offlined_pages = check_pages_isolated(start_pfn, end_pfn); 1956 if (offlined_pages < 0) { 1957 ret = -EBUSY; 1958 goto failed_removal; 1959 } 1960 pr_info("Offlined Pages %ld\n", offlined_pages); 1961 /* Ok, all of our target is isolated. 1962 We cannot do rollback at this point. 
*/ 1963 offline_isolated_pages(start_pfn, end_pfn); 1964 /* reset pagetype flags and makes migrate type to be MOVABLE */ 1965 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1966 /* removal success */ 1967 adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); 1968 zone->present_pages -= offlined_pages; 1969 1970 pgdat_resize_lock(zone->zone_pgdat, &flags); 1971 zone->zone_pgdat->node_present_pages -= offlined_pages; 1972 pgdat_resize_unlock(zone->zone_pgdat, &flags); 1973 1974 init_per_zone_wmark_min(); 1975 1976 if (!populated_zone(zone)) { 1977 zone_pcp_reset(zone); 1978 mutex_lock(&zonelists_mutex); 1979 build_all_zonelists(NULL, NULL); 1980 mutex_unlock(&zonelists_mutex); 1981 } else 1982 zone_pcp_update(zone); 1983 1984 node_states_clear_node(node, &arg); 1985 if (arg.status_change_nid >= 0) { 1986 kswapd_stop(node); 1987 kcompactd_stop(node); 1988 } 1989 1990 vm_total_pages = nr_free_pagecache_pages(); 1991 writeback_set_ratelimit(); 1992 1993 memory_notify(MEM_OFFLINE, &arg); 1994 return 0; 1995 1996 failed_removal: 1997 pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n", 1998 (unsigned long long) start_pfn << PAGE_SHIFT, 1999 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); 2000 memory_notify(MEM_CANCEL_OFFLINE, &arg); 2001 /* pushback to free area */ 2002 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 2003 return ret; 2004 } 2005 2006 /* Must be protected by mem_hotplug_begin() */ 2007 int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 2008 { 2009 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 2010 } 2011 #endif /* CONFIG_MEMORY_HOTREMOVE */ 2012 2013 /** 2014 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) 2015 * @start_pfn: start pfn of the memory range 2016 * @end_pfn: end pfn of the memory range 2017 * @arg: argument passed to func 2018 * @func: callback for each memory section walked 2019 * 2020 * This function walks through all present mem sections in range 2021 * [start_pfn, end_pfn) and call func on each mem section. 2022 * 2023 * Returns the return value of func. 2024 */ 2025 int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, 2026 void *arg, int (*func)(struct memory_block *, void *)) 2027 { 2028 struct memory_block *mem = NULL; 2029 struct mem_section *section; 2030 unsigned long pfn, section_nr; 2031 int ret; 2032 2033 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 2034 section_nr = pfn_to_section_nr(pfn); 2035 if (!present_section_nr(section_nr)) 2036 continue; 2037 2038 section = __nr_to_section(section_nr); 2039 /* same memblock? 
*/ 2040 if (mem) 2041 if ((section_nr >= mem->start_section_nr) && 2042 (section_nr <= mem->end_section_nr)) 2043 continue; 2044 2045 mem = find_memory_block_hinted(section, mem); 2046 if (!mem) 2047 continue; 2048 2049 ret = func(mem, arg); 2050 if (ret) { 2051 kobject_put(&mem->dev.kobj); 2052 return ret; 2053 } 2054 } 2055 2056 if (mem) 2057 kobject_put(&mem->dev.kobj); 2058 2059 return 0; 2060 } 2061 2062 #ifdef CONFIG_MEMORY_HOTREMOVE 2063 static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) 2064 { 2065 int ret = !is_memblock_offlined(mem); 2066 2067 if (unlikely(ret)) { 2068 phys_addr_t beginpa, endpa; 2069 2070 beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); 2071 endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; 2072 pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", 2073 &beginpa, &endpa); 2074 } 2075 2076 return ret; 2077 } 2078 2079 static int check_cpu_on_node(pg_data_t *pgdat) 2080 { 2081 int cpu; 2082 2083 for_each_present_cpu(cpu) { 2084 if (cpu_to_node(cpu) == pgdat->node_id) 2085 /* 2086 * the cpu on this node isn't removed, and we can't 2087 * offline this node. 2088 */ 2089 return -EBUSY; 2090 } 2091 2092 return 0; 2093 } 2094 2095 static void unmap_cpu_on_node(pg_data_t *pgdat) 2096 { 2097 #ifdef CONFIG_ACPI_NUMA 2098 int cpu; 2099 2100 for_each_possible_cpu(cpu) 2101 if (cpu_to_node(cpu) == pgdat->node_id) 2102 numa_clear_node(cpu); 2103 #endif 2104 } 2105 2106 static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) 2107 { 2108 int ret; 2109 2110 ret = check_cpu_on_node(pgdat); 2111 if (ret) 2112 return ret; 2113 2114 /* 2115 * the node will be offlined when we come here, so we can clear 2116 * the cpu_to_node() now. 2117 */ 2118 2119 unmap_cpu_on_node(pgdat); 2120 return 0; 2121 } 2122 2123 /** 2124 * try_offline_node 2125 * 2126 * Offline a node if all memory sections and cpus of the node are removed. 2127 * 2128 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 2129 * and online/offline operations before this call. 2130 */ 2131 void try_offline_node(int nid) 2132 { 2133 pg_data_t *pgdat = NODE_DATA(nid); 2134 unsigned long start_pfn = pgdat->node_start_pfn; 2135 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; 2136 unsigned long pfn; 2137 2138 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 2139 unsigned long section_nr = pfn_to_section_nr(pfn); 2140 2141 if (!present_section_nr(section_nr)) 2142 continue; 2143 2144 if (pfn_to_nid(pfn) != nid) 2145 continue; 2146 2147 /* 2148 * some memory sections of this node are not removed, and we 2149 * can't offline node now. 2150 */ 2151 return; 2152 } 2153 2154 if (check_and_unmap_cpu_on_node(pgdat)) 2155 return; 2156 2157 /* 2158 * all memory/cpu of this node are removed, we can offline this 2159 * node now. 2160 */ 2161 node_set_offline(nid); 2162 unregister_one_node(nid); 2163 } 2164 EXPORT_SYMBOL(try_offline_node); 2165 2166 /** 2167 * remove_memory 2168 * 2169 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 2170 * and online/offline operations before this call, as required by 2171 * try_offline_node(). 2172 */ 2173 void __ref remove_memory(int nid, u64 start, u64 size) 2174 { 2175 int ret; 2176 2177 BUG_ON(check_hotplug_memory_range(start, size)); 2178 2179 mem_hotplug_begin(); 2180 2181 /* 2182 * All memory blocks must be offlined before removing memory. Check 2183 * whether all memory blocks in question are offline and trigger a BUG() 2184 * if this is not the case. 
 */
	ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
				check_memblock_offlined_cb);
	if (ret)
		BUG();

	/* remove memmap entry */
	firmware_map_remove(start, start + size, "System RAM");
	memblock_free(start, size);
	memblock_remove(start, size);

	arch_remove_memory(start, size);

	try_offline_node(nid);

	mem_hotplug_done();
}
EXPORT_SYMBOL_GPL(remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */
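
/*
 * Illustrative sketch, not part of the original file: how a hypothetical
 * platform driver might hot-add a memory range and later tear it down.
 * The example_* name is made up; a real caller gets nid/start/size from
 * firmware.  Both calls assume the caller serializes with
 * lock_device_hotplug(), as the comments above require, and remove_memory()
 * additionally requires every memory block in the range to be offline.
 */
static int __maybe_unused example_hotplug_range(int nid, u64 start, u64 size)
{
	int ret;

	lock_device_hotplug();
	/* creates the "System RAM" resource, sections and memory block devices */
	ret = add_memory(nid, start, size);
	unlock_device_hotplug();
	if (ret)
		return ret;

	/* ... the range is now visible; blocks online per memhp_auto_online ... */

#ifdef CONFIG_MEMORY_HOTREMOVE
	lock_device_hotplug();
	remove_memory(nid, start, size);
	unlock_device_hotplug();
#endif
	return 0;
}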
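
/*
 * Illustrative sketch, not part of the original file: offlining a
 * pageblock-aligned range by hand.  Per the comment above offline_pages(),
 * the call must run under mem_hotplug_begin()/mem_hotplug_done(); the
 * example_* name is made up.
 */
#ifdef CONFIG_MEMORY_HOTREMOVE
static int __maybe_unused example_offline_range(unsigned long start_pfn,
						unsigned long nr_pages)
{
	int ret;

	mem_hotplug_begin();
	ret = offline_pages(start_pfn, nr_pages);
	mem_hotplug_done();

	return ret;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */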