/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>

#include <asm/tlbflush.h>

#include "internal.h"

/*
 * online_page_callback contains a pointer to the current page onlining
 * function. Initially it is generic_online_page(). If required, it can be
 * changed by calling set_online_page_callback() for callback registration
 * and restore_online_page_callback() to restore the generic callback.
 */

static void generic_online_page(struct page *page);

static online_page_callback_t online_page_callback = generic_online_page;

DEFINE_MUTEX(mem_hotplug_mutex);

void lock_memory_hotplug(void)
{
	mutex_lock(&mem_hotplug_mutex);
}

void unlock_memory_hotplug(void)
{
	mutex_unlock(&mem_hotplug_mutex);
}


/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;

	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
	BUG_ON(!res);

	res->name = "System RAM";
	res->start = start;
	res->end = start + size - 1;
	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
	if (request_resource(&iomem_resource, res) < 0) {
		pr_debug("System RAM resource %pR cannot be added\n", res);
		kfree(res);
		res = NULL;
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
	return;
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
void get_page_bootmem(unsigned long info, struct page *page,
		      unsigned long type)
{
	page->lru.next = (struct list_head *) type;
	SetPagePrivate(page);
	set_page_private(page, info);
	atomic_inc(&page->_count);
}

void put_page_bootmem(struct page *page)
{
	unsigned long type;

	type = (unsigned long) page->lru.next;
	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);

	if (atomic_dec_return(&page->_count) == 1) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		INIT_LIST_HEAD(&page->lru);
		free_reserved_page(page);
	}
}

#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	/* Get section's memmap address */
	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	/*
	 * Get page for the memmap's phys address
	 * XXX: need more consideration for sparse_vmemmap...
	 */
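	/*
	 * Worked example (illustrative figures only, not from this file):
	 * with 4 KiB pages and 128 MiB sections, PAGES_PER_SECTION is 32768;
	 * at 64 bytes per struct page the section's memmap occupies 2 MiB,
	 * so mapsize below works out to 512 pages, each of which gets its
	 * refcount raised via get_page_bootmem().
	 */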
	page = virt_to_page(memmap);
	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

	/* remember memmap's page */
	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, SECTION_INFO);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);

}
#else /* CONFIG_SPARSEMEM_VMEMMAP */
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	if (!pfn_valid(start_pfn))
		return;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

void register_page_bootmem_info_node(struct pglist_data *pgdat)
{
	unsigned long i, pfn, end_pfn, nr_pages;
	int node = pgdat->node_id;
	struct page *page;
	struct zone *zone;

	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
	page = virt_to_page(pgdat);

	for (i = 0; i < nr_pages; i++, page++)
		get_page_bootmem(node, page, NODE_INFO);

	zone = &pgdat->node_zones[0];
	for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
		if (zone_is_initialized(zone)) {
			nr_pages = zone->wait_table_hash_nr_entries
				* sizeof(wait_queue_head_t);
			nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
			page = virt_to_page(zone->wait_table);

			for (i = 0; i < nr_pages; i++, page++)
				get_page_bootmem(node, page, NODE_INFO);
		}
	}

	pfn = pgdat->node_start_pfn;
	end_pfn = pgdat_end_pfn(pgdat);

	/* register section info */
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		/*
		 * Some platforms can assign the same pfn to multiple nodes -
		 * on node0 as well as nodeN.  To avoid registering a pfn
		 * against multiple nodes we check that this pfn does not
		 * already reside in some other nodes.
		 */
		if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
			register_page_bootmem_info_section(pfn);
	}
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */

static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
			   unsigned long end_pfn)
{
	unsigned long old_zone_end_pfn;

	zone_span_writelock(zone);

	old_zone_end_pfn = zone_end_pfn(zone);
	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
				zone->zone_start_pfn;

	zone_span_writeunlock(zone);
}

static void resize_zone(struct zone *zone, unsigned long start_pfn,
		unsigned long end_pfn)
{
	zone_span_writelock(zone);

	if (end_pfn - start_pfn) {
		zone->zone_start_pfn = start_pfn;
		zone->spanned_pages = end_pfn - start_pfn;
	} else {
		/*
		 * Make it consistent with free_area_init_core():
		 * if spanned_pages == 0, then keep start_pfn == 0.
		 */
		zone->zone_start_pfn = 0;
		zone->spanned_pages = 0;
	}

	zone_span_writeunlock(zone);
}

static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
			unsigned long end_pfn)
{
	enum zone_type zid = zone_idx(zone);
	int nid = zone->zone_pgdat->node_id;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn++)
		set_page_links(pfn_to_page(pfn), zid, nid, pfn);
}

/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
 * alloc_bootmem_node_nopanic() */
static int __ref ensure_zone_is_initialized(struct zone *zone,
			unsigned long start_pfn, unsigned long num_pages)
{
	if (!zone_is_initialized(zone))
		return init_currently_empty_zone(zone, start_pfn, num_pages,
						 MEMMAP_HOTPLUG);
	return 0;
}

static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z1_start_pfn;

	ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are higher than @z2 */
	if (end_pfn > zone_end_pfn(z2))
		goto out_fail;
	/* the moved-out part must be at the leftmost of @z2 */
	if (start_pfn > z2->zone_start_pfn)
		goto out_fail;
	/* must include/overlap */
	if (end_pfn <= z2->zone_start_pfn)
		goto out_fail;

	/* use start_pfn for z1's start_pfn if z1 is empty */
	if (!zone_is_empty(z1))
		z1_start_pfn = z1->zone_start_pfn;
	else
		z1_start_pfn = start_pfn;

	resize_zone(z1, z1_start_pfn, end_pfn);
	resize_zone(z2, end_pfn, zone_end_pfn(z2));

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z1, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z2_end_pfn;

	ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are lower than @z1 */
	if (z1->zone_start_pfn > start_pfn)
		goto out_fail;
	/* the moved-out part must be at the rightmost of @z1 */
	if (zone_end_pfn(z1) > end_pfn)
		goto out_fail;
	/* must include/overlap */
	if (start_pfn >= zone_end_pfn(z1))
		goto out_fail;

	/* use end_pfn for z2's end_pfn if z2 is empty */
	if (!zone_is_empty(z2))
		z2_end_pfn = zone_end_pfn(z2);
	else
		z2_end_pfn = end_pfn;

	resize_zone(z1, z1->zone_start_pfn, start_pfn);
	resize_zone(z2, start_pfn, z2_end_pfn);

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z2, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
			    unsigned long end_pfn)
{
	unsigned long old_pgdat_end_pfn =
		pgdat->node_start_pfn + pgdat->node_spanned_pages;

	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
					pgdat->node_start_pfn;
}

static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int nid = pgdat->node_id;
	int zone_type;
	unsigned long flags;
	int ret;

	zone_type = zone - pgdat->node_zones;
	ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
	if (ret)
		return ret;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
			phys_start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
	memmap_init_zone(nr_pages, nid, zone_type,
			 phys_start_pfn, MEMMAP_HOTPLUG);
	return 0;
}

static int __meminit __add_section(int nid, struct zone *zone,
					unsigned long phys_start_pfn)
{
	int nr_pages = PAGES_PER_SECTION;
	int ret;

	if (pfn_valid(phys_start_pfn))
		return -EEXIST;

	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);

	if (ret < 0)
		return ret;

	ret = __add_zone(zone, phys_start_pfn);

	if (ret < 0)
		return ret;

	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}
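
/*
 * Illustrative sketch only (not part of this file): an architecture's
 * arch_add_memory() typically maps the new range and then hands the pfn
 * range to __add_pages(), roughly along these lines:
 *
 *	int arch_add_memory(int nid, u64 start, u64 size)
 *	{
 *		struct pglist_data *pgdat = NODE_DATA(nid);
 *		struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
 *		unsigned long start_pfn = start >> PAGE_SHIFT;
 *		unsigned long nr_pages = size >> PAGE_SHIFT;
 *
 *		// arch-specific: set up the kernel mapping for the range
 *		return __add_pages(nid, zone, start_pfn, nr_pages);
 *	}
 *
 * The zone choice (ZONE_NORMAL above) and the mapping step are assumptions
 * made for the example; real implementations differ per architecture.
 */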

/*
 * Reasonably generic function for adding memory.  It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
			unsigned long nr_pages)
{
	unsigned long i;
	int err = 0;
	int start_sec, end_sec;

	/* while initializing the mem_map, align the hot-added range to sections */
	start_sec = pfn_to_section_nr(phys_start_pfn);
	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

	for (i = start_sec; i <= end_sec; i++) {
		err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);

		/*
		 * -EEXIST is finally dealt with by the ioresource collision
		 * check, see add_memory() => register_memory_resource().
		 * A warning will be printed if there is a collision.
		 */
		if (err && (err != -EEXIST))
			break;
		err = 0;
	}

	return err;
}
EXPORT_SYMBOL_GPL(__add_pages);

#ifdef CONFIG_MEMORY_HOTREMOVE
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static int find_smallest_section_pfn(int nid, struct zone *zone,
				     unsigned long start_pfn,
				     unsigned long end_pfn)
{
	struct mem_section *ms;

	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(start_pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(start_pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
			continue;

		return start_pfn;
	}

	return 0;
}

/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
static int find_biggest_section_pfn(int nid, struct zone *zone,
				    unsigned long start_pfn,
				    unsigned long end_pfn)
{
	struct mem_section *ms;
	unsigned long pfn;

	/* pfn is the end pfn of a memory section. */
	pfn = end_pfn - 1;
	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(pfn)))
			continue;

		return pfn;
	}

	return 0;
}

static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
			     unsigned long end_pfn)
{
	unsigned long zone_start_pfn = zone->zone_start_pfn;
	unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
	unsigned long zone_end_pfn = z;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = zone_to_nid(zone);

	zone_span_writelock(zone);
	if (zone_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the zone, we need
		 * to shrink zone->zone_start_pfn and zone->spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
						zone_end_pfn);
		if (pfn) {
			zone->zone_start_pfn = pfn;
			zone->spanned_pages = zone_end_pfn - pfn;
		}
	} else if (zone_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the zone, we need
		 * to shrink zone->spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
					       start_pfn);
		if (pfn)
			zone->spanned_pages = pfn - zone_start_pfn + 1;
	}

	/*
	 * If the section is not the biggest or smallest mem_section in the
	 * zone, it only creates a hole in the zone.  So in this case we need
	 * not change the zone.  But perhaps the zone now contains only holes,
	 * so check whether the zone has anything other than holes left.
	 */
	pfn = zone_start_pfn;
	for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (page_zone(pfn_to_page(pfn)) != zone)
			continue;

		/* If the section is the current section, continue the loop */
		if (start_pfn == pfn)
			continue;

		/* If we find a valid section, we have nothing to do */
		zone_span_writeunlock(zone);
		return;
	}

	/* The zone has no valid section */
	zone->zone_start_pfn = 0;
	zone->spanned_pages = 0;
	zone_span_writeunlock(zone);
}

static void shrink_pgdat_span(struct pglist_data *pgdat,
			      unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
	unsigned long pgdat_end_pfn =
		pgdat->node_start_pfn + pgdat->node_spanned_pages;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = pgdat->node_id;

	if (pgdat_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the pgdat, we need
		 * to shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the pgdat.
		 */
		pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
						pgdat_end_pfn);
		if (pfn) {
			pgdat->node_start_pfn = pfn;
			pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
		}
	} else if (pgdat_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the pgdat, we need
		 * to shrink pgdat->node_spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the pgdat.
		 */
		pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
					       start_pfn);
		if (pfn)
			pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
	}

	/*
	 * If the section is not the biggest or smallest mem_section in the
	 * pgdat, it only creates a hole in the pgdat.  So in this case we
	 * need not change the pgdat.  But perhaps the pgdat now contains
	 * only holes, so check whether the pgdat has anything other than
	 * holes left.
	 */
	pfn = pgdat_start_pfn;
	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/* If the section is the current section, continue the loop */
		if (start_pfn == pfn)
			continue;

		/* If we find a valid section, we have nothing to do */
		return;
	}

	/* The pgdat has no valid section */
	pgdat->node_start_pfn = 0;
	pgdat->node_spanned_pages = 0;
}

static void __remove_zone(struct zone *zone, unsigned long start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int zone_type;
	unsigned long flags;

	zone_type = zone - pgdat->node_zones;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
}

static int __remove_section(struct zone *zone, struct mem_section *ms)
{
	unsigned long start_pfn;
	int scn_nr;
	int ret = -EINVAL;

	if (!valid_section(ms))
		return ret;

	ret = unregister_memory_section(ms);
	if (ret)
		return ret;

	scn_nr = __section_nr(ms);
	start_pfn = section_nr_to_pfn(scn_nr);
	__remove_zone(zone, start_pfn);

	sparse_remove_one_section(zone, ms);
	return 0;
}
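
/*
 * Illustrative sketch only (not part of this file): the arch side of
 * removal usually derives the zone from the first pfn and forwards the
 * range to __remove_pages(), roughly:
 *
 *	int arch_remove_memory(u64 start, u64 size)
 *	{
 *		unsigned long start_pfn = start >> PAGE_SHIFT;
 *		unsigned long nr_pages = size >> PAGE_SHIFT;
 *		struct zone *zone = page_zone(pfn_to_page(start_pfn));
 *
 *		// arch-specific: tear down the kernel mapping for the range
 *		return __remove_pages(zone, start_pfn, nr_pages);
 *	}
 *
 * The ordering of the teardown and the exact return handling are
 * assumptions for the example and differ per architecture.
 */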

/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
		 unsigned long nr_pages)
{
	unsigned long i;
	int sections_to_remove;
	resource_size_t start, size;
	int ret = 0;

	/*
	 * We can only remove entire sections
	 */
	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
	BUG_ON(nr_pages % PAGES_PER_SECTION);

	start = phys_start_pfn << PAGE_SHIFT;
	size = nr_pages * PAGE_SIZE;
	ret = release_mem_region_adjustable(&iomem_resource, start, size);
	if (ret) {
		resource_size_t endres = start + size - 1;

		pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
			&start, &endres, ret);
	}

	sections_to_remove = nr_pages / PAGES_PER_SECTION;
	for (i = 0; i < sections_to_remove; i++) {
		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
		ret = __remove_section(zone, __pfn_to_section(pfn));
		if (ret)
			break;
	}
	return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);
#endif /* CONFIG_MEMORY_HOTREMOVE */

int set_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	lock_memory_hotplug();

	if (online_page_callback == generic_online_page) {
		online_page_callback = callback;
		rc = 0;
	}

	unlock_memory_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(set_online_page_callback);

int restore_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	lock_memory_hotplug();

	if (online_page_callback == callback) {
		online_page_callback = generic_online_page;
		rc = 0;
	}

	unlock_memory_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);

void __online_page_set_limits(struct page *page)
{
}
EXPORT_SYMBOL_GPL(__online_page_set_limits);

void __online_page_increment_counters(struct page *page)
{
	adjust_managed_page_count(page, 1);
}
EXPORT_SYMBOL_GPL(__online_page_increment_counters);

void __online_page_free(struct page *page)
{
	__free_reserved_page(page);
}
EXPORT_SYMBOL_GPL(__online_page_free);

static void generic_online_page(struct page *page)
{
	__online_page_set_limits(page);
	__online_page_increment_counters(page);
	__online_page_free(page);
}
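
/*
 * Illustrative sketch only: a driver (a memory balloon, for instance) can
 * intercept pages as they come online by swapping in its own callback
 * instead of generic_online_page():
 *
 *	static void my_online_page(struct page *page)	// hypothetical name
 *	{
 *		__online_page_set_limits(page);
 *		// keep the page for the driver instead of freeing it
 *	}
 *
 *	...
 *	set_online_page_callback(&my_online_page);
 *	...
 *	restore_online_page_callback(&my_online_page);
 *
 * The Xen balloon driver uses this mechanism; my_online_page() above is a
 * hypothetical callback shown only for illustration.
 */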

static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			void *arg)
{
	unsigned long i;
	unsigned long onlined_pages = *(unsigned long *)arg;
	struct page *page;
	if (PageReserved(pfn_to_page(start_pfn)))
		for (i = 0; i < nr_pages; i++) {
			page = pfn_to_page(start_pfn + i);
			(*online_page_callback)(page);
			onlined_pages++;
		}
	*(unsigned long *)arg = onlined_pages;
	return 0;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
 * normal memory.
 */
static bool can_online_high_movable(struct zone *zone)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure every online node has NORMAL memory */
static bool can_online_high_movable(struct zone *zone)
{
	return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which state of node_states will be changed when online memory */
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);
	enum zone_type zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * If the memory to be onlined is in a zone of 0...zone_last, and
	 * the zones of 0...zone_last don't have memory before onlining, we
	 * will need to set the node in node_states[N_NORMAL_MEMORY] after
	 * the memory is onlined.
	 */
	if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * If the node doesn't have memory before onlining, we will need to
	 * set the node in node_states[N_MEMORY] after the memory
	 * is onlined.
	 */
	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
	else
		arg->status_change_nid = -1;
}

static void node_states_set_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_set_state(node, N_HIGH_MEMORY);

	node_set_state(node, N_MEMORY);
}

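/*
 * online_pages() is normally reached via the memory sysfs interface, e.g.
 * (illustration only, block number made up):
 *
 *	# echo online_movable > /sys/devices/system/memory/memory32/state
 *
 * which ends up calling online_pages() for that block's pfn range with
 * ONLINE_MOVABLE; "online" and "online_kernel" select the other types.
 */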

int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
	unsigned long flags;
	unsigned long onlined_pages = 0;
	struct zone *zone;
	int need_zonelists_rebuild = 0;
	int nid;
	int ret;
	struct memory_notify arg;

	lock_memory_hotplug();
	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
	 * memory_block->state_mutex.
	 */
	zone = page_zone(pfn_to_page(pfn));

	if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
	    !can_online_high_movable(zone)) {
		unlock_memory_hotplug();
		return -EINVAL;
	}

	if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
		if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
			unlock_memory_hotplug();
			return -EINVAL;
		}
	}
	if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
		if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
			unlock_memory_hotplug();
			return -EINVAL;
		}
	}

	/* The code above may have changed the zone of the pfn range */
	zone = page_zone(pfn_to_page(pfn));

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_online(nr_pages, zone, &arg);

	nid = page_to_nid(pfn_to_page(pfn));

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret) {
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		unlock_memory_hotplug();
		return ret;
	}
	/*
	 * If this zone is not populated, then it is not in the zonelist.
	 * This means the page allocator ignores this zone.
	 * So, the zonelist must be updated after onlining.
	 */
	mutex_lock(&zonelists_mutex);
	if (!populated_zone(zone)) {
		need_zonelists_rebuild = 1;
		build_all_zonelists(NULL, zone);
	}

	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
		online_pages_range);
	if (ret) {
		if (need_zonelists_rebuild)
			zone_pcp_reset(zone);
		mutex_unlock(&zonelists_mutex);
		printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
		       (unsigned long long) pfn << PAGE_SHIFT,
		       (((unsigned long long) pfn + nr_pages)
			    << PAGE_SHIFT) - 1);
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		unlock_memory_hotplug();
		return ret;
	}

	zone->present_pages += onlined_pages;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	zone->zone_pgdat->node_present_pages += onlined_pages;
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	if (onlined_pages) {
		node_states_set_node(zone_to_nid(zone), &arg);
		if (need_zonelists_rebuild)
			build_all_zonelists(NULL, NULL);
		else
			zone_pcp_update(zone);
	}

	mutex_unlock(&zonelists_mutex);

	init_per_zone_wmark_min();

	if (onlined_pages)
		kswapd_run(zone_to_nid(zone));

	vm_total_pages = nr_free_pagecache_pages();

	writeback_set_ratelimit();

	if (onlined_pages)
		memory_notify(MEM_ONLINE, &arg);
	unlock_memory_hotplug();

	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
	struct pglist_data *pgdat;
	unsigned long zones_size[MAX_NR_ZONES] = {0};
	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	unsigned long start_pfn = start >> PAGE_SHIFT;

	pgdat = NODE_DATA(nid);
	if (!pgdat) {
		pgdat = arch_alloc_nodedata(nid);
		if (!pgdat)
			return NULL;

		arch_refresh_nodedata(nid, pgdat);
	}

	/* we can use NODE_DATA(nid) from here */

	/* init node's zones as empty zones, we don't have any present pages.*/
	free_area_init_node(nid, zones_size, start_pfn, zholes_size);

	/*
	 * The node we allocated has no zone fallback lists. To avoid
	 * accessing a not-initialized zonelist, build the zonelists here.
	 */
	mutex_lock(&zonelists_mutex);
	build_all_zonelists(pgdat, NULL);
	mutex_unlock(&zonelists_mutex);

	return pgdat;
}

static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
	arch_refresh_nodedata(nid, NULL);
	arch_free_nodedata(pgdat);
	return;
}


/*
 * called by cpu_up() to online a node without onlined memory.
 */
int mem_online_node(int nid)
{
	pg_data_t *pgdat;
	int ret;

	lock_memory_hotplug();
	pgdat = hotadd_new_pgdat(nid, 0);
	if (!pgdat) {
		ret = -ENOMEM;
		goto out;
	}
	node_set_online(nid);
	ret = register_one_node(nid);
	BUG_ON(ret);

out:
	unlock_memory_hotplug();
	return ret;
}

static int check_hotplug_memory_range(u64 start, u64 size)
{
	u64 start_pfn = start >> PAGE_SHIFT;
	u64 nr_pages = size >> PAGE_SHIFT;

	/* Memory range must be aligned with section */
	if ((start_pfn & ~PAGE_SECTION_MASK) ||
	    (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) {
		pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n",
				(unsigned long long)start,
				(unsigned long long)size);
		return -EINVAL;
	}

	return 0;
}

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
int __ref add_memory(int nid, u64 start, u64 size)
{
	pg_data_t *pgdat = NULL;
	bool new_pgdat;
	bool new_node;
	struct resource *res;
	int ret;

	ret = check_hotplug_memory_range(start, size);
	if (ret)
		return ret;

	lock_memory_hotplug();

	res = register_memory_resource(start, size);
	ret = -EEXIST;
	if (!res)
		goto out;

	{	/* Stupid hack to suppress address-never-null warning */
		void *p = NODE_DATA(nid);
		new_pgdat = !p;
	}
	new_node = !node_online(nid);
	if (new_node) {
		pgdat = hotadd_new_pgdat(nid, start);
		ret = -ENOMEM;
		if (!pgdat)
			goto error;
	}

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size);

	if (ret < 0)
		goto error;

	/* we online the node here; we can't roll back from here on */
	node_set_online(nid);

	if (new_node) {
		ret = register_one_node(nid);
		/*
		 * If the sysfs file of the new node can't be created, CPUs on
		 * the node can't be hot-added. There is no rollback way now,
		 * so check it with BUG_ON() to catch it reluctantly..
		 */
		BUG_ON(ret);
	}

	/* create new memmap entry */
	firmware_map_add_hotplug(start, start + size, "System RAM");

	goto out;

error:
	/* rollback pgdat allocation and others */
	if (new_pgdat)
		rollback_node_hotadd(nid, pgdat);
	release_memory_resource(res);

out:
	unlock_memory_hotplug();
	return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
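
/*
 * add_memory() is reached from platform hot-add paths (the ACPI memory
 * hotplug driver, for example) and, on kernels with CONFIG_ARCH_MEMORY_PROBE,
 * from the sysfs probe interface. Illustration only (the address is made up
 * and must be section-aligned):
 *
 *	# echo 0x40000000 > /sys/devices/system/memory/probe
 */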

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
 * set and the size of the free page is given by page_order(). Using this,
 * the function determines if the pageblock contains only free pages.
 * Due to buddy constraints, a free page at least the size of a pageblock will
 * be located at the start of the pageblock.
 */
static inline int pageblock_free(struct page *page)
{
	return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
	/* Ensure the starting page is pageblock-aligned */
	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));

	/* If the entire pageblock is free, move to the end of free page */
	if (pageblock_free(page)) {
		int order;
		/* be careful. we don't have locks, page_order can be changed.*/
		order = page_order(page);
		if ((order < MAX_ORDER) && (order >= pageblock_order))
			return page + (1 << order);
	}

	return page + pageblock_nr_pages;
}

/* Checks if this range of memory is likely to be hot-removable. */
int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
	struct page *page = pfn_to_page(start_pfn);
	struct page *end_page = page + nr_pages;

	/* Check the starting page of each pageblock within the range */
	for (; page < end_page; page = next_active_pageblock(page)) {
		if (!is_pageblock_removable_nolock(page))
			return 0;
		cond_resched();
	}

	/* All pageblocks in the memory block are likely to be hot-removable */
	return 1;
}

/*
 * Confirm that all pages in a range [start, end) belong to the same zone.
 */
static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct zone *zone = NULL;
	struct page *page;
	int i;
	for (pfn = start_pfn;
	     pfn < end_pfn;
	     pfn += MAX_ORDER_NR_PAGES) {
		i = 0;
		/* This is just a CONFIG_HOLES_IN_ZONE check.*/
		while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
			i++;
		if (i == MAX_ORDER_NR_PAGES)
			continue;
		page = pfn_to_page(pfn + i);
		if (zone && page_zone(page) != zone)
			return 0;
		zone = page_zone(page);
	}
	return 1;
}
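
/*
 * Worked example (illustration only) of the hugepage skip arithmetic used
 * by scan_movable_pages() and do_migrate_range() below: for a 2 MiB
 * hugepage (compound order 9, i.e. 512 base pages) starting at pfn 0x1200,
 * round_up(pfn + 1, 1 << 9) - 1 = 0x13ff, so the loop's pfn++ resumes at
 * pfn 0x1400, the first pfn past the hugepage.
 */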

/*
 * Scan the pfn range [start,end) to find movable/migratable pages (LRU pages
 * and hugepages). We scan by pfn because it's much easier than scanning over
 * a linked list. This function returns the pfn of the first found movable
 * page if one is found, otherwise 0.
 */
static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
{
	unsigned long pfn;
	struct page *page;
	for (pfn = start; pfn < end; pfn++) {
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageLRU(page))
				return pfn;
			if (PageHuge(page)) {
				if (is_hugepage_active(page))
					return pfn;
				else
					pfn = round_up(pfn + 1,
						1 << compound_order(page)) - 1;
			}
		}
	}
	return 0;
}

#define NR_OFFLINE_AT_ONCE_PAGES	(256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page;
	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
	int not_managed = 0;
	int ret = 0;
	LIST_HEAD(source);

	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);

		if (PageHuge(page)) {
			struct page *head = compound_head(page);
			pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
			if (compound_order(head) > PFN_SECTION_SHIFT) {
				ret = -EBUSY;
				break;
			}
			if (isolate_huge_page(page, &source))
				move_pages -= 1 << compound_order(head);
			continue;
		}

		if (!get_page_unless_zero(page))
			continue;
		/*
		 * We can skip free pages. And we can only deal with pages on
		 * LRU.
		 */
		ret = isolate_lru_page(page);
		if (!ret) { /* Success */
			put_page(page);
			list_add_tail(&page->lru, &source);
			move_pages--;
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));

		} else {
#ifdef CONFIG_DEBUG_VM
			printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
			       pfn);
			dump_page(page);
#endif
			put_page(page);
			/* Because we don't hold the big zone->lock, we should
			   check this again here. */
			if (page_count(page)) {
				not_managed++;
				ret = -EBUSY;
				break;
			}
		}
	}
	if (!list_empty(&source)) {
		if (not_managed) {
			putback_movable_pages(&source);
			goto out;
		}

		/*
		 * alloc_migrate_target should be improooooved!!
		 * migrate_pages returns # of failed pages.
		 */
		ret = migrate_pages(&source, alloc_migrate_target, 0,
					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
		if (ret)
			putback_movable_pages(&source);
	}
out:
	return ret;
}

/*
 * remove from free_area[] and mark all as Reserved.
 */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
			void *data)
{
	__offline_isolated_pages(start, start + nr_pages);
	return 0;
}

static void
offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
				offline_isolated_pages_cb);
}

/*
 * Check that all pages in the range, recorded as a memory resource, are
 * isolated.
 */
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
			void *data)
{
	int ret;
	long offlined = *(long *)data;
	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
	offlined = nr_pages;
	if (!ret)
		*(long *)data += offlined;
	return ret;
}

static long
check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
	long offlined = 0;
	int ret;

	ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
			check_pages_isolated_cb);
	if (ret < 0)
		offlined = (long)ret;
	return offlined;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
 * normal memory.
 */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure the node has NORMAL memory if it is still online */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt;

	for (zt = 0; zt <= ZONE_NORMAL; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	if (present_pages > nr_pages)
		return true;

	present_pages = 0;
	for (; zt <= ZONE_MOVABLE; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	/*
	 * we can't offline the last normal memory until all
	 * higher memory is offlined.
	 */
	return present_pages == 0;
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which state of node_states will be changed when offline memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
		struct zone *zone, struct memory_notify *arg)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt, zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
	 * If the memory to be offlined is in a zone of 0...zone_last,
	 * and it is the last present memory, 0...zone_last will
	 * become empty after the offline, thus we can determine that we will
	 * need to clear the node from node_states[N_NORMAL_MEMORY].
	 */
	for (zt = 0; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_normal = zone_to_nid(zone);
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_high = zone_to_nid(zone);
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
	 */
	zone_last = ZONE_MOVABLE;

	/*
	 * Check whether node_states[N_HIGH_MEMORY] will be changed.
	 * If we try to offline the last present @nr_pages from the node,
	 * we can determine that we will need to clear the node from
	 * node_states[N_HIGH_MEMORY].
	 */
	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (nr_pages >= present_pages)
		arg->status_change_nid = zone_to_nid(zone);
	else
		arg->status_change_nid = -1;
}

static void node_states_clear_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_clear_state(node, N_NORMAL_MEMORY);

	if ((N_MEMORY != N_NORMAL_MEMORY) &&
	    (arg->status_change_nid_high >= 0))
		node_clear_state(node, N_HIGH_MEMORY);

	if ((N_MEMORY != N_HIGH_MEMORY) &&
	    (arg->status_change_nid >= 0))
		node_clear_state(node, N_MEMORY);
}

static int __ref __offline_pages(unsigned long start_pfn,
		  unsigned long end_pfn, unsigned long timeout)
{
	unsigned long pfn, nr_pages, expire;
	long offlined_pages;
	int ret, drain, retry_max, node;
	unsigned long flags;
	struct zone *zone;
	struct memory_notify arg;

	/* at least, alignment against pageblock is necessary */
	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
		return -EINVAL;
	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
		return -EINVAL;
	/* This makes hotplug much easier...and readable.
	   We assume this for now. */
	if (!test_pages_in_a_zone(start_pfn, end_pfn))
		return -EINVAL;

	lock_memory_hotplug();

	zone = page_zone(pfn_to_page(start_pfn));
	node = zone_to_nid(zone);
	nr_pages = end_pfn - start_pfn;

	ret = -EINVAL;
	if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
		goto out;

	/* set above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn,
				       MIGRATE_MOVABLE, true);
	if (ret)
		goto out;

	arg.start_pfn = start_pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_offline(nr_pages, zone, &arg);

	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_removal;

	pfn = start_pfn;
	expire = jiffies + timeout;
	drain = 0;
	retry_max = 5;
repeat:
	/* start memory hot removal */
	ret = -EAGAIN;
	if (time_after(jiffies, expire))
		goto failed_removal;
	ret = -EINTR;
	if (signal_pending(current))
		goto failed_removal;
	ret = 0;
	if (drain) {
		lru_add_drain_all();
		cond_resched();
		drain_all_pages();
	}

	pfn = scan_movable_pages(start_pfn, end_pfn);
	if (pfn) { /* We have movable pages */
		ret = do_migrate_range(pfn, end_pfn);
		if (!ret) {
			drain = 1;
			goto repeat;
		} else {
			if (ret < 0)
				if (--retry_max == 0)
					goto failed_removal;
			yield();
			drain = 1;
			goto repeat;
		}
	}
	/* drain all zone's lru pagevec, this is asynchronous... */
	lru_add_drain_all();
	yield();
	/* drain pcp pages, this is synchronous. */
	drain_all_pages();
	/*
	 * Dissolve free hugepages in the memory block before actually
	 * offlining, in order to keep hugetlbfs's object counting consistent.
	 */
	dissolve_free_huge_pages(start_pfn, end_pfn);
	/* check again */
	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
	if (offlined_pages < 0) {
		ret = -EBUSY;
		goto failed_removal;
	}
	printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
	/* Ok, all of our target is isolated.
	   We cannot do rollback at this point. */
	offline_isolated_pages(start_pfn, end_pfn);
	/* reset pageblock flags and make the migrate type MOVABLE */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	/* removal success */
	adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
	zone->present_pages -= offlined_pages;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	zone->zone_pgdat->node_present_pages -= offlined_pages;
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	init_per_zone_wmark_min();

	if (!populated_zone(zone)) {
		zone_pcp_reset(zone);
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(NULL, NULL);
		mutex_unlock(&zonelists_mutex);
	} else
		zone_pcp_update(zone);

	node_states_clear_node(node, &arg);
	if (arg.status_change_nid >= 0)
		kswapd_stop(node);

	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();

	memory_notify(MEM_OFFLINE, &arg);
	unlock_memory_hotplug();
	return 0;

failed_removal:
	printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
	       (unsigned long long) start_pfn << PAGE_SHIFT,
	       ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
	memory_notify(MEM_CANCEL_OFFLINE, &arg);
	/* pushback to free area */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);

out:
	unlock_memory_hotplug();
	return ret;
}

int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

/**
 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
 * @start_pfn: start pfn of the memory range
 * @end_pfn: end pfn of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory section walked
 *
 * This function walks through all present mem sections in the range
 * [start_pfn, end_pfn) and calls func on each mem section.
 *
 * Returns the return value of func.
 */
int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
		void *arg, int (*func)(struct memory_block *, void *))
{
	struct memory_block *mem = NULL;
	struct mem_section *section;
	unsigned long pfn, section_nr;
	int ret;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		section_nr = pfn_to_section_nr(pfn);
		if (!present_section_nr(section_nr))
			continue;

		section = __nr_to_section(section_nr);
		/* same memblock? */
		if (mem)
			if ((section_nr >= mem->start_section_nr) &&
			    (section_nr <= mem->end_section_nr))
				continue;

		mem = find_memory_block_hinted(section, mem);
		if (!mem)
			continue;

		ret = func(mem, arg);
		if (ret) {
			kobject_put(&mem->dev.kobj);
			return ret;
		}
	}

	if (mem)
		kobject_put(&mem->dev.kobj);

	return 0;
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
	int ret = !is_memblock_offlined(mem);

	if (unlikely(ret)) {
		phys_addr_t beginpa, endpa;

		beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
		endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
		pr_warn("removing memory fails, because memory "
			"[%pa-%pa] is onlined\n",
			&beginpa, &endpa);
	}

	return ret;
}

static int check_cpu_on_node(pg_data_t *pgdat)
{
	int cpu;

	for_each_present_cpu(cpu) {
		if (cpu_to_node(cpu) == pgdat->node_id)
			/*
			 * the cpu on this node isn't removed, and we can't
			 * offline this node.
			 */
			return -EBUSY;
	}

	return 0;
}

static void unmap_cpu_on_node(pg_data_t *pgdat)
{
#ifdef CONFIG_ACPI_NUMA
	int cpu;

	for_each_possible_cpu(cpu)
		if (cpu_to_node(cpu) == pgdat->node_id)
			numa_clear_node(cpu);
#endif
}

static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
{
	int ret;

	ret = check_cpu_on_node(pgdat);
	if (ret)
		return ret;

	/*
	 * the node will be offlined when we come here, so we can clear
	 * the cpu_to_node() now.
	 */

	unmap_cpu_on_node(pgdat);
	return 0;
}

/**
 * try_offline_node
 *
 * Offline a node if all memory sections and cpus of the node are removed.
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call.
 */
void try_offline_node(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	unsigned long start_pfn = pgdat->node_start_pfn;
	unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
	unsigned long pfn;
	struct page *pgdat_page = virt_to_page(pgdat);
	int i;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);

		if (!present_section_nr(section_nr))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/*
		 * some memory sections of this node are not removed, and we
		 * can't offline node now.
		 */
		return;
	}

	if (check_and_unmap_cpu_on_node(pgdat))
		return;

	/*
	 * all memory/cpu of this node are removed, we can offline this
	 * node now.
	 */
	node_set_offline(nid);
	unregister_one_node(nid);

	if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
		/* node data is allocated from boot memory */
		return;

	/* free waittable in each zone */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		struct zone *zone = pgdat->node_zones + i;

		/*
		 * wait_table may be allocated from boot memory,
		 * here only free if it's allocated by vmalloc.
		 */
1820 */ 1821 if (is_vmalloc_addr(zone->wait_table)) 1822 vfree(zone->wait_table); 1823 } 1824 1825 /* 1826 * Since there is no way to guarentee the address of pgdat/zone is not 1827 * on stack of any kernel threads or used by other kernel objects 1828 * without reference counting or other symchronizing method, do not 1829 * reset node_data and free pgdat here. Just reset it to 0 and reuse 1830 * the memory when the node is online again. 1831 */ 1832 memset(pgdat, 0, sizeof(*pgdat)); 1833 } 1834 EXPORT_SYMBOL(try_offline_node); 1835 1836 /** 1837 * remove_memory 1838 * 1839 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 1840 * and online/offline operations before this call, as required by 1841 * try_offline_node(). 1842 */ 1843 void __ref remove_memory(int nid, u64 start, u64 size) 1844 { 1845 int ret; 1846 1847 BUG_ON(check_hotplug_memory_range(start, size)); 1848 1849 lock_memory_hotplug(); 1850 1851 /* 1852 * All memory blocks must be offlined before removing memory. Check 1853 * whether all memory blocks in question are offline and trigger a BUG() 1854 * if this is not the case. 1855 */ 1856 ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, 1857 is_memblock_offlined_cb); 1858 if (ret) { 1859 unlock_memory_hotplug(); 1860 BUG(); 1861 } 1862 1863 /* remove memmap entry */ 1864 firmware_map_remove(start, start + size, "System RAM"); 1865 1866 arch_remove_memory(start, size); 1867 1868 try_offline_node(nid); 1869 1870 unlock_memory_hotplug(); 1871 } 1872 EXPORT_SYMBOL_GPL(remove_memory); 1873 #endif /* CONFIG_MEMORY_HOTREMOVE */ 1874