/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>

#include <asm/tlbflush.h>

#include "internal.h"

/*
 * online_page_callback contains a pointer to the current page onlining
 * function.  Initially it is generic_online_page().  If required, it can be
 * changed by calling set_online_page_callback() to register a callback and
 * restore_online_page_callback() to restore the generic callback.
 */

static void generic_online_page(struct page *page);

static online_page_callback_t online_page_callback = generic_online_page;

DEFINE_MUTEX(mem_hotplug_mutex);

void lock_memory_hotplug(void)
{
	mutex_lock(&mem_hotplug_mutex);

	/* for exclusive hibernation if CONFIG_HIBERNATION=y */
	lock_system_sleep();
}

void unlock_memory_hotplug(void)
{
	unlock_system_sleep();
	mutex_unlock(&mem_hotplug_mutex);
}


/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;
	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
	BUG_ON(!res);

	res->name = "System RAM";
	res->start = start;
	res->end = start + size - 1;
	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
	if (request_resource(&iomem_resource, res) < 0) {
		pr_debug("System RAM resource %pR cannot be added\n", res);
		kfree(res);
		res = NULL;
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
	return;
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
void get_page_bootmem(unsigned long info, struct page *page,
		      unsigned long type)
{
	page->lru.next = (struct list_head *) type;
	SetPagePrivate(page);
	set_page_private(page, info);
	atomic_inc(&page->_count);
}

void put_page_bootmem(struct page *page)
{
	unsigned long type;

	type = (unsigned long) page->lru.next;
	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);

	if (atomic_dec_return(&page->_count) == 1) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		INIT_LIST_HEAD(&page->lru);
		free_reserved_page(page);
	}
}
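
/*
 * Usage sketch (illustrative, not an extra caller in this file):
 * get_page_bootmem() stores the bootmem "type" in page->lru.next and takes
 * an extra reference; put_page_bootmem() drops that reference and frees the
 * page once the last user is gone.  The registration helpers below pair
 * them roughly as
 *
 *	get_page_bootmem(section_nr, page, SECTION_INFO);
 *	...
 *	put_page_bootmem(page);
 */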

#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	/* Get section's memmap address */
	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	/*
	 * Get page for the memmap's phys address
	 * XXX: need more consideration for sparse_vmemmap...
	 */
	page = virt_to_page(memmap);
	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

	/* remember memmap's page */
	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, SECTION_INFO);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);

}
#else /* CONFIG_SPARSEMEM_VMEMMAP */
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	if (!pfn_valid(start_pfn))
		return;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

void register_page_bootmem_info_node(struct pglist_data *pgdat)
{
	unsigned long i, pfn, end_pfn, nr_pages;
	int node = pgdat->node_id;
	struct page *page;
	struct zone *zone;

	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
	page = virt_to_page(pgdat);

	for (i = 0; i < nr_pages; i++, page++)
		get_page_bootmem(node, page, NODE_INFO);

	zone = &pgdat->node_zones[0];
	for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
		if (zone->wait_table) {
			nr_pages = zone->wait_table_hash_nr_entries
				* sizeof(wait_queue_head_t);
			nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
			page = virt_to_page(zone->wait_table);

			for (i = 0; i < nr_pages; i++, page++)
				get_page_bootmem(node, page, NODE_INFO);
		}
	}

	pfn = pgdat->node_start_pfn;
	end_pfn = pgdat_end_pfn(pgdat);

	/* register section info */
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		/*
		 * Some platforms can assign the same pfn to multiple nodes - on
		 * node0 as well as nodeN.  To avoid registering a pfn against
		 * multiple nodes we check that this pfn does not already
		 * reside in some other nodes.
		 */
		if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
			register_page_bootmem_info_section(pfn);
	}
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */

static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
			   unsigned long end_pfn)
{
	unsigned long old_zone_end_pfn;

	zone_span_writelock(zone);

	old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
				zone->zone_start_pfn;

	zone_span_writeunlock(zone);
}

static void resize_zone(struct zone *zone, unsigned long start_pfn,
			unsigned long end_pfn)
{
	zone_span_writelock(zone);

	if (end_pfn - start_pfn) {
		zone->zone_start_pfn = start_pfn;
		zone->spanned_pages = end_pfn - start_pfn;
	} else {
		/*
		 * Be consistent with free_area_init_core():
		 * if spanned_pages == 0, keep zone_start_pfn == 0 as well.
		 */
		zone->zone_start_pfn = 0;
		zone->spanned_pages = 0;
	}

	zone_span_writeunlock(zone);
}

static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
			unsigned long end_pfn)
{
	enum zone_type zid = zone_idx(zone);
	int nid = zone->zone_pgdat->node_id;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn++)
		set_page_links(pfn_to_page(pfn), zid, nid, pfn);
}

/*
 * Can fail with -ENOMEM from allocating a wait table with vmalloc() or
 * alloc_bootmem_node_nopanic().
 */
static int __ref ensure_zone_is_initialized(struct zone *zone,
			unsigned long start_pfn, unsigned long num_pages)
{
	if (!zone_is_initialized(zone))
		return init_currently_empty_zone(zone, start_pfn, num_pages,
						 MEMMAP_HOTPLUG);
	return 0;
}

static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z1_start_pfn;

	ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are higher than @z2 */
	if (end_pfn > zone_end_pfn(z2))
		goto out_fail;
	/* the part to be moved must be at the leftmost of @z2 */
	if (start_pfn > z2->zone_start_pfn)
		goto out_fail;
	/* must include/overlap */
	if (end_pfn <= z2->zone_start_pfn)
		goto out_fail;

	/* use start_pfn for z1's start_pfn if z1 is empty */
	if (z1->spanned_pages)
		z1_start_pfn = z1->zone_start_pfn;
	else
		z1_start_pfn = start_pfn;

	resize_zone(z1, z1_start_pfn, end_pfn);
	resize_zone(z2, end_pfn, zone_end_pfn(z2));

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z1, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z2_end_pfn;

	ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are lower than @z1 */
	if (z1->zone_start_pfn > start_pfn)
		goto out_fail;
	/* the part to be moved must be at the rightmost of @z1 */
	if (zone_end_pfn(z1) > end_pfn)
		goto out_fail;
	/* must include/overlap */
	if (start_pfn >= zone_end_pfn(z1))
		goto out_fail;

	/* use end_pfn for z2's end_pfn if z2 is empty */
	if (z2->spanned_pages)
		z2_end_pfn = zone_end_pfn(z2);
	else
		z2_end_pfn = end_pfn;

	resize_zone(z1, z1->zone_start_pfn, start_pfn);
	resize_zone(z2, start_pfn, z2_end_pfn);

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z2, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
			    unsigned long end_pfn)
{
	unsigned long old_pgdat_end_pfn =
		pgdat->node_start_pfn + pgdat->node_spanned_pages;

	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
					pgdat->node_start_pfn;
}

static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int nid = pgdat->node_id;
	int zone_type;
	unsigned long flags;
	int ret;

	zone_type = zone - pgdat->node_zones;
	ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
	if (ret)
		return ret;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
			phys_start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
	memmap_init_zone(nr_pages, nid, zone_type,
			 phys_start_pfn, MEMMAP_HOTPLUG);
	return 0;
}

static int __meminit __add_section(int nid, struct zone *zone,
				   unsigned long phys_start_pfn)
{
	int nr_pages = PAGES_PER_SECTION;
	int ret;

	if (pfn_valid(phys_start_pfn))
		return -EEXIST;

	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);

	if (ret < 0)
		return ret;

	ret = __add_zone(zone, phys_start_pfn);

	if (ret < 0)
		return ret;

	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}

/*
 * Reasonably generic function for adding memory.  It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
		      unsigned long nr_pages)
{
	unsigned long i;
	int err = 0;
	int start_sec, end_sec;
	/* during initialization of the mem_map, align the hot-added range to sections */
	start_sec = pfn_to_section_nr(phys_start_pfn);
	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

	for (i = start_sec; i <= end_sec; i++) {
		err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);

		/*
		 * -EEXIST is finally dealt with by the ioresource collision
		 * check; see add_memory() => register_memory_resource().
		 * A warning will be printed if there is a collision.
		 */
		if (err && (err != -EEXIST))
			break;
		err = 0;
	}

	return err;
}
EXPORT_SYMBOL_GPL(__add_pages);
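
/*
 * Illustrative sketch (not a real architecture): an arch_add_memory()
 * implementation typically sets up the kernel mapping for the new range and
 * then hands it to __add_pages(), roughly like
 *
 *	int arch_add_memory(int nid, u64 start, u64 size)
 *	{
 *		struct pglist_data *pgdat = NODE_DATA(nid);
 *		struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
 *		unsigned long start_pfn = start >> PAGE_SHIFT;
 *		unsigned long nr_pages = size >> PAGE_SHIFT;
 *
 *		(set up page tables / direct mapping for the range here)
 *		return __add_pages(nid, zone, start_pfn, nr_pages);
 *	}
 *
 * The zone choice (ZONE_NORMAL here) is arch policy, not dictated by this file.
 */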

#ifdef CONFIG_MEMORY_HOTREMOVE
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static int find_smallest_section_pfn(int nid, struct zone *zone,
				     unsigned long start_pfn,
				     unsigned long end_pfn)
{
	struct mem_section *ms;

	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(start_pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(start_pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
			continue;

		return start_pfn;
	}

	return 0;
}

/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
static int find_biggest_section_pfn(int nid, struct zone *zone,
				    unsigned long start_pfn,
				    unsigned long end_pfn)
{
	struct mem_section *ms;
	unsigned long pfn;

	/* pfn is the end pfn of a memory section. */
	pfn = end_pfn - 1;
	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(pfn)))
			continue;

		return pfn;
	}

	return 0;
}

static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
			     unsigned long end_pfn)
{
	unsigned long zone_start_pfn = zone->zone_start_pfn;
	unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = zone_to_nid(zone);

	zone_span_writelock(zone);
	if (zone_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the zone, we need
		 * to shrink zone->zone_start_pfn and zone->spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
						zone_end_pfn);
		if (pfn) {
			zone->zone_start_pfn = pfn;
			zone->spanned_pages = zone_end_pfn - pfn;
		}
	} else if (zone_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the zone, we only
		 * need to shrink zone->spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
					       start_pfn);
		if (pfn)
			zone->spanned_pages = pfn - zone_start_pfn + 1;
	}

	/*
	 * If the section is neither the biggest nor the smallest mem_section
	 * in the zone, removing it only creates a hole in the zone.  In this
	 * case we need not change the zone.  But the zone may now consist of
	 * nothing but holes, so check whether it still has a valid section.
	 */
	pfn = zone_start_pfn;
	for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (page_zone(pfn_to_page(pfn)) != zone)
			continue;

		/* skip the section that is being removed */
		if (start_pfn == pfn)
			continue;

		/* We found a valid section, so there is nothing to do. */
		zone_span_writeunlock(zone);
		return;
	}

	/* The zone has no valid section */
	zone->zone_start_pfn = 0;
	zone->spanned_pages = 0;
	zone_span_writeunlock(zone);
}
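
/*
 * Worked example (hypothetical numbers): suppose a zone spans sections
 * S0..S3 and S0 is being removed.  zone_start_pfn equals the removed
 * start_pfn, so find_smallest_section_pfn() scans upward from end_pfn and,
 * if S1 is valid, the zone becomes S1..S3.  Removing the last section (S3)
 * is handled symmetrically via find_biggest_section_pfn().  Removing S1 or
 * S2 merely leaves a hole, so only the final "any valid section left?" scan
 * matters.  shrink_pgdat_span() below applies the same idea to the node span.
 */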

static void shrink_pgdat_span(struct pglist_data *pgdat,
			      unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
	unsigned long pgdat_end_pfn =
		pgdat->node_start_pfn + pgdat->node_spanned_pages;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = pgdat->node_id;

	if (pgdat_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the pgdat, we need
		 * to shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the pgdat.
		 */
		pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
						pgdat_end_pfn);
		if (pfn) {
			pgdat->node_start_pfn = pfn;
			pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
		}
	} else if (pgdat_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the pgdat, we only
		 * need to shrink pgdat->node_spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the pgdat.
		 */
		pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
					       start_pfn);
		if (pfn)
			pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
	}

	/*
	 * If the section is neither the biggest nor the smallest mem_section
	 * in the pgdat, removing it only creates a hole in the pgdat.  In this
	 * case we need not change the pgdat.  But the pgdat may now consist of
	 * nothing but holes, so check whether it still has a valid section.
	 */
	pfn = pgdat_start_pfn;
	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/* skip the section that is being removed */
		if (start_pfn == pfn)
			continue;

		/* We found a valid section, so there is nothing to do. */
		return;
	}

	/* The pgdat has no valid section */
	pgdat->node_start_pfn = 0;
	pgdat->node_spanned_pages = 0;
}

static void __remove_zone(struct zone *zone, unsigned long start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int zone_type;
	unsigned long flags;

	zone_type = zone - pgdat->node_zones;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
}

static int __remove_section(struct zone *zone, struct mem_section *ms)
{
	unsigned long start_pfn;
	int scn_nr;
	int ret = -EINVAL;

	if (!valid_section(ms))
		return ret;

	ret = unregister_memory_section(ms);
	if (ret)
		return ret;

	scn_nr = __section_nr(ms);
	start_pfn = section_nr_to_pfn(scn_nr);
	__remove_zone(zone, start_pfn);

	sparse_remove_one_section(zone, ms);
	return 0;
}

/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing.  Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
		   unsigned long nr_pages)
{
	unsigned long i;
	int sections_to_remove;
	resource_size_t start, size;
	int ret = 0;

	/*
	 * We can only remove entire sections
	 */
	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
	BUG_ON(nr_pages % PAGES_PER_SECTION);

	start = phys_start_pfn << PAGE_SHIFT;
	size = nr_pages * PAGE_SIZE;
	ret = release_mem_region_adjustable(&iomem_resource, start, size);
	if (ret) {
		resource_size_t endres = start + size - 1;

		pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
			&start, &endres, ret);
	}

	sections_to_remove = nr_pages / PAGES_PER_SECTION;
	for (i = 0; i < sections_to_remove; i++) {
		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
		ret = __remove_section(zone, __pfn_to_section(pfn));
		if (ret)
			break;
	}
	return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);
#endif /* CONFIG_MEMORY_HOTREMOVE */

int set_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	lock_memory_hotplug();

	if (online_page_callback == generic_online_page) {
		online_page_callback = callback;
		rc = 0;
	}

	unlock_memory_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(set_online_page_callback);

int restore_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	lock_memory_hotplug();

	if (online_page_callback == callback) {
		online_page_callback = generic_online_page;
		rc = 0;
	}

	unlock_memory_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);

void __online_page_set_limits(struct page *page)
{
}
EXPORT_SYMBOL_GPL(__online_page_set_limits);

void __online_page_increment_counters(struct page *page)
{
	adjust_managed_page_count(page, 1);
}
EXPORT_SYMBOL_GPL(__online_page_increment_counters);

void __online_page_free(struct page *page)
{
	__free_reserved_page(page);
}
EXPORT_SYMBOL_GPL(__online_page_free);

static void generic_online_page(struct page *page)
{
	__online_page_set_limits(page);
	__online_page_increment_counters(page);
	__online_page_free(page);
}
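
/*
 * Example (hypothetical driver, sketched for illustration): a driver that
 * wants to intercept newly onlined pages - for instance to hand some of
 * them back to a hypervisor - can install its own callback built from the
 * exported __online_page_*() helpers above:
 *
 *	static void my_online_page(struct page *page)
 *	{
 *		__online_page_set_limits(page);
 *		__online_page_increment_counters(page);
 *		__online_page_free(page);
 *	}
 *
 *	err = set_online_page_callback(my_online_page);
 *	...
 *	restore_online_page_callback(my_online_page);
 */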

static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			      void *arg)
{
	unsigned long i;
	unsigned long onlined_pages = *(unsigned long *)arg;
	struct page *page;
	if (PageReserved(pfn_to_page(start_pfn)))
		for (i = 0; i < nr_pages; i++) {
			page = pfn_to_page(start_pfn + i);
			(*online_page_callback)(page);
			onlined_pages++;
		}
	*(unsigned long *)arg = onlined_pages;
	return 0;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
 * normal memory.
 */
static bool can_online_high_movable(struct zone *zone)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure every online node has NORMAL memory */
static bool can_online_high_movable(struct zone *zone)
{
	return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which states of node_states will be changed when onlining memory */
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);
	enum zone_type zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * so set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, so set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * If the memory to be onlined is in a zone of 0...zone_last, and
	 * the zones of 0...zone_last don't have memory before onlining, we
	 * will need to set the node in node_states[N_NORMAL_MEMORY] after
	 * the memory is onlined.
	 */
	if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * so set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * so set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * If the node doesn't have memory before onlining, we will need to
	 * set the node in node_states[N_MEMORY] after the memory
	 * is onlined.
	 */
	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
	else
		arg->status_change_nid = -1;
}

static void node_states_set_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_set_state(node, N_HIGH_MEMORY);

	node_set_state(node, N_MEMORY);
}


int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
	unsigned long flags;
	unsigned long onlined_pages = 0;
	struct zone *zone;
	int need_zonelists_rebuild = 0;
	int nid;
	int ret;
	struct memory_notify arg;

	lock_memory_hotplug();
	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
	 * memory_block->state_mutex.
	 */
	zone = page_zone(pfn_to_page(pfn));

	if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
	    !can_online_high_movable(zone)) {
		unlock_memory_hotplug();
		return -EINVAL;
	}

	if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
		if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
			unlock_memory_hotplug();
			return -EINVAL;
		}
	}
	if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
		if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
			unlock_memory_hotplug();
			return -EINVAL;
		}
	}

	/* The code above may have changed the zone of the pfn range */
	zone = page_zone(pfn_to_page(pfn));

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_online(nr_pages, zone, &arg);

	nid = page_to_nid(pfn_to_page(pfn));

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret) {
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		unlock_memory_hotplug();
		return ret;
	}
	/*
	 * If this zone is not populated, then it is not in the zonelist.
	 * This means the page allocator ignores this zone.
	 * So, the zonelist must be updated after onlining.
	 */
	mutex_lock(&zonelists_mutex);
	if (!populated_zone(zone)) {
		need_zonelists_rebuild = 1;
		build_all_zonelists(NULL, zone);
	}

	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
		online_pages_range);
	if (ret) {
		if (need_zonelists_rebuild)
			zone_pcp_reset(zone);
		mutex_unlock(&zonelists_mutex);
		printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
		       (unsigned long long) pfn << PAGE_SHIFT,
		       (((unsigned long long) pfn + nr_pages)
			<< PAGE_SHIFT) - 1);
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		unlock_memory_hotplug();
		return ret;
	}

	zone->present_pages += onlined_pages;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	zone->zone_pgdat->node_present_pages += onlined_pages;
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	if (onlined_pages) {
		node_states_set_node(zone_to_nid(zone), &arg);
		if (need_zonelists_rebuild)
			build_all_zonelists(NULL, NULL);
		else
			zone_pcp_update(zone);
	}

	mutex_unlock(&zonelists_mutex);

	init_per_zone_wmark_min();

	if (onlined_pages)
		kswapd_run(zone_to_nid(zone));

	vm_total_pages = nr_free_pagecache_pages();

	writeback_set_ratelimit();

	if (onlined_pages)
		memory_notify(MEM_ONLINE, &arg);
	unlock_memory_hotplug();

	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
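
/*
 * Node bring-up: when memory (or a CPU) is added to a node that has no
 * NODE_DATA() yet, hotadd_new_pgdat() below allocates and initializes an
 * empty pgdat; mem_online_node() and add_memory() are its only callers in
 * this file.
 */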

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
	struct pglist_data *pgdat;
	unsigned long zones_size[MAX_NR_ZONES] = {0};
	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	unsigned long start_pfn = start >> PAGE_SHIFT;

	pgdat = NODE_DATA(nid);
	if (!pgdat) {
		pgdat = arch_alloc_nodedata(nid);
		if (!pgdat)
			return NULL;

		arch_refresh_nodedata(nid, pgdat);
	}

	/* we can use NODE_DATA(nid) from here */

	/* init node's zones as empty zones, we don't have any present pages.*/
	free_area_init_node(nid, zones_size, start_pfn, zholes_size);

	/*
	 * The node we allocated has no zone fallback lists.  To avoid
	 * accessing a not-yet-initialized zonelist, build it here.
	 */
	mutex_lock(&zonelists_mutex);
	build_all_zonelists(pgdat, NULL);
	mutex_unlock(&zonelists_mutex);

	return pgdat;
}

static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
	arch_refresh_nodedata(nid, NULL);
	arch_free_nodedata(pgdat);
	return;
}


/*
 * called by cpu_up() to online a node without onlined memory.
 */
int mem_online_node(int nid)
{
	pg_data_t *pgdat;
	int ret;

	lock_memory_hotplug();
	pgdat = hotadd_new_pgdat(nid, 0);
	if (!pgdat) {
		ret = -ENOMEM;
		goto out;
	}
	node_set_online(nid);
	ret = register_one_node(nid);
	BUG_ON(ret);

out:
	unlock_memory_hotplug();
	return ret;
}

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
int __ref add_memory(int nid, u64 start, u64 size)
{
	pg_data_t *pgdat = NULL;
	bool new_pgdat;
	bool new_node;
	struct resource *res;
	int ret;

	lock_memory_hotplug();

	res = register_memory_resource(start, size);
	ret = -EEXIST;
	if (!res)
		goto out;

	{	/* Stupid hack to suppress address-never-null warning */
		void *p = NODE_DATA(nid);
		new_pgdat = !p;
	}
	new_node = !node_online(nid);
	if (new_node) {
		pgdat = hotadd_new_pgdat(nid, start);
		ret = -ENOMEM;
		if (!pgdat)
			goto error;
	}

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size);

	if (ret < 0)
		goto error;

	/* we online the node here.  we can't roll back from here. */
	node_set_online(nid);

	if (new_node) {
		ret = register_one_node(nid);
		/*
		 * If the sysfs file of the new node can't be created, CPUs on
		 * the node can't be hot-added.  There is no way to roll back
		 * now, so check it with BUG_ON() to catch it reluctantly..
		 */
		BUG_ON(ret);
	}

	/* create new memmap entry */
	firmware_map_add_hotplug(start, start + size, "System RAM");

	goto out;

error:
	/* rollback pgdat allocation and others */
	if (new_pgdat)
		rollback_node_hotadd(nid, pgdat);
	release_memory_resource(res);

out:
	unlock_memory_hotplug();
	return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
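
/*
 * Example (hypothetical caller): a platform driver that discovers a new
 * memory device at physical address 'start' with length 'size' would
 * typically register it with
 *
 *	ret = add_memory(nid, start, size);
 *
 * which creates the sections and the sysfs memory block devices.  The new
 * blocks still have to be onlined (by user space or by the driver), which
 * ends up in online_pages() above.
 */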

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
 * set and the size of the free page is given by page_order().  Using this,
 * the function determines if the pageblock contains only free pages.
 * Due to buddy constraints, a free page at least the size of a pageblock will
 * be located at the start of the pageblock.
 */
static inline int pageblock_free(struct page *page)
{
	return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
	/* Ensure the starting page is pageblock-aligned */
	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));

	/* If the entire pageblock is free, move to the end of the free page */
	if (pageblock_free(page)) {
		int order;
		/* be careful.  we don't have locks, page_order can be changed.*/
		order = page_order(page);
		if ((order < MAX_ORDER) && (order >= pageblock_order))
			return page + (1 << order);
	}

	return page + pageblock_nr_pages;
}

/* Checks if this range of memory is likely to be hot-removable. */
int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
	struct page *page = pfn_to_page(start_pfn);
	struct page *end_page = page + nr_pages;

	/* Check the starting page of each pageblock within the range */
	for (; page < end_page; page = next_active_pageblock(page)) {
		if (!is_pageblock_removable_nolock(page))
			return 0;
		cond_resched();
	}

	/* All pageblocks in the memory block are likely to be hot-removable */
	return 1;
}

/*
 * Confirm that all pages in the range [start, end) belong to the same zone.
 */
static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct zone *zone = NULL;
	struct page *page;
	int i;
	for (pfn = start_pfn;
	     pfn < end_pfn;
	     pfn += MAX_ORDER_NR_PAGES) {
		i = 0;
		/* This is just a CONFIG_HOLES_IN_ZONE check.*/
		while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
			i++;
		if (i == MAX_ORDER_NR_PAGES)
			continue;
		page = pfn_to_page(pfn + i);
		if (zone && page_zone(page) != zone)
			return 0;
		zone = page_zone(page);
	}
	return 1;
}

/*
 * Scanning pfns is much easier than scanning the lru list.
 * Scan pfns from start to end and find the first LRU page.
 */
static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
{
	unsigned long pfn;
	struct page *page;
	for (pfn = start; pfn < end; pfn++) {
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageLRU(page))
				return pfn;
		}
	}
	return 0;
}

#define NR_OFFLINE_AT_ONCE_PAGES	(256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page;
	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
	int not_managed = 0;
	int ret = 0;
	LIST_HEAD(source);

	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		if (!get_page_unless_zero(page))
			continue;
		/*
		 * We can skip free pages.  And we can only deal with pages on
		 * LRU.
		 */
		ret = isolate_lru_page(page);
		if (!ret) { /* Success */
			put_page(page);
			list_add_tail(&page->lru, &source);
			move_pages--;
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));

		} else {
#ifdef CONFIG_DEBUG_VM
			printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
			       pfn);
			dump_page(page);
#endif
			put_page(page);
			/*
			 * Because we don't have a big zone->lock, we should
			 * check this again here.
			 */
			if (page_count(page)) {
				not_managed++;
				ret = -EBUSY;
				break;
			}
		}
	}
	if (!list_empty(&source)) {
		if (not_managed) {
			putback_lru_pages(&source);
			goto out;
		}

		/*
		 * alloc_migrate_target should be improooooved!!
		 * migrate_pages returns # of failed pages.
		 */
		ret = migrate_pages(&source, alloc_migrate_target, 0,
					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
		if (ret)
			putback_lru_pages(&source);
	}
out:
	return ret;
}

/*
 * remove from free_area[] and mark all as Reserved.
 */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
			void *data)
{
	__offline_isolated_pages(start, start + nr_pages);
	return 0;
}

static void
offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
				offline_isolated_pages_cb);
}

/*
 * Check that all pages in the range, recorded as a memory resource, are isolated.
 */
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
			void *data)
{
	int ret;
	long offlined = *(long *)data;
	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
	offlined = nr_pages;
	if (!ret)
		*(long *)data += offlined;
	return ret;
}

static long
check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
	long offlined = 0;
	int ret;

	ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
			check_pages_isolated_cb);
	if (ret < 0)
		offlined = (long)ret;
	return offlined;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
 * normal memory.
 */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure the node has NORMAL memory if it is still online */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt;

	for (zt = 0; zt <= ZONE_NORMAL; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	if (present_pages > nr_pages)
		return true;

	present_pages = 0;
	for (; zt <= ZONE_MOVABLE; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	/*
	 * we can't offline the last normal memory until all
	 * higher memory is offlined.
	 */
	return present_pages == 0;
}
#endif /* CONFIG_MOVABLE_NODE */
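
/*
 * Worked example (assumes a !CONFIG_HIGHMEM, !CONFIG_MOVABLE_NODE config,
 * where N_MEMORY == N_NORMAL_MEMORY): zone_last below becomes ZONE_MOVABLE,
 * so present_pages sums every zone of the node.  If the nr_pages being
 * offlined covers all of it, status_change_nid_normal (and status_change_nid)
 * are set to the node id, and node_states_clear_node() will later clear the
 * node from the corresponding node_states[] masks.
 */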

/* check which states of node_states will be changed when offlining memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
		struct zone *zone, struct memory_notify *arg)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt, zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * so set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, so set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
	 * If the memory to be offlined is in a zone of 0...zone_last,
	 * and it is the last present memory there, 0...zone_last will
	 * become empty after the offlining, so we know we will need to
	 * clear the node from node_states[N_NORMAL_MEMORY].
	 */
	for (zt = 0; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_normal = zone_to_nid(zone);
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * so set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * so set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_high = zone_to_nid(zone);
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
	 */
	zone_last = ZONE_MOVABLE;

	/*
	 * Check whether node_states[N_HIGH_MEMORY] will be changed.
	 * If we try to offline the last present @nr_pages from the node,
	 * we know we will need to clear the node from
	 * node_states[N_HIGH_MEMORY].
	 */
	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (nr_pages >= present_pages)
		arg->status_change_nid = zone_to_nid(zone);
	else
		arg->status_change_nid = -1;
}

static void node_states_clear_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_clear_state(node, N_NORMAL_MEMORY);

	if ((N_MEMORY != N_NORMAL_MEMORY) &&
	    (arg->status_change_nid_high >= 0))
		node_clear_state(node, N_HIGH_MEMORY);

	if ((N_MEMORY != N_HIGH_MEMORY) &&
	    (arg->status_change_nid >= 0))
		node_clear_state(node, N_MEMORY);
}
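
/*
 * Offlining sequence implemented below (summarized from the code, not a
 * guarantee of the interface): isolate the pageblocks, notify
 * MEM_GOING_OFFLINE, migrate any LRU pages out of the range with
 * do_migrate_range(), verify with check_pages_isolated() that nothing is
 * left, then pull the pages out of the allocator, fix up the zone/node
 * accounting and finally notify MEM_OFFLINE.  Any failure before the
 * "cannot roll back" point undoes the isolation and sends MEM_CANCEL_OFFLINE.
 */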

static int __ref __offline_pages(unsigned long start_pfn,
		  unsigned long end_pfn, unsigned long timeout)
{
	unsigned long pfn, nr_pages, expire;
	long offlined_pages;
	int ret, drain, retry_max, node;
	unsigned long flags;
	struct zone *zone;
	struct memory_notify arg;

	BUG_ON(start_pfn >= end_pfn);
	/* at least, alignment against pageblock is necessary */
	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
		return -EINVAL;
	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
		return -EINVAL;
	/*
	 * This makes hotplug much easier... and readable.
	 * We assume this for now.
	 */
	if (!test_pages_in_a_zone(start_pfn, end_pfn))
		return -EINVAL;

	lock_memory_hotplug();

	zone = page_zone(pfn_to_page(start_pfn));
	node = zone_to_nid(zone);
	nr_pages = end_pfn - start_pfn;

	ret = -EINVAL;
	if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
		goto out;

	/* set above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn,
				       MIGRATE_MOVABLE, true);
	if (ret)
		goto out;

	arg.start_pfn = start_pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_offline(nr_pages, zone, &arg);

	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_removal;

	pfn = start_pfn;
	expire = jiffies + timeout;
	drain = 0;
	retry_max = 5;
repeat:
	/* start memory hot removal */
	ret = -EAGAIN;
	if (time_after(jiffies, expire))
		goto failed_removal;
	ret = -EINTR;
	if (signal_pending(current))
		goto failed_removal;
	ret = 0;
	if (drain) {
		lru_add_drain_all();
		cond_resched();
		drain_all_pages();
	}

	pfn = scan_lru_pages(start_pfn, end_pfn);
	if (pfn) { /* We have a page on LRU */
		ret = do_migrate_range(pfn, end_pfn);
		if (!ret) {
			drain = 1;
			goto repeat;
		} else {
			if (ret < 0)
				if (--retry_max == 0)
					goto failed_removal;
			yield();
			drain = 1;
			goto repeat;
		}
	}
	/* drain all zones' lru pagevecs, this is asynchronous... */
	lru_add_drain_all();
	yield();
	/* drain pcp pages, this is synchronous. */
	drain_all_pages();
	/* check again */
	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
	if (offlined_pages < 0) {
		ret = -EBUSY;
		goto failed_removal;
	}
	printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
	/*
	 * Ok, all of our target is isolated.
	 * We cannot do a rollback at this point.
	 */
	offline_isolated_pages(start_pfn, end_pfn);
	/* reset pagetype flags and make the migrate type MOVABLE again */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	/* removal success */
	adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
	zone->present_pages -= offlined_pages;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	zone->zone_pgdat->node_present_pages -= offlined_pages;
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	init_per_zone_wmark_min();

	if (!populated_zone(zone)) {
		zone_pcp_reset(zone);
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(NULL, NULL);
		mutex_unlock(&zonelists_mutex);
	} else
		zone_pcp_update(zone);

	node_states_clear_node(node, &arg);
	if (arg.status_change_nid >= 0)
		kswapd_stop(node);

	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();

	memory_notify(MEM_OFFLINE, &arg);
	unlock_memory_hotplug();
	return 0;

failed_removal:
	printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
	       (unsigned long long) start_pfn << PAGE_SHIFT,
	       ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
	memory_notify(MEM_CANCEL_OFFLINE, &arg);
	/* pushback to free area */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);

out:
	unlock_memory_hotplug();
	return ret;
}

int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

/**
 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
 * @start_pfn: start pfn of the memory range
 * @end_pfn: end pfn of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory section walked
 *
 * This function walks through all present mem sections in the range
 * [start_pfn, end_pfn) and calls func on each mem section.
 *
 * Returns the return value of func.
 */
int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
		void *arg, int (*func)(struct memory_block *, void *))
{
	struct memory_block *mem = NULL;
	struct mem_section *section;
	unsigned long pfn, section_nr;
	int ret;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		section_nr = pfn_to_section_nr(pfn);
		if (!present_section_nr(section_nr))
			continue;

		section = __nr_to_section(section_nr);
		/* same memblock? */
		if (mem)
			if ((section_nr >= mem->start_section_nr) &&
			    (section_nr <= mem->end_section_nr))
				continue;

		mem = find_memory_block_hinted(section, mem);
		if (!mem)
			continue;

		ret = func(mem, arg);
		if (ret) {
			kobject_put(&mem->dev.kobj);
			return ret;
		}
	}

	if (mem)
		kobject_put(&mem->dev.kobj);

	return 0;
}
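
/*
 * Example (hypothetical callback, for illustration): counting the memory
 * blocks that overlap a pfn range only needs a trivial func:
 *
 *	static int count_block(struct memory_block *mem, void *arg)
 *	{
 *		(*(int *)arg)++;
 *		return 0;
 *	}
 *
 *	int nr_blocks = 0;
 *	walk_memory_range(start_pfn, end_pfn, &nr_blocks, count_block);
 *
 * remove_memory() below uses the same walker with is_memblock_offlined_cb()
 * to verify that every block in the range is already offline.
 */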

#ifdef CONFIG_MEMORY_HOTREMOVE
static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
	int ret = !is_memblock_offlined(mem);

	if (unlikely(ret)) {
		phys_addr_t beginpa, endpa;

		beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
		endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
		pr_warn("removing memory fails, because memory "
			"[%pa-%pa] is onlined\n",
			&beginpa, &endpa);
	}

	return ret;
}

static int check_cpu_on_node(void *data)
{
	struct pglist_data *pgdat = data;
	int cpu;

	for_each_present_cpu(cpu) {
		if (cpu_to_node(cpu) == pgdat->node_id)
			/*
			 * the cpu on this node isn't removed, and we can't
			 * offline this node.
			 */
			return -EBUSY;
	}

	return 0;
}

static void unmap_cpu_on_node(void *data)
{
#ifdef CONFIG_ACPI_NUMA
	struct pglist_data *pgdat = data;
	int cpu;

	for_each_possible_cpu(cpu)
		if (cpu_to_node(cpu) == pgdat->node_id)
			numa_clear_node(cpu);
#endif
}

static int check_and_unmap_cpu_on_node(void *data)
{
	int ret = check_cpu_on_node(data);

	if (ret)
		return ret;

	/*
	 * the node will be offlined when we come here, so we can clear
	 * the cpu_to_node() now.
	 */

	unmap_cpu_on_node(data);
	return 0;
}

/* offline the node if all memory sections of this node are removed */
void try_offline_node(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	unsigned long start_pfn = pgdat->node_start_pfn;
	unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
	unsigned long pfn;
	struct page *pgdat_page = virt_to_page(pgdat);
	int i;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);

		if (!present_section_nr(section_nr))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/*
		 * some memory sections of this node are not removed, and we
		 * can't offline this node now.
		 */
		return;
	}

	if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL))
		return;

	/*
	 * all memory/cpu of this node are removed, we can offline this
	 * node now.
	 */
	node_set_offline(nid);
	unregister_one_node(nid);

	if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
		/* node data is allocated from boot memory */
		return;

	/* free the wait table in each zone */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		struct zone *zone = pgdat->node_zones + i;

		/*
		 * wait_table may be allocated from boot memory;
		 * free it here only if it was allocated by vmalloc.
		 */
1769 */ 1770 if (is_vmalloc_addr(zone->wait_table)) 1771 vfree(zone->wait_table); 1772 } 1773 1774 /* 1775 * Since there is no way to guarentee the address of pgdat/zone is not 1776 * on stack of any kernel threads or used by other kernel objects 1777 * without reference counting or other symchronizing method, do not 1778 * reset node_data and free pgdat here. Just reset it to 0 and reuse 1779 * the memory when the node is online again. 1780 */ 1781 memset(pgdat, 0, sizeof(*pgdat)); 1782 } 1783 EXPORT_SYMBOL(try_offline_node); 1784 1785 void __ref remove_memory(int nid, u64 start, u64 size) 1786 { 1787 int ret; 1788 1789 lock_memory_hotplug(); 1790 1791 /* 1792 * All memory blocks must be offlined before removing memory. Check 1793 * whether all memory blocks in question are offline and trigger a BUG() 1794 * if this is not the case. 1795 */ 1796 ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, 1797 is_memblock_offlined_cb); 1798 if (ret) { 1799 unlock_memory_hotplug(); 1800 BUG(); 1801 } 1802 1803 /* remove memmap entry */ 1804 firmware_map_remove(start, start + size, "System RAM"); 1805 1806 arch_remove_memory(start, size); 1807 1808 try_offline_node(nid); 1809 1810 unlock_memory_hotplug(); 1811 } 1812 EXPORT_SYMBOL_GPL(remove_memory); 1813 #endif /* CONFIG_MEMORY_HOTREMOVE */ 1814