/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>

#include <asm/tlbflush.h>

#include "internal.h"

/*
 * online_page_callback contains a pointer to the current page onlining
 * function.  Initially it is generic_online_page().  If required, it can be
 * changed by calling set_online_page_callback() for callback registration
 * and restore_online_page_callback() to restore the generic callback.
 */

static void generic_online_page(struct page *page);

static online_page_callback_t online_page_callback = generic_online_page;

DEFINE_MUTEX(mem_hotplug_mutex);

void lock_memory_hotplug(void)
{
	mutex_lock(&mem_hotplug_mutex);

	/* for exclusive hibernation if CONFIG_HIBERNATION=y */
	lock_system_sleep();
}

void unlock_memory_hotplug(void)
{
	unlock_system_sleep();
	mutex_unlock(&mem_hotplug_mutex);
}


/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;
	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
	BUG_ON(!res);

	res->name = "System RAM";
	res->start = start;
	res->end = start + size - 1;
	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
	if (request_resource(&iomem_resource, res) < 0) {
		printk("System RAM resource %pR cannot be added\n", res);
		kfree(res);
		res = NULL;
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
	return;
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
void get_page_bootmem(unsigned long info, struct page *page,
		      unsigned long type)
{
	page->lru.next = (struct list_head *) type;
	SetPagePrivate(page);
	set_page_private(page, info);
	atomic_inc(&page->_count);
}

/* reference to __meminit __free_pages_bootmem is valid
 * so use __ref to tell modpost not to generate a warning */
void __ref put_page_bootmem(struct page *page)
{
	unsigned long type;
	static DEFINE_MUTEX(ppb_lock);

	type = (unsigned long) page->lru.next;
	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);

	if (atomic_dec_return(&page->_count) == 1) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		INIT_LIST_HEAD(&page->lru);

		/*
		 * Please refer to the comment for __free_pages_bootmem()
		 * for why we serialize here.
		 */
		mutex_lock(&ppb_lock);
		__free_pages_bootmem(page, 0);
		mutex_unlock(&ppb_lock);
		totalram_pages++;
	}

}
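/*
 * Illustrative sketch of the pairing used below (when
 * CONFIG_HAVE_BOOTMEM_INFO_NODE is set) in register_page_bootmem_info_section():
 * for a page backing a section's memmap, the bootmem info is recorded
 * roughly as
 *
 *	get_page_bootmem(section_nr, page, SECTION_INFO);
 *
 * which stores SECTION_INFO in page->lru.next, the section number in
 * page_private() and takes a reference on the page.  A later
 * put_page_bootmem() drops that reference and, once the bootmem users are
 * gone, hands the page back to the buddy allocator via
 * __free_pages_bootmem().
 */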

#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	/* Get section's memmap address */
	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	/*
	 * Get page for the memmap's phys address
	 * XXX: need more consideration for sparse_vmemmap...
	 */
	page = virt_to_page(memmap);
	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

	/* remember memmap's page */
	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, SECTION_INFO);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);

}
#else /* CONFIG_SPARSEMEM_VMEMMAP */
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	if (!pfn_valid(start_pfn))
		return;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

void register_page_bootmem_info_node(struct pglist_data *pgdat)
{
	unsigned long i, pfn, end_pfn, nr_pages;
	int node = pgdat->node_id;
	struct page *page;
	struct zone *zone;

	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
	page = virt_to_page(pgdat);

	for (i = 0; i < nr_pages; i++, page++)
		get_page_bootmem(node, page, NODE_INFO);

	zone = &pgdat->node_zones[0];
	for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
		if (zone->wait_table) {
			nr_pages = zone->wait_table_hash_nr_entries
				* sizeof(wait_queue_head_t);
			nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
			page = virt_to_page(zone->wait_table);

			for (i = 0; i < nr_pages; i++, page++)
				get_page_bootmem(node, page, NODE_INFO);
		}
	}

	pfn = pgdat->node_start_pfn;
	end_pfn = pgdat_end_pfn(pgdat);

	/* register section info */
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		/*
		 * Some platforms can assign the same pfn to multiple nodes - on
		 * node0 as well as nodeN.  To avoid registering a pfn against
		 * multiple nodes, we check that this pfn does not already
		 * reside in some other node.
		 */
		if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
			register_page_bootmem_info_section(pfn);
	}
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */

static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
			   unsigned long end_pfn)
{
	unsigned long old_zone_end_pfn;

	zone_span_writelock(zone);

	old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
				zone->zone_start_pfn;

	zone_span_writeunlock(zone);
}

static void resize_zone(struct zone *zone, unsigned long start_pfn,
			unsigned long end_pfn)
{
	zone_span_writelock(zone);

	if (end_pfn - start_pfn) {
		zone->zone_start_pfn = start_pfn;
		zone->spanned_pages = end_pfn - start_pfn;
	} else {
		/*
		 * Keep this consistent with free_area_init_core():
		 * if spanned_pages == 0, keep zone_start_pfn == 0 as well.
		 */
		zone->zone_start_pfn = 0;
		zone->spanned_pages = 0;
	}

	zone_span_writeunlock(zone);
}

static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
			unsigned long end_pfn)
{
	enum zone_type zid = zone_idx(zone);
	int nid = zone->zone_pgdat->node_id;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn++)
		set_page_links(pfn_to_page(pfn), zid, nid, pfn);
}

/*
 * Can fail with -ENOMEM from allocating a wait table with vmalloc() or
 * alloc_bootmem_node_nopanic().
 */
static int __ref ensure_zone_is_initialized(struct zone *zone,
			unsigned long start_pfn, unsigned long num_pages)
{
	if (!zone_is_initialized(zone))
		return init_currently_empty_zone(zone, start_pfn, num_pages,
						 MEMMAP_HOTPLUG);
	return 0;
}

static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z1_start_pfn;

	ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are higher than @z2 */
	if (end_pfn > zone_end_pfn(z2))
		goto out_fail;
	/* the moved-out part must be at the leftmost of @z2 */
	if (start_pfn > z2->zone_start_pfn)
		goto out_fail;
	/* must include/overlap */
	if (end_pfn <= z2->zone_start_pfn)
		goto out_fail;

	/* use start_pfn for z1's start_pfn if z1 is empty */
	if (z1->spanned_pages)
		z1_start_pfn = z1->zone_start_pfn;
	else
		z1_start_pfn = start_pfn;

	resize_zone(z1, z1_start_pfn, end_pfn);
	resize_zone(z2, end_pfn, zone_end_pfn(z2));

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z1, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z2_end_pfn;

	ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are lower than @z1 */
	if (z1->zone_start_pfn > start_pfn)
		goto out_fail;
	/* the moved-out part must be at the rightmost of @z1 */
	if (zone_end_pfn(z1) > end_pfn)
		goto out_fail;
	/* must include/overlap */
	if (start_pfn >= zone_end_pfn(z1))
		goto out_fail;

	/* use end_pfn for z2's end_pfn if z2 is empty */
	if (z2->spanned_pages)
		z2_end_pfn = zone_end_pfn(z2);
	else
		z2_end_pfn = end_pfn;

	resize_zone(z1, z1->zone_start_pfn, start_pfn);
	resize_zone(z2, start_pfn, z2_end_pfn);

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z2, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
			    unsigned long end_pfn)
{
	unsigned long old_pgdat_end_pfn =
		pgdat->node_start_pfn + pgdat->node_spanned_pages;

	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
					pgdat->node_start_pfn;
}

static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int nid = pgdat->node_id;
	int zone_type;
	unsigned long flags;
	int ret;

	zone_type = zone - pgdat->node_zones;
	ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
	if (ret)
		return ret;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
			phys_start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
	memmap_init_zone(nr_pages, nid, zone_type,
			 phys_start_pfn, MEMMAP_HOTPLUG);
	return 0;
}

static int __meminit __add_section(int nid, struct zone *zone,
					unsigned long phys_start_pfn)
{
	int nr_pages = PAGES_PER_SECTION;
	int ret;

	if (pfn_valid(phys_start_pfn))
		return -EEXIST;

	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);

	if (ret < 0)
		return ret;

	ret = __add_zone(zone, phys_start_pfn);

	if (ret < 0)
		return ret;

	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}

/*
 * Reasonably generic function for adding memory.  It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
			unsigned long nr_pages)
{
	unsigned long i;
	int err = 0;
	int start_sec, end_sec;
	/* when initializing the mem_map, align the hot-added range to sections */
	start_sec = pfn_to_section_nr(phys_start_pfn);
	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

	for (i = start_sec; i <= end_sec; i++) {
		err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);

		/*
		 * -EEXIST is finally dealt with by the ioresource collision
		 * check, see add_memory() => register_memory_resource().
		 * A warning will be printed if there is a collision.
		 */
		if (err && (err != -EEXIST))
			break;
		err = 0;
	}

	return err;
}
EXPORT_SYMBOL_GPL(__add_pages);
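/*
 * Rough sketch of the arch side (modelled loosely on x86_64, simplified):
 * an arch_add_memory() implementation picks a zone, maps the range and then
 * hands the pfn range to __add_pages().  Illustration only, not the
 * authoritative version of any particular architecture:
 *
 *	int __ref arch_add_memory(int nid, u64 start, u64 size)
 *	{
 *		struct pglist_data *pgdat = NODE_DATA(nid);
 *		struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
 *		unsigned long start_pfn = start >> PAGE_SHIFT;
 *		unsigned long nr_pages = size >> PAGE_SHIFT;
 *
 *		... map the physical range into the kernel page tables ...
 *
 *		return __add_pages(nid, zone, start_pfn, nr_pages);
 *	}
 */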

#ifdef CONFIG_MEMORY_HOTREMOVE
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static int find_smallest_section_pfn(int nid, struct zone *zone,
				     unsigned long start_pfn,
				     unsigned long end_pfn)
{
	struct mem_section *ms;

	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(start_pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(start_pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
			continue;

		return start_pfn;
	}

	return 0;
}

/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
static int find_biggest_section_pfn(int nid, struct zone *zone,
				    unsigned long start_pfn,
				    unsigned long end_pfn)
{
	struct mem_section *ms;
	unsigned long pfn;

	/* pfn is the end pfn of a memory section. */
	pfn = end_pfn - 1;
	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(pfn)))
			continue;

		return pfn;
	}

	return 0;
}

static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
			     unsigned long end_pfn)
{
	unsigned long zone_start_pfn = zone->zone_start_pfn;
	unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = zone_to_nid(zone);

	zone_span_writelock(zone);
	if (zone_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the zone, we need
		 * to shrink zone->zone_start_pfn and zone->spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
						zone_end_pfn);
		if (pfn) {
			zone->zone_start_pfn = pfn;
			zone->spanned_pages = zone_end_pfn - pfn;
		}
	} else if (zone_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the zone, we only
		 * need to shrink zone->spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
					       start_pfn);
		if (pfn)
			zone->spanned_pages = pfn - zone_start_pfn + 1;
	}

	/*
	 * If the section is neither the biggest nor the smallest mem_section
	 * in the zone, it only creates a hole in the zone.  So in this case
	 * we need not change the zone.  But the zone may now consist of
	 * nothing but holes, so check whether it still has a valid section.
	 */
	pfn = zone_start_pfn;
	for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (page_zone(pfn_to_page(pfn)) != zone)
			continue;

		/* If the section is the one being removed, keep looking */
		if (start_pfn == pfn)
			continue;

		/* We found a valid section, so there is nothing to do */
		zone_span_writeunlock(zone);
		return;
	}

	/* The zone has no valid section */
	zone->zone_start_pfn = 0;
	zone->spanned_pages = 0;
	zone_span_writeunlock(zone);
}

static void shrink_pgdat_span(struct pglist_data *pgdat,
			      unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
	unsigned long pgdat_end_pfn =
		pgdat->node_start_pfn + pgdat->node_spanned_pages;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = pgdat->node_id;

	if (pgdat_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the pgdat, we need
		 * to shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the pgdat.
		 */
		pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
						pgdat_end_pfn);
		if (pfn) {
			pgdat->node_start_pfn = pfn;
			pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
		}
	} else if (pgdat_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the pgdat, we only
		 * need to shrink pgdat->node_spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the pgdat.
		 */
		pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
					       start_pfn);
		if (pfn)
			pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
	}

	/*
	 * If the section is neither the biggest nor the smallest mem_section
	 * in the pgdat, it only creates a hole in the pgdat.  So in this case
	 * we need not change the pgdat.  But the pgdat may now consist of
	 * nothing but holes, so check whether it still has a valid section.
	 */
	pfn = pgdat_start_pfn;
	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/* If the section is the one being removed, keep looking */
		if (start_pfn == pfn)
			continue;

		/* We found a valid section, so there is nothing to do */
		return;
	}

	/* The pgdat has no valid section */
	pgdat->node_start_pfn = 0;
	pgdat->node_spanned_pages = 0;
}

static void __remove_zone(struct zone *zone, unsigned long start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int zone_type;
	unsigned long flags;

	zone_type = zone - pgdat->node_zones;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
}

static int __remove_section(struct zone *zone, struct mem_section *ms)
{
	unsigned long start_pfn;
	int scn_nr;
	int ret = -EINVAL;

	if (!valid_section(ms))
		return ret;

	ret = unregister_memory_section(ms);
	if (ret)
		return ret;

	scn_nr = __section_nr(ms);
	start_pfn = section_nr_to_pfn(scn_nr);
	__remove_zone(zone, start_pfn);

	sparse_remove_one_section(zone, ms);
	return 0;
}

/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be a multiple of the section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing.  The caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
		 unsigned long nr_pages)
{
	unsigned long i;
	int sections_to_remove;
	resource_size_t start, size;
	int ret = 0;

	/*
	 * We can only remove entire sections
	 */
	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
	BUG_ON(nr_pages % PAGES_PER_SECTION);

	start = phys_start_pfn << PAGE_SHIFT;
	size = nr_pages * PAGE_SIZE;
	ret = release_mem_region_adjustable(&iomem_resource, start, size);
	if (ret) {
		resource_size_t endres = start + size - 1;

		pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
				&start, &endres, ret);
	}

	sections_to_remove = nr_pages / PAGES_PER_SECTION;
	for (i = 0; i < sections_to_remove; i++) {
		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
		ret = __remove_section(zone, __pfn_to_section(pfn));
		if (ret)
			break;
	}
	return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);
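/*
 * Worked example (assuming x86_64 defaults of 4 KiB pages and 128 MiB
 * sections, i.e. PAGES_PER_SECTION == 32768): removing one 128 MiB memory
 * block means calling __remove_pages() with a section-aligned
 * phys_start_pfn and nr_pages == 32768, which is exactly one iteration of
 * the loop above.  A request that is not section-aligned trips the
 * BUG_ON()s at the top of __remove_pages().
 */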
#endif /* CONFIG_MEMORY_HOTREMOVE */

int set_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	lock_memory_hotplug();

	if (online_page_callback == generic_online_page) {
		online_page_callback = callback;
		rc = 0;
	}

	unlock_memory_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(set_online_page_callback);

int restore_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	lock_memory_hotplug();

	if (online_page_callback == callback) {
		online_page_callback = generic_online_page;
		rc = 0;
	}

	unlock_memory_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);

void __online_page_set_limits(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);

	if (pfn >= num_physpages)
		num_physpages = pfn + 1;
}
EXPORT_SYMBOL_GPL(__online_page_set_limits);

void __online_page_increment_counters(struct page *page)
{
	totalram_pages++;

#ifdef CONFIG_HIGHMEM
	if (PageHighMem(page))
		totalhigh_pages++;
#endif
}
EXPORT_SYMBOL_GPL(__online_page_increment_counters);

void __online_page_free(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
}
EXPORT_SYMBOL_GPL(__online_page_free);

static void generic_online_page(struct page *page)
{
	__online_page_set_limits(page);
	__online_page_increment_counters(page);
	__online_page_free(page);
}
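/*
 * Usage sketch (hypothetical driver code, in the style of a balloon
 * driver): a module that wants to intercept onlined pages registers a
 * callback with the same signature as generic_online_page() and restores
 * the generic one on teardown:
 *
 *	static void my_online_page(struct page *page)
 *	{
 *		__online_page_set_limits(page);
 *		// driver-specific bookkeeping instead of freeing the page
 *	}
 *
 *	ret = set_online_page_callback(&my_online_page);
 *	...
 *	restore_online_page_callback(&my_online_page);
 *
 * Only one callback can be registered at a time; set_online_page_callback()
 * returns -EINVAL if another callback is already in place.
 */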

static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			void *arg)
{
	unsigned long i;
	unsigned long onlined_pages = *(unsigned long *)arg;
	struct page *page;
	if (PageReserved(pfn_to_page(start_pfn)))
		for (i = 0; i < nr_pages; i++) {
			page = pfn_to_page(start_pfn + i);
			(*online_page_callback)(page);
			onlined_pages++;
		}
	*(unsigned long *)arg = onlined_pages;
	return 0;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
 * normal memory.
 */
static bool can_online_high_movable(struct zone *zone)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure every online node has NORMAL memory */
static bool can_online_high_movable(struct zone *zone)
{
	return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which state of node_states will be changed when onlining memory */
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);
	enum zone_type zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * If the memory to be onlined is in a zone of 0...zone_last, and
	 * the zones of 0...zone_last don't have memory before onlining, we
	 * will need to set the node to node_states[N_NORMAL_MEMORY] after
	 * the memory is onlined.
	 */
	if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * If the node doesn't have memory before onlining, we will need to
	 * set the node to node_states[N_MEMORY] after the memory
	 * is onlined.
	 */
	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
	else
		arg->status_change_nid = -1;
}

static void node_states_set_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_set_state(node, N_HIGH_MEMORY);

	node_set_state(node, N_MEMORY);
}


int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
	unsigned long onlined_pages = 0;
	struct zone *zone;
	int need_zonelists_rebuild = 0;
	int nid;
	int ret;
	struct memory_notify arg;

	lock_memory_hotplug();
	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
	 * memory_block->state_mutex.
	 */
	zone = page_zone(pfn_to_page(pfn));

	if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
	    !can_online_high_movable(zone)) {
		unlock_memory_hotplug();
		return -1;
	}

	if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
		if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
			unlock_memory_hotplug();
			return -1;
		}
	}
	if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
		if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
			unlock_memory_hotplug();
			return -1;
		}
	}

	/* The previous code may have changed the zone of the pfn range */
	zone = page_zone(pfn_to_page(pfn));

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_online(nr_pages, zone, &arg);

	nid = page_to_nid(pfn_to_page(pfn));

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret) {
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		unlock_memory_hotplug();
		return ret;
	}
	/*
	 * If this zone is not populated, then it is not in the zonelist.
	 * This means the page allocator ignores this zone.
	 * So, the zonelist must be updated after onlining.
	 */
	mutex_lock(&zonelists_mutex);
	if (!populated_zone(zone)) {
		need_zonelists_rebuild = 1;
		build_all_zonelists(NULL, zone);
	}

	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
		online_pages_range);
	if (ret) {
		if (need_zonelists_rebuild)
			zone_pcp_reset(zone);
		mutex_unlock(&zonelists_mutex);
		printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
		       (unsigned long long) pfn << PAGE_SHIFT,
		       (((unsigned long long) pfn + nr_pages)
			<< PAGE_SHIFT) - 1);
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		unlock_memory_hotplug();
		return ret;
	}

	zone->managed_pages += onlined_pages;
	zone->present_pages += onlined_pages;
	zone->zone_pgdat->node_present_pages += onlined_pages;
	if (onlined_pages) {
		node_states_set_node(zone_to_nid(zone), &arg);
		if (need_zonelists_rebuild)
			build_all_zonelists(NULL, NULL);
		else
			zone_pcp_update(zone);
	}

	mutex_unlock(&zonelists_mutex);

	init_per_zone_wmark_min();

	if (onlined_pages)
		kswapd_run(zone_to_nid(zone));

	vm_total_pages = nr_free_pagecache_pages();

	writeback_set_ratelimit();

	if (onlined_pages)
		memory_notify(MEM_ONLINE, &arg);
	unlock_memory_hotplug();

	return 0;
}
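/*
 * User-visible side (sketch): onlining is normally driven through the
 * memory sysfs interface rather than by calling online_pages() directly,
 * e.g. from a shell (the memory block number is system dependent):
 *
 *	# echo online         > /sys/devices/system/memory/memory32/state
 *	# echo online_kernel  > /sys/devices/system/memory/memory32/state
 *	# echo online_movable > /sys/devices/system/memory/memory32/state
 *
 * These map to the online_type values handled above; plain "online" keeps
 * the block's current zone.  See Documentation/memory-hotplug.txt.
 */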
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
	struct pglist_data *pgdat;
	unsigned long zones_size[MAX_NR_ZONES] = {0};
	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	unsigned long start_pfn = start >> PAGE_SHIFT;

	pgdat = NODE_DATA(nid);
	if (!pgdat) {
		pgdat = arch_alloc_nodedata(nid);
		if (!pgdat)
			return NULL;

		arch_refresh_nodedata(nid, pgdat);
	}

	/* we can use NODE_DATA(nid) from here */

	/* init node's zones as empty zones, we don't have any present pages.*/
	free_area_init_node(nid, zones_size, start_pfn, zholes_size);

	/*
	 * The node we allocated has no zone fallback lists.  To avoid
	 * accessing a not-initialized zonelist, build the zonelists here.
	 */
	mutex_lock(&zonelists_mutex);
	build_all_zonelists(pgdat, NULL);
	mutex_unlock(&zonelists_mutex);

	return pgdat;
}

static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
	arch_refresh_nodedata(nid, NULL);
	arch_free_nodedata(pgdat);
	return;
}


/*
 * called by cpu_up() to online a node without onlined memory.
 */
int mem_online_node(int nid)
{
	pg_data_t *pgdat;
	int ret;

	lock_memory_hotplug();
	pgdat = hotadd_new_pgdat(nid, 0);
	if (!pgdat) {
		ret = -ENOMEM;
		goto out;
	}
	node_set_online(nid);
	ret = register_one_node(nid);
	BUG_ON(ret);

out:
	unlock_memory_hotplug();
	return ret;
}

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
int __ref add_memory(int nid, u64 start, u64 size)
{
	pg_data_t *pgdat = NULL;
	bool new_pgdat;
	bool new_node;
	struct resource *res;
	int ret;

	lock_memory_hotplug();

	res = register_memory_resource(start, size);
	ret = -EEXIST;
	if (!res)
		goto out;

	{	/* Stupid hack to suppress address-never-null warning */
		void *p = NODE_DATA(nid);
		new_pgdat = !p;
	}
	new_node = !node_online(nid);
	if (new_node) {
		pgdat = hotadd_new_pgdat(nid, start);
		ret = -ENOMEM;
		if (!pgdat)
			goto error;
	}

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size);

	if (ret < 0)
		goto error;

	/* we online the node here.  we can't roll back from here. */
	node_set_online(nid);

	if (new_node) {
		ret = register_one_node(nid);
		/*
		 * If the sysfs file of the new node can't be created, cpus on
		 * the node can't be hot-added.  There is no way to roll back
		 * now, so check it with BUG_ON() to catch it, reluctantly...
		 */
		BUG_ON(ret);
	}

	/* create new memmap entry */
	firmware_map_add_hotplug(start, start + size, "System RAM");

	goto out;

error:
	/* rollback pgdat allocation and others */
	if (new_pgdat)
		rollback_node_hotadd(nid, pgdat);
	release_memory_resource(res);

out:
	unlock_memory_hotplug();
	return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
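/*
 * Usage sketch: add_memory() is normally invoked by the platform driver
 * that discovers the new range; for example the ACPI memory-hotplug driver
 * (drivers/acpi/acpi_memhotplug.c) does, roughly,
 *
 *	result = add_memory(node, info->start_addr, info->length);
 *
 * and, depending on the configuration, there is also the sysfs probe
 * interface, e.g.
 *
 *	# echo 0x200000000 > /sys/devices/system/memory/probe
 *
 * where the physical address is of course system dependent.
 */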

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
 * set and the size of the free page is given by page_order().  Using this,
 * the function determines if the pageblock contains only free pages.
 * Due to buddy constraints, a free page at least the size of a pageblock will
 * be located at the start of the pageblock.
 */
static inline int pageblock_free(struct page *page)
{
	return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
	/* Ensure the starting page is pageblock-aligned */
	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));

	/* If the entire pageblock is free, move to the end of the free page */
	if (pageblock_free(page)) {
		int order;
		/* be careful. we don't have locks, page_order can be changed.*/
		order = page_order(page);
		if ((order < MAX_ORDER) && (order >= pageblock_order))
			return page + (1 << order);
	}

	return page + pageblock_nr_pages;
}

/* Checks if this range of memory is likely to be hot-removable. */
int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
	struct page *page = pfn_to_page(start_pfn);
	struct page *end_page = page + nr_pages;

	/* Check the starting page of each pageblock within the range */
	for (; page < end_page; page = next_active_pageblock(page)) {
		if (!is_pageblock_removable_nolock(page))
			return 0;
		cond_resched();
	}

	/* All pageblocks in the memory block are likely to be hot-removable */
	return 1;
}

/*
 * Confirm that all pages in the range [start, end) belong to the same zone.
 */
static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct zone *zone = NULL;
	struct page *page;
	int i;
	for (pfn = start_pfn;
	     pfn < end_pfn;
	     pfn += MAX_ORDER_NR_PAGES) {
		i = 0;
		/* This is just a CONFIG_HOLES_IN_ZONE check.*/
		while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
			i++;
		if (i == MAX_ORDER_NR_PAGES)
			continue;
		page = pfn_to_page(pfn + i);
		if (zone && page_zone(page) != zone)
			return 0;
		zone = page_zone(page);
	}
	return 1;
}

/*
 * Scanning pfns is much easier than scanning the LRU lists.
 * Scan pfns from start to end and find the first LRU page.
 */
static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
{
	unsigned long pfn;
	struct page *page;
	for (pfn = start; pfn < end; pfn++) {
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageLRU(page))
				return pfn;
		}
	}
	return 0;
}

#define NR_OFFLINE_AT_ONCE_PAGES	(256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page;
	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
	int not_managed = 0;
	int ret = 0;
	LIST_HEAD(source);

	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		if (!get_page_unless_zero(page))
			continue;
		/*
		 * We can skip free pages.  And we can only deal with pages on
		 * the LRU.
		 */
		ret = isolate_lru_page(page);
		if (!ret) { /* Success */
			put_page(page);
			list_add_tail(&page->lru, &source);
			move_pages--;
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));

		} else {
#ifdef CONFIG_DEBUG_VM
			printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
			       pfn);
			dump_page(page);
#endif
			put_page(page);
			/* Because we don't have a big zone->lock, we should
			   check this again here. */
			if (page_count(page)) {
				not_managed++;
				ret = -EBUSY;
				break;
			}
		}
	}
	if (!list_empty(&source)) {
		if (not_managed) {
			putback_lru_pages(&source);
			goto out;
		}

		/*
		 * alloc_migrate_target() should be improved!
		 * migrate_pages() returns the number of failed pages.
		 */
		ret = migrate_pages(&source, alloc_migrate_target, 0,
					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
		if (ret)
			putback_lru_pages(&source);
	}
out:
	return ret;
}

/*
 * remove from free_area[] and mark all as Reserved.
 */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
			void *data)
{
	__offline_isolated_pages(start, start + nr_pages);
	return 0;
}

static void
offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
				offline_isolated_pages_cb);
}

/*
 * Check that all pages in the range, recorded as a memory resource, are
 * isolated.
 */
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
			void *data)
{
	int ret;
	long offlined = *(long *)data;
	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
	offlined = nr_pages;
	if (!ret)
		*(long *)data += offlined;
	return ret;
}

static long
check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
	long offlined = 0;
	int ret;

	ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
			check_pages_isolated_cb);
	if (ret < 0)
		offlined = (long)ret;
	return offlined;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
 * normal memory.
 */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure the node has NORMAL memory if it is still online */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt;

	for (zt = 0; zt <= ZONE_NORMAL; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	if (present_pages > nr_pages)
		return true;

	present_pages = 0;
	for (; zt <= ZONE_MOVABLE; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	/*
	 * we can't offline the last normal memory until all
	 * higher memory is offlined.
	 */
	return present_pages == 0;
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which state of node_states will be changed when offlining memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
		struct zone *zone, struct memory_notify *arg)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt, zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
	 * If the memory to be offlined is in a zone of 0...zone_last,
	 * and it is the last present memory, 0...zone_last will
	 * become empty after offlining, thus we can determine that we will
	 * need to clear the node from node_states[N_NORMAL_MEMORY].
	 */
	for (zt = 0; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_normal = zone_to_nid(zone);
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_high = zone_to_nid(zone);
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
	 */
	zone_last = ZONE_MOVABLE;

	/*
	 * Check whether node_states[N_HIGH_MEMORY] will be changed.
	 * If we try to offline the last present @nr_pages from the node,
	 * we can determine that we will need to clear the node from
	 * node_states[N_HIGH_MEMORY].
	 */
	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (nr_pages >= present_pages)
		arg->status_change_nid = zone_to_nid(zone);
	else
		arg->status_change_nid = -1;
}

static void node_states_clear_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_clear_state(node, N_NORMAL_MEMORY);

	if ((N_MEMORY != N_NORMAL_MEMORY) &&
	    (arg->status_change_nid_high >= 0))
		node_clear_state(node, N_HIGH_MEMORY);

	if ((N_MEMORY != N_HIGH_MEMORY) &&
	    (arg->status_change_nid >= 0))
		node_clear_state(node, N_MEMORY);
}

static int __ref __offline_pages(unsigned long start_pfn,
		  unsigned long end_pfn, unsigned long timeout)
{
	unsigned long pfn, nr_pages, expire;
	long offlined_pages;
	int ret, drain, retry_max, node;
	struct zone *zone;
	struct memory_notify arg;

	BUG_ON(start_pfn >= end_pfn);
	/* at least, alignment against pageblock is necessary */
	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
		return -EINVAL;
	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
		return -EINVAL;
	/* This makes hotplug much easier...and readable.
	   we assume this for now. */
	if (!test_pages_in_a_zone(start_pfn, end_pfn))
		return -EINVAL;

	lock_memory_hotplug();

	zone = page_zone(pfn_to_page(start_pfn));
	node = zone_to_nid(zone);
	nr_pages = end_pfn - start_pfn;

	ret = -EINVAL;
	if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
		goto out;

	/* set above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn,
				       MIGRATE_MOVABLE, true);
	if (ret)
		goto out;

	arg.start_pfn = start_pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_offline(nr_pages, zone, &arg);

	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_removal;

	pfn = start_pfn;
	expire = jiffies + timeout;
	drain = 0;
	retry_max = 5;
repeat:
	/* start memory hot removal */
	ret = -EAGAIN;
	if (time_after(jiffies, expire))
		goto failed_removal;
	ret = -EINTR;
	if (signal_pending(current))
		goto failed_removal;
	ret = 0;
	if (drain) {
		lru_add_drain_all();
		cond_resched();
		drain_all_pages();
	}

	pfn = scan_lru_pages(start_pfn, end_pfn);
	if (pfn) { /* We have a page on the LRU */
		ret = do_migrate_range(pfn, end_pfn);
		if (!ret) {
			drain = 1;
			goto repeat;
		} else {
			if (ret < 0)
				if (--retry_max == 0)
					goto failed_removal;
			yield();
			drain = 1;
			goto repeat;
		}
	}
	/* drain all zones' lru pagevecs, this is asynchronous... */
	lru_add_drain_all();
	yield();
	/* drain pcp pages, this is synchronous. */
	drain_all_pages();
	/* check again */
	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
	if (offlined_pages < 0) {
		ret = -EBUSY;
		goto failed_removal;
	}
	printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
	/* Ok, all of our target is isolated.
	   We cannot do rollback at this point. */
	offline_isolated_pages(start_pfn, end_pfn);
	/* reset pageblock flags and make the migrate type MOVABLE again */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	/* removal success */
	zone->managed_pages -= offlined_pages;
	zone->present_pages -= offlined_pages;
	zone->zone_pgdat->node_present_pages -= offlined_pages;
	totalram_pages -= offlined_pages;

	init_per_zone_wmark_min();

	if (!populated_zone(zone)) {
		zone_pcp_reset(zone);
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(NULL, NULL);
		mutex_unlock(&zonelists_mutex);
	} else
		zone_pcp_update(zone);

	node_states_clear_node(node, &arg);
	if (arg.status_change_nid >= 0)
		kswapd_stop(node);

	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();

	memory_notify(MEM_OFFLINE, &arg);
	unlock_memory_hotplug();
	return 0;

failed_removal:
	printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
	       (unsigned long long) start_pfn << PAGE_SHIFT,
	       ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
	memory_notify(MEM_CANCEL_OFFLINE, &arg);
	/* pushback to free area */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);

out:
	unlock_memory_hotplug();
	return ret;
}

int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
}
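/*
 * User-visible side (sketch): offlining is normally triggered through
 * sysfs, e.g.
 *
 *	# echo offline > /sys/devices/system/memory/memory32/state
 *
 * which ends up in offline_pages() for the block's pfn range.  Note the
 * 120 * HZ timeout above: if the pages cannot be isolated and migrated
 * away within roughly two minutes, the offline attempt fails with -EAGAIN
 * (or -EBUSY/-EINTR, see __offline_pages()).
 */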

/**
 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
 * @start_pfn: start pfn of the memory range
 * @end_pfn: end pfn of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory section walked
 *
 * This function walks through all present mem sections in the range
 * [start_pfn, end_pfn) and calls func on each mem section.
 *
 * Returns the return value of func.
 */
static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
		void *arg, int (*func)(struct memory_block *, void *))
{
	struct memory_block *mem = NULL;
	struct mem_section *section;
	unsigned long pfn, section_nr;
	int ret;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		section_nr = pfn_to_section_nr(pfn);
		if (!present_section_nr(section_nr))
			continue;

		section = __nr_to_section(section_nr);
		/* same memblock? */
		if (mem)
			if ((section_nr >= mem->start_section_nr) &&
			    (section_nr <= mem->end_section_nr))
				continue;

		mem = find_memory_block_hinted(section, mem);
		if (!mem)
			continue;

		ret = func(mem, arg);
		if (ret) {
			kobject_put(&mem->dev.kobj);
			return ret;
		}
	}

	if (mem)
		kobject_put(&mem->dev.kobj);

	return 0;
}

/**
 * offline_memory_block_cb - callback function for offlining a memory block
 * @mem: the memory block to be offlined
 * @arg: buffer to hold the error msg
 *
 * Always returns 0, and puts the error msg in arg if any.
 */
static int offline_memory_block_cb(struct memory_block *mem, void *arg)
{
	int *ret = arg;
	int error = offline_memory_block(mem);

	if (error != 0 && *ret == 0)
		*ret = error;

	return 0;
}

static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
	int ret = !is_memblock_offlined(mem);

	if (unlikely(ret)) {
		phys_addr_t beginpa, endpa;

		beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
		endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
		pr_warn("removing memory fails, because memory "
			"[%pa-%pa] is onlined\n",
			&beginpa, &endpa);
	}

	return ret;
}

static int check_cpu_on_node(void *data)
{
	struct pglist_data *pgdat = data;
	int cpu;

	for_each_present_cpu(cpu) {
		if (cpu_to_node(cpu) == pgdat->node_id)
			/*
			 * a cpu on this node hasn't been removed, so we
			 * can't offline this node.
			 */
			return -EBUSY;
	}

	return 0;
}

static void unmap_cpu_on_node(void *data)
{
#ifdef CONFIG_ACPI_NUMA
	struct pglist_data *pgdat = data;
	int cpu;

	for_each_possible_cpu(cpu)
		if (cpu_to_node(cpu) == pgdat->node_id)
			numa_clear_node(cpu);
#endif
}

static int check_and_unmap_cpu_on_node(void *data)
{
	int ret = check_cpu_on_node(data);

	if (ret)
		return ret;

	/*
	 * the node will be offlined when we come here, so we can clear
	 * the cpu_to_node() now.
	 */

	unmap_cpu_on_node(data);
	return 0;
}

/* offline the node if all memory sections of this node are removed */
void try_offline_node(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	unsigned long start_pfn = pgdat->node_start_pfn;
	unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
	unsigned long pfn;
	struct page *pgdat_page = virt_to_page(pgdat);
	int i;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);

		if (!present_section_nr(section_nr))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/*
		 * some memory sections of this node have not been removed, so
		 * we can't offline the node now.
		 */
		return;
	}

	if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL))
		return;

	/*
	 * all memory/cpus of this node have been removed, so we can offline
	 * the node now.
	 */
	node_set_offline(nid);
	unregister_one_node(nid);

	if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
		/* node data is allocated from boot memory */
		return;

	/* free the wait table in each zone */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		struct zone *zone = pgdat->node_zones + i;

		/*
		 * wait_table may be allocated from boot memory,
		 * here only free it if it was allocated by vmalloc.
		 */
		if (is_vmalloc_addr(zone->wait_table))
			vfree(zone->wait_table);
	}

	/*
	 * Since there is no way to guarantee that the address of pgdat/zone
	 * is not on the stack of any kernel thread or used by other kernel
	 * objects without reference counting or another synchronizing method,
	 * do not reset node_data and free pgdat here.  Just reset it to 0 and
	 * reuse the memory when the node is onlined again.
	 */
	memset(pgdat, 0, sizeof(*pgdat));
}
EXPORT_SYMBOL(try_offline_node);
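/*
 * Usage sketch: remove_memory() below is the counterpart of add_memory()
 * and is called by the platform code that handles the eject; e.g. the ACPI
 * memory-hotplug driver does, roughly,
 *
 *	remove_memory(node, info->start_addr, info->length);
 *
 * after the firmware has signalled that the device is going away.  All of
 * the range's memory blocks must end up offline (see the two-pass walk in
 * remove_memory()) before the mappings and resources are torn down.
 */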

int __ref remove_memory(int nid, u64 start, u64 size)
{
	unsigned long start_pfn, end_pfn;
	int ret = 0;
	int retry = 1;

	start_pfn = PFN_DOWN(start);
	end_pfn = PFN_UP(start + size - 1);

	/*
	 * When CONFIG_MEMCG is on, one memory block may be used by other
	 * blocks to store page cgroups when onlining pages.  But we don't
	 * know in what order pages are onlined.  So we iterate twice to
	 * offline memory:
	 * 1st iteration: offline every non-primary memory block.
	 * 2nd iteration: offline the primary (i.e. first added) memory block.
	 */
repeat:
	walk_memory_range(start_pfn, end_pfn, &ret,
			  offline_memory_block_cb);
	if (ret) {
		if (!retry)
			return ret;

		retry = 0;
		ret = 0;
		goto repeat;
	}

	lock_memory_hotplug();

	/*
	 * we have offlined all memory blocks like this:
	 *   1. lock memory hotplug
	 *   2. offline a memory block
	 *   3. unlock memory hotplug
	 *
	 * repeat steps 1-3 to offline the memory block.  All memory blocks
	 * must be offlined before removing memory.  But we don't hold the
	 * lock across the whole operation, so we should check whether all
	 * memory blocks are offlined.
	 */

	ret = walk_memory_range(start_pfn, end_pfn, NULL,
				is_memblock_offlined_cb);
	if (ret) {
		unlock_memory_hotplug();
		return ret;
	}

	/* remove memmap entry */
	firmware_map_remove(start, start + size, "System RAM");

	arch_remove_memory(start, size);

	try_offline_node(nid);

	unlock_memory_hotplug();

	return 0;
}
#else
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
	return -EINVAL;
}
int remove_memory(int nid, u64 start, u64 size)
{
	return -EINVAL;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
EXPORT_SYMBOL_GPL(remove_memory);