/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>

#include <asm/tlbflush.h>

#include "internal.h"

/*
 * online_page_callback contains a pointer to the current page onlining
 * function.  Initially it is generic_online_page().  If required, it can be
 * changed by calling set_online_page_callback() for callback registration
 * and restore_online_page_callback() to restore the generic callback.
 */

static void generic_online_page(struct page *page);

static online_page_callback_t online_page_callback = generic_online_page;

DEFINE_MUTEX(mem_hotplug_mutex);

void lock_memory_hotplug(void)
{
	mutex_lock(&mem_hotplug_mutex);

	/* for exclusive hibernation if CONFIG_HIBERNATION=y */
	lock_system_sleep();
}

void unlock_memory_hotplug(void)
{
	unlock_system_sleep();
	mutex_unlock(&mem_hotplug_mutex);
}


/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;
	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
	BUG_ON(!res);

	res->name = "System RAM";
	res->start = start;
	res->end = start + size - 1;
	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
	if (request_resource(&iomem_resource, res) < 0) {
		printk("System RAM resource %pR cannot be added\n", res);
		kfree(res);
		res = NULL;
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
	return;
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
void get_page_bootmem(unsigned long info, struct page *page,
		      unsigned long type)
{
	page->lru.next = (struct list_head *) type;
	SetPagePrivate(page);
	set_page_private(page, info);
	atomic_inc(&page->_count);
}

/* reference to __meminit __free_pages_bootmem is valid
 * so use __ref to tell modpost not to generate a warning */
void __ref put_page_bootmem(struct page *page)
{
	unsigned long type;
	static DEFINE_MUTEX(ppb_lock);

	type = (unsigned long) page->lru.next;
	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);

	if (atomic_dec_return(&page->_count) == 1) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		INIT_LIST_HEAD(&page->lru);

		/*
		 * Please refer to the comment for __free_pages_bootmem()
		 * for why we serialize here.
		 */
		mutex_lock(&ppb_lock);
		__free_pages_bootmem(page, 0);
		mutex_unlock(&ppb_lock);
		totalram_pages++;
	}

}

#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	/* Get section's memmap address */
	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	/*
	 * Get page for the memmap's phys address
	 * XXX: need more consideration for sparse_vmemmap...
	 */
	page = virt_to_page(memmap);
	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

	/* remember memmap's page */
	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, SECTION_INFO);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);

}
#else /* CONFIG_SPARSEMEM_VMEMMAP */
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	if (!pfn_valid(start_pfn))
		return;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

void register_page_bootmem_info_node(struct pglist_data *pgdat)
{
	unsigned long i, pfn, end_pfn, nr_pages;
	int node = pgdat->node_id;
	struct page *page;
	struct zone *zone;

	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
	page = virt_to_page(pgdat);

	for (i = 0; i < nr_pages; i++, page++)
		get_page_bootmem(node, page, NODE_INFO);

	zone = &pgdat->node_zones[0];
	for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
		if (zone->wait_table) {
			nr_pages = zone->wait_table_hash_nr_entries
				* sizeof(wait_queue_head_t);
			nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
			page = virt_to_page(zone->wait_table);

			for (i = 0; i < nr_pages; i++, page++)
				get_page_bootmem(node, page, NODE_INFO);
		}
	}

	pfn = pgdat->node_start_pfn;
	end_pfn = pgdat_end_pfn(pgdat);

	/* register section info */
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		/*
		 * Some platforms can assign the same pfn to multiple nodes - on
		 * node0 as well as nodeN.  To avoid registering a pfn against
		 * multiple nodes we check that this pfn does not already
		 * reside in some other node.
		 */
		if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
			register_page_bootmem_info_section(pfn);
	}
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */

static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
			   unsigned long end_pfn)
{
	unsigned long old_zone_end_pfn;

	zone_span_writelock(zone);

	old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
				zone->zone_start_pfn;

	zone_span_writeunlock(zone);
}

static void resize_zone(struct zone *zone, unsigned long start_pfn,
			unsigned long end_pfn)
{
	zone_span_writelock(zone);

	if (end_pfn - start_pfn) {
		zone->zone_start_pfn = start_pfn;
		zone->spanned_pages = end_pfn - start_pfn;
	} else {
		/*
		 * Make it consistent with free_area_init_core():
		 * if spanned_pages == 0, then keep start_pfn == 0.
		 */
		zone->zone_start_pfn = 0;
		zone->spanned_pages = 0;
	}

	zone_span_writeunlock(zone);
}

static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
			unsigned long end_pfn)
{
	enum zone_type zid = zone_idx(zone);
	int nid = zone->zone_pgdat->node_id;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn++)
		set_page_links(pfn_to_page(pfn), zid, nid, pfn);
}

/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
 * alloc_bootmem_node_nopanic() */
static int __ref ensure_zone_is_initialized(struct zone *zone,
			unsigned long start_pfn, unsigned long num_pages)
{
	if (!zone_is_initialized(zone))
		return init_currently_empty_zone(zone, start_pfn, num_pages,
						 MEMMAP_HOTPLUG);
	return 0;
}

static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z1_start_pfn;

	ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are higher than @z2 */
	if (end_pfn > zone_end_pfn(z2))
		goto out_fail;
	/* the moved-out part must be at the leftmost of @z2 */
	if (start_pfn > z2->zone_start_pfn)
		goto out_fail;
	/* must be included in / overlap @z2 */
	if (end_pfn <= z2->zone_start_pfn)
		goto out_fail;

	/* use start_pfn for z1's start_pfn if z1 is empty */
	if (z1->spanned_pages)
		z1_start_pfn = z1->zone_start_pfn;
	else
		z1_start_pfn = start_pfn;

	resize_zone(z1, z1_start_pfn, end_pfn);
	resize_zone(z2, end_pfn, zone_end_pfn(z2));

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z1, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z2_end_pfn;

	ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
	if (ret)
		return ret;

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are lower than @z1 */
	if (z1->zone_start_pfn > start_pfn)
		goto out_fail;
	/* the moved-out part must be at the rightmost of @z1 */
	if (zone_end_pfn(z1) > end_pfn)
		goto out_fail;
	/* must be included in / overlap @z1 */
	if (start_pfn >= zone_end_pfn(z1))
		goto out_fail;

	/* use end_pfn for z2's end_pfn if z2 is empty */
	if (z2->spanned_pages)
		z2_end_pfn = zone_end_pfn(z2);
	else
		z2_end_pfn = end_pfn;

	resize_zone(z1, z1->zone_start_pfn, start_pfn);
	resize_zone(z2, start_pfn, z2_end_pfn);

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z2, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
			    unsigned long end_pfn)
{
	unsigned long old_pgdat_end_pfn =
		pgdat->node_start_pfn + pgdat->node_spanned_pages;

	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
					pgdat->node_start_pfn;
}

static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int nid = pgdat->node_id;
	int zone_type;
	unsigned long flags;
	int ret;

	zone_type = zone - pgdat->node_zones;
	ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
	if (ret)
		return ret;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
			phys_start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
	memmap_init_zone(nr_pages, nid, zone_type,
			 phys_start_pfn, MEMMAP_HOTPLUG);
	return 0;
}

static int __meminit __add_section(int nid, struct zone *zone,
				   unsigned long phys_start_pfn)
{
	int nr_pages = PAGES_PER_SECTION;
	int ret;

	if (pfn_valid(phys_start_pfn))
		return -EEXIST;

	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);

	if (ret < 0)
		return ret;

	ret = __add_zone(zone, phys_start_pfn);

	if (ret < 0)
		return ret;

	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}

/*
 * Reasonably generic function for adding memory.  It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
			unsigned long nr_pages)
{
	unsigned long i;
	int err = 0;
	int start_sec, end_sec;
	/* during initializing mem_map, align the hot-added range to sections */
	start_sec = pfn_to_section_nr(phys_start_pfn);
	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

	for (i = start_sec; i <= end_sec; i++) {
		err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);

		/*
		 * -EEXIST is finally dealt with by the ioresource collision
		 * check.  See add_memory() => register_memory_resource().
		 * A warning will be printed if there is a collision.
		 */
		if (err && (err != -EEXIST))
			break;
		err = 0;
	}

	return err;
}
EXPORT_SYMBOL_GPL(__add_pages);
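
/*
 * Illustrative sketch (not part of this file, assumptions marked): an
 * architecture's arch_add_memory() typically picks a target zone and then
 * hands the section-aligned pfn range to __add_pages().  The zone choice
 * below is only an example; real implementations also set up the direct
 * mapping and differ per architecture.
 *
 *	int arch_add_memory(int nid, u64 start, u64 size)
 *	{
 *		pg_data_t *pgdat = NODE_DATA(nid);
 *		struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
 *
 *		return __add_pages(nid, zone, start >> PAGE_SHIFT,
 *				   size >> PAGE_SHIFT);
 *	}
 */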
#ifdef CONFIG_MEMORY_HOTREMOVE
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static int find_smallest_section_pfn(int nid, struct zone *zone,
				     unsigned long start_pfn,
				     unsigned long end_pfn)
{
	struct mem_section *ms;

	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(start_pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(start_pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
			continue;

		return start_pfn;
	}

	return 0;
}

/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
static int find_biggest_section_pfn(int nid, struct zone *zone,
				    unsigned long start_pfn,
				    unsigned long end_pfn)
{
	struct mem_section *ms;
	unsigned long pfn;

	/* pfn is the end pfn of a memory section. */
	pfn = end_pfn - 1;
	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (unlikely(pfn_to_nid(pfn) != nid))
			continue;

		if (zone && zone != page_zone(pfn_to_page(pfn)))
			continue;

		return pfn;
	}

	return 0;
}

static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
			     unsigned long end_pfn)
{
	unsigned long zone_start_pfn = zone->zone_start_pfn;
	unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = zone_to_nid(zone);

	zone_span_writelock(zone);
	if (zone_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the zone, it needs
		 * to shrink zone->zone_start_pfn and zone->spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
						zone_end_pfn);
		if (pfn) {
			zone->zone_start_pfn = pfn;
			zone->spanned_pages = zone_end_pfn - pfn;
		}
	} else if (zone_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the zone, it needs
		 * to shrink zone->spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
					       start_pfn);
		if (pfn)
			zone->spanned_pages = pfn - zone_start_pfn + 1;
	}

	/*
	 * If the section is neither the biggest nor the smallest mem_section
	 * in the zone, it only creates a hole in the zone.  In that case we
	 * need not change the zone.  But perhaps the zone now contains only
	 * holes, so check whether the zone still has a valid section.
	 */
	pfn = zone_start_pfn;
	for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (page_zone(pfn_to_page(pfn)) != zone)
			continue;

		/* If the section is the current section, continue the loop */
		if (start_pfn == pfn)
			continue;

		/* If we find a valid section, we have nothing to do */
		zone_span_writeunlock(zone);
		return;
	}

	/* The zone has no valid section */
	zone->zone_start_pfn = 0;
	zone->spanned_pages = 0;
	zone_span_writeunlock(zone);
}

static void shrink_pgdat_span(struct pglist_data *pgdat,
			      unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
	unsigned long pgdat_end_pfn =
		pgdat->node_start_pfn + pgdat->node_spanned_pages;
	unsigned long pfn;
	struct mem_section *ms;
	int nid = pgdat->node_id;

	if (pgdat_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the pgdat, it
		 * needs to shrink pgdat->node_start_pfn and
		 * pgdat->node_spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the pgdat.
		 */
		pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
						pgdat_end_pfn);
		if (pfn) {
			pgdat->node_start_pfn = pfn;
			pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
		}
	} else if (pgdat_end_pfn == end_pfn) {
		/*
		 * If the section is the biggest section in the pgdat, it
		 * needs to shrink pgdat->node_spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the pgdat.
		 */
		pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
					       start_pfn);
		if (pfn)
			pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
	}

	/*
	 * If the section is neither the biggest nor the smallest mem_section
	 * in the pgdat, it only creates a hole in the pgdat.  In that case we
	 * need not change the pgdat.
	 * But perhaps the pgdat now contains only holes, so check whether the
	 * pgdat still has a valid section.
	 */
	pfn = pgdat_start_pfn;
	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
		ms = __pfn_to_section(pfn);

		if (unlikely(!valid_section(ms)))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/* If the section is the current section, continue the loop */
		if (start_pfn == pfn)
			continue;

		/* If we find a valid section, we have nothing to do */
		return;
	}

	/* The pgdat has no valid section */
	pgdat->node_start_pfn = 0;
	pgdat->node_spanned_pages = 0;
}

static void __remove_zone(struct zone *zone, unsigned long start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int zone_type;
	unsigned long flags;

	zone_type = zone - pgdat->node_zones;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
}

static int __remove_section(struct zone *zone, struct mem_section *ms)
{
	unsigned long start_pfn;
	int scn_nr;
	int ret = -EINVAL;

	if (!valid_section(ms))
		return ret;

	ret = unregister_memory_section(ms);
	if (ret)
		return ret;

	scn_nr = __section_nr(ms);
	start_pfn = section_nr_to_pfn(scn_nr);
	__remove_zone(zone, start_pfn);

	sparse_remove_one_section(zone, ms);
	return 0;
}

/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing.  Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
		 unsigned long nr_pages)
{
	unsigned long i;
	int sections_to_remove;
	resource_size_t start, size;
	int ret = 0;

	/*
	 * We can only remove entire sections
	 */
	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
	BUG_ON(nr_pages % PAGES_PER_SECTION);

	start = phys_start_pfn << PAGE_SHIFT;
	size = nr_pages * PAGE_SIZE;
	ret = release_mem_region_adjustable(&iomem_resource, start, size);
	if (ret)
		pr_warn("Unable to release resource <%016llx-%016llx> (%d)\n",
			start, start + size - 1, ret);

	sections_to_remove = nr_pages / PAGES_PER_SECTION;
	for (i = 0; i < sections_to_remove; i++) {
		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
		ret = __remove_section(zone, __pfn_to_section(pfn));
		if (ret)
			break;
	}
	return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);
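
/*
 * Illustrative sketch (not part of this file): the arch side of removal
 * mirrors __add_pages().  A hypothetical arch_remove_memory() would look up
 * the zone the range currently belongs to and call __remove_pages():
 *
 *	int arch_remove_memory(u64 start, u64 size)
 *	{
 *		unsigned long start_pfn = start >> PAGE_SHIFT;
 *		unsigned long nr_pages = size >> PAGE_SHIFT;
 *		struct zone *zone = page_zone(pfn_to_page(start_pfn));
 *
 *		return __remove_pages(zone, start_pfn, nr_pages);
 *	}
 */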
#endif /* CONFIG_MEMORY_HOTREMOVE */

int set_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	lock_memory_hotplug();

	if (online_page_callback == generic_online_page) {
		online_page_callback = callback;
		rc = 0;
	}

	unlock_memory_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(set_online_page_callback);

int restore_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	lock_memory_hotplug();

	if (online_page_callback == callback) {
		online_page_callback = generic_online_page;
		rc = 0;
	}

	unlock_memory_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);

void __online_page_set_limits(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);

	if (pfn >= num_physpages)
		num_physpages = pfn + 1;
}
EXPORT_SYMBOL_GPL(__online_page_set_limits);

void __online_page_increment_counters(struct page *page)
{
	totalram_pages++;

#ifdef CONFIG_HIGHMEM
	if (PageHighMem(page))
		totalhigh_pages++;
#endif
}
EXPORT_SYMBOL_GPL(__online_page_increment_counters);

void __online_page_free(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
}
EXPORT_SYMBOL_GPL(__online_page_free);

static void generic_online_page(struct page *page)
{
	__online_page_set_limits(page);
	__online_page_increment_counters(page);
	__online_page_free(page);
}

static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			void *arg)
{
	unsigned long i;
	unsigned long onlined_pages = *(unsigned long *)arg;
	struct page *page;
	if (PageReserved(pfn_to_page(start_pfn)))
		for (i = 0; i < nr_pages; i++) {
			page = pfn_to_page(start_pfn + i);
			(*online_page_callback)(page);
			onlined_pages++;
		}
	*(unsigned long *)arg = onlined_pages;
	return 0;
}
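
/*
 * Illustrative sketch (not part of this file): a driver that wants to
 * intercept pages as they are onlined (a memory-ballooning driver, for
 * instance) registers its own callback and restores the generic one when it
 * is done.  The callback name below is hypothetical.
 *
 *	static void my_online_page(struct page *page)
 *	{
 *		__online_page_set_limits(page);
 *		__online_page_increment_counters(page);
 *		__online_page_free(page);
 *	}
 *
 *	set_online_page_callback(&my_online_page);
 *	...
 *	restore_online_page_callback(&my_online_page);
 */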
#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
 * normal memory.
 */
static bool can_online_high_movable(struct zone *zone)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure every online node has NORMAL memory */
static bool can_online_high_movable(struct zone *zone)
{
	return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which state of node_states will be changed when onlining memory */
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);
	enum zone_type zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * If the memory to be onlined is in a zone of 0...zone_last, and
	 * the zones of 0...zone_last don't have memory before onlining, we
	 * will need to set the node to node_states[N_NORMAL_MEMORY] after
	 * the memory is onlined.
	 */
	if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * If the node doesn't have memory before onlining, we will need to
	 * set the node to node_states[N_MEMORY] after the memory
	 * is onlined.
	 */
	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
	else
		arg->status_change_nid = -1;
}

static void node_states_set_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_set_state(node, N_HIGH_MEMORY);

	node_set_state(node, N_MEMORY);
}


int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
	unsigned long onlined_pages = 0;
	struct zone *zone;
	int need_zonelists_rebuild = 0;
	int nid;
	int ret;
	struct memory_notify arg;

	lock_memory_hotplug();
	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
	 * memory_block->state_mutex.
	 */
	zone = page_zone(pfn_to_page(pfn));

	if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
	    !can_online_high_movable(zone)) {
		unlock_memory_hotplug();
		return -1;
	}

	if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
		if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
			unlock_memory_hotplug();
			return -1;
		}
	}
	if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
		if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
			unlock_memory_hotplug();
			return -1;
		}
	}

	/* The previous code may have changed the zone of the pfn range */
	zone = page_zone(pfn_to_page(pfn));

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_online(nr_pages, zone, &arg);

	nid = page_to_nid(pfn_to_page(pfn));

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret) {
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		unlock_memory_hotplug();
		return ret;
	}
	/*
	 * If this zone is not populated, then it is not in the zonelist.
	 * This means the page allocator ignores this zone.
	 * So, the zonelist must be updated after onlining.
	 */
	mutex_lock(&zonelists_mutex);
	if (!populated_zone(zone)) {
		need_zonelists_rebuild = 1;
		build_all_zonelists(NULL, zone);
	}

	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
		online_pages_range);
	if (ret) {
		if (need_zonelists_rebuild)
			zone_pcp_reset(zone);
		mutex_unlock(&zonelists_mutex);
		printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
		       (unsigned long long) pfn << PAGE_SHIFT,
		       (((unsigned long long) pfn + nr_pages)
			<< PAGE_SHIFT) - 1);
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		unlock_memory_hotplug();
		return ret;
	}

	zone->managed_pages += onlined_pages;
	zone->present_pages += onlined_pages;
	zone->zone_pgdat->node_present_pages += onlined_pages;
	if (onlined_pages) {
		node_states_set_node(zone_to_nid(zone), &arg);
		if (need_zonelists_rebuild)
			build_all_zonelists(NULL, NULL);
		else
			zone_pcp_update(zone);
	}

	mutex_unlock(&zonelists_mutex);

	init_per_zone_wmark_min();

	if (onlined_pages)
		kswapd_run(zone_to_nid(zone));

	vm_total_pages = nr_free_pagecache_pages();

	writeback_set_ratelimit();

	if (onlined_pages)
		memory_notify(MEM_ONLINE, &arg);
	unlock_memory_hotplug();

	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
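
/*
 * Illustrative sketch (not part of this file): onlining is normally driven
 * from the memory sysfs interface.  Writing "online", "online_kernel" or
 * "online_movable" to /sys/devices/system/memory/memoryX/state ends up in
 * drivers/base/memory.c, which calls online_pages() for the block's pfn
 * range, roughly:
 *
 *	ret = online_pages(start_pfn, nr_pages, ONLINE_KEEP);
 *
 * with ONLINE_KERNEL/ONLINE_MOVABLE selecting the zone-moving paths above.
 */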
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
	struct pglist_data *pgdat;
	unsigned long zones_size[MAX_NR_ZONES] = {0};
	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	unsigned long start_pfn = start >> PAGE_SHIFT;

	pgdat = NODE_DATA(nid);
	if (!pgdat) {
		pgdat = arch_alloc_nodedata(nid);
		if (!pgdat)
			return NULL;

		arch_refresh_nodedata(nid, pgdat);
	}

	/* we can use NODE_DATA(nid) from here */

	/* init node's zones as empty zones, we don't have any present pages.*/
	free_area_init_node(nid, zones_size, start_pfn, zholes_size);

	/*
	 * The node we allocated has no zone fallback lists.  To avoid
	 * accessing a not-yet-initialized zonelist, build it here.
	 */
	mutex_lock(&zonelists_mutex);
	build_all_zonelists(pgdat, NULL);
	mutex_unlock(&zonelists_mutex);

	return pgdat;
}

static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
	arch_refresh_nodedata(nid, NULL);
	arch_free_nodedata(pgdat);
	return;
}


/*
 * called by cpu_up() to online a node without onlined memory.
 */
int mem_online_node(int nid)
{
	pg_data_t *pgdat;
	int ret;

	lock_memory_hotplug();
	pgdat = hotadd_new_pgdat(nid, 0);
	if (!pgdat) {
		ret = -ENOMEM;
		goto out;
	}
	node_set_online(nid);
	ret = register_one_node(nid);
	BUG_ON(ret);

out:
	unlock_memory_hotplug();
	return ret;
}

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
int __ref add_memory(int nid, u64 start, u64 size)
{
	pg_data_t *pgdat = NULL;
	bool new_pgdat;
	bool new_node;
	struct resource *res;
	int ret;

	lock_memory_hotplug();

	res = register_memory_resource(start, size);
	ret = -EEXIST;
	if (!res)
		goto out;

	{	/* Stupid hack to suppress address-never-null warning */
		void *p = NODE_DATA(nid);
		new_pgdat = !p;
	}
	new_node = !node_online(nid);
	if (new_node) {
		pgdat = hotadd_new_pgdat(nid, start);
		ret = -ENOMEM;
		if (!pgdat)
			goto error;
	}

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size);

	if (ret < 0)
		goto error;

	/* we online the node here. we can't roll back from here. */
	node_set_online(nid);

	if (new_node) {
		ret = register_one_node(nid);
		/*
		 * If the sysfs file of the new node can't be created, cpus
		 * on the node can't be hot-added.  There is no rollback way
		 * now, so check it with BUG_ON() to catch it reluctantly...
		 */
		BUG_ON(ret);
	}

	/* create new memmap entry */
	firmware_map_add_hotplug(start, start + size, "System RAM");

	goto out;

error:
	/* rollback pgdat allocation and others */
	if (new_pgdat)
		rollback_node_hotadd(nid, pgdat);
	release_memory_resource(res);

out:
	unlock_memory_hotplug();
	return ret;
}
EXPORT_SYMBOL_GPL(add_memory);

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
 * set and the size of the free page is given by page_order().  Using this,
 * the function determines if the pageblock contains only free pages.
 * Due to buddy constraints, a free page at least the size of a pageblock will
 * be located at the start of the pageblock.
 */
static inline int pageblock_free(struct page *page)
{
	return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
	/* Ensure the starting page is pageblock-aligned */
	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));

	/* If the entire pageblock is free, move to the end of the free page */
	if (pageblock_free(page)) {
		int order;
		/* be careful: we don't have locks, page_order can change */
		order = page_order(page);
		if ((order < MAX_ORDER) && (order >= pageblock_order))
			return page + (1 << order);
	}

	return page + pageblock_nr_pages;
}

/* Checks if this range of memory is likely to be hot-removable. */
int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
	struct page *page = pfn_to_page(start_pfn);
	struct page *end_page = page + nr_pages;

	/* Check the starting page of each pageblock within the range */
	for (; page < end_page; page = next_active_pageblock(page)) {
		if (!is_pageblock_removable_nolock(page))
			return 0;
		cond_resched();
	}

	/* All pageblocks in the memory block are likely to be hot-removable */
	return 1;
}

/*
 * Confirm that all pages in the range [start, end) belong to the same zone.
 */
static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct zone *zone = NULL;
	struct page *page;
	int i;
	for (pfn = start_pfn;
	     pfn < end_pfn;
	     pfn += MAX_ORDER_NR_PAGES) {
		i = 0;
		/* This is just a CONFIG_HOLES_IN_ZONE check.*/
		while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
			i++;
		if (i == MAX_ORDER_NR_PAGES)
			continue;
		page = pfn_to_page(pfn + i);
		if (zone && page_zone(page) != zone)
			return 0;
		zone = page_zone(page);
	}
	return 1;
}

/*
 * Scanning pfns is much easier than scanning the lru list.
 * Scan pfns from start to end and return the first LRU page found.
 */
static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
{
	unsigned long pfn;
	struct page *page;
	for (pfn = start; pfn < end; pfn++) {
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageLRU(page))
				return pfn;
		}
	}
	return 0;
}

#define NR_OFFLINE_AT_ONCE_PAGES	(256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page;
	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
	int not_managed = 0;
	int ret = 0;
	LIST_HEAD(source);

	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		if (!get_page_unless_zero(page))
			continue;
		/*
		 * We can skip free pages. And we can only deal with pages on
		 * LRU.
		 */
		ret = isolate_lru_page(page);
		if (!ret) { /* Success */
			put_page(page);
			list_add_tail(&page->lru, &source);
			move_pages--;
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));

		} else {
#ifdef CONFIG_DEBUG_VM
			printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
			       pfn);
			dump_page(page);
#endif
			put_page(page);
			/* Because we don't have a big zone->lock, we should
			   check this again here. */
			if (page_count(page)) {
				not_managed++;
				ret = -EBUSY;
				break;
			}
		}
	}
	if (!list_empty(&source)) {
		if (not_managed) {
			putback_lru_pages(&source);
			goto out;
		}

		/*
		 * alloc_migrate_target should be improooooved!!
		 * migrate_pages returns # of failed pages.
		 */
		ret = migrate_pages(&source, alloc_migrate_target, 0,
					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
		if (ret)
			putback_lru_pages(&source);
	}
out:
	return ret;
}

/*
 * remove from free_area[] and mark all as Reserved.
 */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
			void *data)
{
	__offline_isolated_pages(start, start + nr_pages);
	return 0;
}

static void
offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
				offline_isolated_pages_cb);
}

/*
 * Check that all pages in the range, recorded as a memory resource, are
 * isolated.
 */
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
			void *data)
{
	int ret;
	long offlined = *(long *)data;
	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
	offlined = nr_pages;
	if (!ret)
		*(long *)data += offlined;
	return ret;
}

static long
check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
	long offlined = 0;
	int ret;

	ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
			check_pages_isolated_cb);
	if (ret < 0)
		offlined = (long)ret;
	return offlined;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
 * normal memory.
 */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure the node has NORMAL memory if it is still online */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt;

	for (zt = 0; zt <= ZONE_NORMAL; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	if (present_pages > nr_pages)
		return true;

	present_pages = 0;
	for (; zt <= ZONE_MOVABLE; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	/*
	 * we can't offline the last normal memory until all
	 * higher memory is offlined.
	 */
	return present_pages == 0;
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which state of node_states will be changed when offlining memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
		struct zone *zone, struct memory_notify *arg)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt, zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
	 * If the memory to be offlined is in a zone of 0...zone_last,
	 * and it is the last present memory, 0...zone_last will
	 * become empty after offlining, so we can determine that we will
	 * need to clear the node from node_states[N_NORMAL_MEMORY].
	 */
	for (zt = 0; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_normal = zone_to_nid(zone);
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_high = zone_to_nid(zone);
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
	 */
	zone_last = ZONE_MOVABLE;

	/*
	 * Check whether node_states[N_HIGH_MEMORY] will be changed.
	 * If we try to offline the last present @nr_pages from the node,
	 * we can determine that we will need to clear the node from
	 * node_states[N_HIGH_MEMORY].
	 */
	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (nr_pages >= present_pages)
		arg->status_change_nid = zone_to_nid(zone);
	else
		arg->status_change_nid = -1;
}

static void node_states_clear_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_clear_state(node, N_NORMAL_MEMORY);

	if ((N_MEMORY != N_NORMAL_MEMORY) &&
	    (arg->status_change_nid_high >= 0))
		node_clear_state(node, N_HIGH_MEMORY);

	if ((N_MEMORY != N_HIGH_MEMORY) &&
	    (arg->status_change_nid >= 0))
		node_clear_state(node, N_MEMORY);
}

static int __ref __offline_pages(unsigned long start_pfn,
		  unsigned long end_pfn, unsigned long timeout)
{
	unsigned long pfn, nr_pages, expire;
	long offlined_pages;
	int ret, drain, retry_max, node;
	struct zone *zone;
	struct memory_notify arg;

	BUG_ON(start_pfn >= end_pfn);
	/* at least, alignment against pageblock is necessary */
	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
		return -EINVAL;
	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
		return -EINVAL;
	/* This makes hotplug much easier...and readable.
	   we assume this for now. */
	if (!test_pages_in_a_zone(start_pfn, end_pfn))
		return -EINVAL;

	lock_memory_hotplug();

	zone = page_zone(pfn_to_page(start_pfn));
	node = zone_to_nid(zone);
	nr_pages = end_pfn - start_pfn;

	ret = -EINVAL;
	if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
		goto out;

	/* set above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn,
				       MIGRATE_MOVABLE, true);
	if (ret)
		goto out;

	arg.start_pfn = start_pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_offline(nr_pages, zone, &arg);

	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_removal;

	pfn = start_pfn;
	expire = jiffies + timeout;
	drain = 0;
	retry_max = 5;
repeat:
	/* start memory hot removal */
	ret = -EAGAIN;
	if (time_after(jiffies, expire))
		goto failed_removal;
	ret = -EINTR;
	if (signal_pending(current))
		goto failed_removal;
	ret = 0;
	if (drain) {
		lru_add_drain_all();
		cond_resched();
		drain_all_pages();
	}

	pfn = scan_lru_pages(start_pfn, end_pfn);
	if (pfn) { /* We have a page on LRU */
		ret = do_migrate_range(pfn, end_pfn);
		if (!ret) {
			drain = 1;
			goto repeat;
		} else {
			if (ret < 0)
				if (--retry_max == 0)
					goto failed_removal;
			yield();
			drain = 1;
			goto repeat;
		}
	}
	/* drain all zones' lru pagevecs, this is asynchronous... */
	lru_add_drain_all();
	yield();
	/* drain pcp pages, this is synchronous. */
	drain_all_pages();
	/* check again */
	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
	if (offlined_pages < 0) {
		ret = -EBUSY;
		goto failed_removal;
	}
	printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
	/* Ok, all of our target is isolated.
	   We cannot do rollback at this point. */
	offline_isolated_pages(start_pfn, end_pfn);
	/* reset pagetype flags and make the migratetype MOVABLE again */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	/* removal success */
	zone->managed_pages -= offlined_pages;
	zone->present_pages -= offlined_pages;
	zone->zone_pgdat->node_present_pages -= offlined_pages;
	totalram_pages -= offlined_pages;

	init_per_zone_wmark_min();

	if (!populated_zone(zone)) {
		zone_pcp_reset(zone);
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(NULL, NULL);
		mutex_unlock(&zonelists_mutex);
	} else
		zone_pcp_update(zone);

	node_states_clear_node(node, &arg);
	if (arg.status_change_nid >= 0)
		kswapd_stop(node);

	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();

	memory_notify(MEM_OFFLINE, &arg);
	unlock_memory_hotplug();
	return 0;

failed_removal:
	printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
	       (unsigned long long) start_pfn << PAGE_SHIFT,
	       ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
	memory_notify(MEM_CANCEL_OFFLINE, &arg);
	/* pushback to free area */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);

out:
	unlock_memory_hotplug();
	return ret;
}

int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
}

/**
 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
 * @start_pfn: start pfn of the memory range
 * @end_pfn: end pfn of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory section walked
 *
 * This function walks through all present mem sections in the range
 * [start_pfn, end_pfn) and calls func on each mem section.
 *
 * Returns the return value of func.
 */
static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
		void *arg, int (*func)(struct memory_block *, void *))
{
	struct memory_block *mem = NULL;
	struct mem_section *section;
	unsigned long pfn, section_nr;
	int ret;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		section_nr = pfn_to_section_nr(pfn);
		if (!present_section_nr(section_nr))
			continue;

		section = __nr_to_section(section_nr);
		/* same memblock? */
		if (mem)
			if ((section_nr >= mem->start_section_nr) &&
			    (section_nr <= mem->end_section_nr))
				continue;

		mem = find_memory_block_hinted(section, mem);
		if (!mem)
			continue;

		ret = func(mem, arg);
		if (ret) {
			kobject_put(&mem->dev.kobj);
			return ret;
		}
	}

	if (mem)
		kobject_put(&mem->dev.kobj);

	return 0;
}
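
/*
 * Illustrative sketch (not part of this file): walk_memory_range() is used
 * below with offline_memory_block_cb and is_memblock_offlined_cb.  A
 * hypothetical callback that merely counts the memory blocks in a range
 * would look like:
 *
 *	static int count_memblock_cb(struct memory_block *mem, void *arg)
 *	{
 *		(*(int *)arg)++;
 *		return 0;
 *	}
 *
 *	int nr = 0;
 *	walk_memory_range(start_pfn, end_pfn, &nr, count_memblock_cb);
 */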
/**
 * offline_memory_block_cb - callback function for offlining a memory block
 * @mem: the memory block to be offlined
 * @arg: buffer to hold error msg
 *
 * Always returns 0, and puts the error msg in arg if any.
 */
static int offline_memory_block_cb(struct memory_block *mem, void *arg)
{
	int *ret = arg;
	int error = offline_memory_block(mem);

	if (error != 0 && *ret == 0)
		*ret = error;

	return 0;
}

static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
	int ret = !is_memblock_offlined(mem);

	if (unlikely(ret)) {
		phys_addr_t beginpa, endpa;

		beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
		endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
		pr_warn("removing memory fails, because memory "
			"[%pa-%pa] is onlined\n",
			&beginpa, &endpa);
	}

	return ret;
}

static int check_cpu_on_node(void *data)
{
	struct pglist_data *pgdat = data;
	int cpu;

	for_each_present_cpu(cpu) {
		if (cpu_to_node(cpu) == pgdat->node_id)
			/*
			 * the cpu on this node isn't removed, and we can't
			 * offline this node.
			 */
			return -EBUSY;
	}

	return 0;
}

static void unmap_cpu_on_node(void *data)
{
#ifdef CONFIG_ACPI_NUMA
	struct pglist_data *pgdat = data;
	int cpu;

	for_each_possible_cpu(cpu)
		if (cpu_to_node(cpu) == pgdat->node_id)
			numa_clear_node(cpu);
#endif
}

static int check_and_unmap_cpu_on_node(void *data)
{
	int ret = check_cpu_on_node(data);

	if (ret)
		return ret;

	/*
	 * the node will be offlined when we come here, so we can clear
	 * the cpu_to_node() now.
	 */

	unmap_cpu_on_node(data);
	return 0;
}

/* offline the node if all memory sections of this node are removed */
void try_offline_node(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	unsigned long start_pfn = pgdat->node_start_pfn;
	unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
	unsigned long pfn;
	struct page *pgdat_page = virt_to_page(pgdat);
	int i;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);

		if (!present_section_nr(section_nr))
			continue;

		if (pfn_to_nid(pfn) != nid)
			continue;

		/*
		 * some memory sections of this node are not removed, and we
		 * can't offline the node now.
		 */
		return;
	}

	if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL))
		return;

	/*
	 * all memory/cpus of this node have been removed, we can offline the
	 * node now.
	 */
	node_set_offline(nid);
	unregister_one_node(nid);

	if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
		/* node data is allocated from boot memory */
		return;

	/* free the wait table in each zone */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		struct zone *zone = pgdat->node_zones + i;

		/*
		 * wait_table may be allocated from boot memory,
		 * so only free it here if it was allocated by vmalloc.
		 */
		if (is_vmalloc_addr(zone->wait_table))
			vfree(zone->wait_table);
	}

	/*
	 * Since there is no way to guarantee that the address of pgdat/zone
	 * is not on the stack of any kernel thread or used by other kernel
	 * objects without reference counting or another synchronizing method,
	 * do not reset node_data and free pgdat here.  Just reset it to 0 and
	 * reuse the memory when the node is onlined again.
	 */
	memset(pgdat, 0, sizeof(*pgdat));
}
EXPORT_SYMBOL(try_offline_node);

int __ref remove_memory(int nid, u64 start, u64 size)
{
	unsigned long start_pfn, end_pfn;
	int ret = 0;
	int retry = 1;

	start_pfn = PFN_DOWN(start);
	end_pfn = PFN_UP(start + size - 1);

	/*
	 * When CONFIG_MEMCG is on, one memory block may be used by other
	 * blocks to store page cgroups when onlining pages.  But we don't
	 * know in what order pages are onlined.  So we iterate twice to
	 * offline memory:
	 * 1st iteration: offline every non-primary memory block.
	 * 2nd iteration: offline the primary (i.e. first added) memory block.
	 */
repeat:
	walk_memory_range(start_pfn, end_pfn, &ret,
			  offline_memory_block_cb);
	if (ret) {
		if (!retry)
			return ret;

		retry = 0;
		ret = 0;
		goto repeat;
	}

	lock_memory_hotplug();

	/*
	 * we have offlined all memory blocks like this:
	 *   1. lock memory hotplug
	 *   2. offline a memory block
	 *   3. unlock memory hotplug
	 *
	 * repeat steps 1-3 to offline each memory block.  All memory blocks
	 * must be offlined before removing memory.  But we don't hold the
	 * lock across the whole operation, so we should check whether all
	 * memory blocks are offlined.
	 */

	ret = walk_memory_range(start_pfn, end_pfn, NULL,
				is_memblock_offlined_cb);
	if (ret) {
		unlock_memory_hotplug();
		return ret;
	}

	/* remove memmap entry */
	firmware_map_remove(start, start + size, "System RAM");

	arch_remove_memory(start, size);

	try_offline_node(nid);

	unlock_memory_hotplug();

	return 0;
}
#else
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
	return -EINVAL;
}
int remove_memory(int nid, u64 start, u64 size)
{
	return -EINVAL;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
EXPORT_SYMBOL_GPL(remove_memory);
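
/*
 * Illustrative sketch (not part of this file): a typical hotplug lifecycle,
 * e.g. as driven by the ACPI memory device driver, is roughly:
 *
 *	add_memory(nid, start, size);		// hot-add: sections, resource, sysfs
 *	// userspace then writes "online" to
 *	// /sys/devices/system/memory/memoryX/state -> online_pages()
 *	...
 *	remove_memory(nid, start, size);	// offline the blocks, then tear down
 *
 * The actual callers live in drivers/acpi/acpi_memhotplug.c and
 * drivers/base/memory.c, not in this file.
 */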