/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>

#include <asm/tlbflush.h>

#include "internal.h"

/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;
	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
	BUG_ON(!res);

	res->name = "System RAM";
	res->start = start;
	res->end = start + size - 1;
	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
	if (request_resource(&iomem_resource, res) < 0) {
		printk("System RAM resource %llx - %llx cannot be added\n",
			(unsigned long long)res->start, (unsigned long long)res->end);
		kfree(res);
		res = NULL;
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
	return;
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
static void get_page_bootmem(unsigned long info, struct page *page, int type)
{
	atomic_set(&page->_mapcount, type);
	SetPagePrivate(page);
	set_page_private(page, info);
	atomic_inc(&page->_count);
}

/* reference to __meminit __free_pages_bootmem is valid
 * so use __ref to tell modpost not to generate a warning */
void __ref put_page_bootmem(struct page *page)
{
	int type;

	type = atomic_read(&page->_mapcount);
	BUG_ON(type >= -1);

	if (atomic_dec_return(&page->_count) == 1) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		reset_page_mapcount(page);
		__free_pages_bootmem(page, 0);
	}

}

static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	if (!pfn_valid(start_pfn))
		return;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	/* Get section's memmap address */
	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	/*
	 * Get page for the memmap's phys address
	 * XXX: need more consideration for sparse_vmemmap...
	 */
	page = virt_to_page(memmap);
	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

	/* remember memmap's page */
	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, SECTION_INFO);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);

}

void register_page_bootmem_info_node(struct pglist_data *pgdat)
{
	unsigned long i, pfn, end_pfn, nr_pages;
	int node = pgdat->node_id;
	struct page *page;
	struct zone *zone;

	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
	page = virt_to_page(pgdat);

	for (i = 0; i < nr_pages; i++, page++)
		get_page_bootmem(node, page, NODE_INFO);

	zone = &pgdat->node_zones[0];
	for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
		if (zone->wait_table) {
			nr_pages = zone->wait_table_hash_nr_entries
				* sizeof(wait_queue_head_t);
			nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
			page = virt_to_page(zone->wait_table);

			for (i = 0; i < nr_pages; i++, page++)
				get_page_bootmem(node, page, NODE_INFO);
		}
	}

	pfn = pgdat->node_start_pfn;
	end_pfn = pfn + pgdat->node_spanned_pages;

	/* register_section info */
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION)
		register_page_bootmem_info_section(pfn);

}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
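
/*
 * Illustrative only: register_page_bootmem_info_node() is expected to be
 * called from architecture init code once per online node, so that the
 * bootmem-allocated memmap/usemap pages above get their type recorded.
 * A minimal sketch of such a caller (not built here; loosely modelled on
 * what an arch's mem_init() might do) could look like:
 */
#if 0
static void __init example_register_page_bootmem_info(void)
{
	int nid;

	/* publish bootmem info for every node that is already online */
	for_each_online_node(nid)
		register_page_bootmem_info_node(NODE_DATA(nid));
}
#endif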

static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
			   unsigned long end_pfn)
{
	unsigned long old_zone_end_pfn;

	zone_span_writelock(zone);

	old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	if (start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
				zone->zone_start_pfn;

	zone_span_writeunlock(zone);
}

static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
			    unsigned long end_pfn)
{
	unsigned long old_pgdat_end_pfn =
		pgdat->node_start_pfn + pgdat->node_spanned_pages;

	if (start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
					pgdat->node_start_pfn;
}

static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int nid = pgdat->node_id;
	int zone_type;
	unsigned long flags;

	zone_type = zone - pgdat->node_zones;
	if (!zone->wait_table) {
		int ret;

		ret = init_currently_empty_zone(zone, phys_start_pfn,
						nr_pages, MEMMAP_HOTPLUG);
		if (ret)
			return ret;
	}
	pgdat_resize_lock(zone->zone_pgdat, &flags);
	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
			phys_start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
	memmap_init_zone(nr_pages, nid, zone_type,
			 phys_start_pfn, MEMMAP_HOTPLUG);
	return 0;
}

static int __meminit __add_section(int nid, struct zone *zone,
				   unsigned long phys_start_pfn)
{
	int nr_pages = PAGES_PER_SECTION;
	int ret;

	if (pfn_valid(phys_start_pfn))
		return -EEXIST;

	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);

	if (ret < 0)
		return ret;

	ret = __add_zone(zone, phys_start_pfn);

	if (ret < 0)
		return ret;

	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static int __remove_section(struct zone *zone, struct mem_section *ms)
{
	/*
	 * XXX: Freeing the memmap with vmemmap is not implemented yet.
	 * This should be removed later.
	 */
	return -EBUSY;
}
#else
static int __remove_section(struct zone *zone, struct mem_section *ms)
{
	unsigned long flags;
	struct pglist_data *pgdat = zone->zone_pgdat;
	int ret = -EINVAL;

	if (!valid_section(ms))
		return ret;

	ret = unregister_memory_section(ms);
	if (ret)
		return ret;

	pgdat_resize_lock(pgdat, &flags);
	sparse_remove_one_section(zone, ms);
	pgdat_resize_unlock(pgdat, &flags);
	return 0;
}
#endif

/*
 * Reasonably generic function for adding memory. It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
		      unsigned long nr_pages)
{
	unsigned long i;
	int err = 0;
	int start_sec, end_sec;
	/* align the hot-added range to sections while initializing mem_map */
	start_sec = pfn_to_section_nr(phys_start_pfn);
	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

	for (i = start_sec; i <= end_sec; i++) {
		err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);

		/*
		 * -EEXIST is finally dealt with by the ioresource collision
		 * check; see add_memory() => register_memory_resource().
		 * A warning will be printed if there is a collision.
		 */
		if (err && (err != -EEXIST))
			break;
		err = 0;
	}

	return err;
}
EXPORT_SYMBOL_GPL(__add_pages);
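
/*
 * Illustrative only: the comment above __add_pages() says architectures
 * call it after choosing a zone.  A minimal sketch of such an
 * arch_add_memory() (assuming the new range goes into ZONE_NORMAL and
 * ignoring any arch-specific page-table setup) might look like:
 */
#if 0
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;

	/* arch-specific mapping setup for the new range would go here */

	return __add_pages(nid, zone, start_pfn, nr_pages);
}
#endif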

/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be a multiple of section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. The caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
		   unsigned long nr_pages)
{
	unsigned long i, ret = 0;
	int sections_to_remove;

	/*
	 * We can only remove entire sections
	 */
	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
	BUG_ON(nr_pages % PAGES_PER_SECTION);

	sections_to_remove = nr_pages / PAGES_PER_SECTION;
	for (i = 0; i < sections_to_remove; i++) {
		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
		release_mem_region(pfn << PAGE_SHIFT,
				   PAGES_PER_SECTION << PAGE_SHIFT);
		ret = __remove_section(zone, __pfn_to_section(pfn));
		if (ret)
			break;
	}
	return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);

void online_page(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);

	totalram_pages++;
	if (pfn >= num_physpages)
		num_physpages = pfn + 1;

#ifdef CONFIG_HIGHMEM
	if (PageHighMem(page))
		totalhigh_pages++;
#endif

#ifdef CONFIG_FLATMEM
	max_mapnr = max(page_to_pfn(page), max_mapnr);
#endif

	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
}

static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			      void *arg)
{
	unsigned long i;
	unsigned long onlined_pages = *(unsigned long *)arg;
	struct page *page;
	if (PageReserved(pfn_to_page(start_pfn)))
		for (i = 0; i < nr_pages; i++) {
			page = pfn_to_page(start_pfn + i);
			online_page(page);
			onlined_pages++;
		}
	*(unsigned long *)arg = onlined_pages;
	return 0;
}

int online_pages(unsigned long pfn, unsigned long nr_pages)
{
	unsigned long onlined_pages = 0;
	struct zone *zone;
	int need_zonelists_rebuild = 0;
	int nid;
	int ret;
	struct memory_notify arg;

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	arg.status_change_nid = -1;

	nid = page_to_nid(pfn_to_page(pfn));
	if (node_present_pages(nid) == 0)
		arg.status_change_nid = nid;

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret) {
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		return ret;
	}
	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
	 * memory_block->state_mutex.
	 */
	zone = page_zone(pfn_to_page(pfn));
	/*
	 * If this zone is not populated, then it is not in the zonelist.
	 * This means the page allocator ignores this zone.
	 * So, the zonelist must be updated after onlining.
	 */
	if (!populated_zone(zone))
		need_zonelists_rebuild = 1;

	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
				    online_pages_range);
	if (ret) {
		printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
			nr_pages, pfn);
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		return ret;
	}

	zone->present_pages += onlined_pages;
	zone->zone_pgdat->node_present_pages += onlined_pages;

	zone_pcp_update(zone);
	setup_per_zone_wmarks();
	calculate_zone_inactive_ratio(zone);
	if (onlined_pages) {
		kswapd_run(zone_to_nid(zone));
		node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
	}

	if (need_zonelists_rebuild)
		build_all_zonelists();
	else
		vm_total_pages = nr_free_pagecache_pages();

	writeback_set_ratelimit();

	if (onlined_pages)
		memory_notify(MEM_ONLINE, &arg);

	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
	struct pglist_data *pgdat;
	unsigned long zones_size[MAX_NR_ZONES] = {0};
	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	unsigned long start_pfn = start >> PAGE_SHIFT;

	pgdat = arch_alloc_nodedata(nid);
	if (!pgdat)
		return NULL;

	arch_refresh_nodedata(nid, pgdat);

	/* we can use NODE_DATA(nid) from here */

	/* init the node's zones as empty zones, we don't have any present pages */
	free_area_init_node(nid, zones_size, start_pfn, zholes_size);

	return pgdat;
}

static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
	arch_refresh_nodedata(nid, NULL);
	arch_free_nodedata(pgdat);
	return;
}


/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
int __ref add_memory(int nid, u64 start, u64 size)
{
	pg_data_t *pgdat = NULL;
	int new_pgdat = 0;
	struct resource *res;
	int ret;

	lock_system_sleep();

	res = register_memory_resource(start, size);
	ret = -EEXIST;
	if (!res)
		goto out;

	if (!node_online(nid)) {
		pgdat = hotadd_new_pgdat(nid, start);
		ret = -ENOMEM;
		if (!pgdat)
			goto out;
		new_pgdat = 1;
	}

	/* call the arch's memory hotadd */
	ret = arch_add_memory(nid, start, size);

	if (ret < 0)
		goto error;

	/* We online the node here. We can't roll back from this point. */
	node_set_online(nid);

	if (new_pgdat) {
		ret = register_one_node(nid);
		/*
		 * If the sysfs file of the new node can't be created,
		 * CPUs on the node can't be hot-added. There is no
		 * rollback way now, so check it with BUG_ON() to catch
		 * it reluctantly.
		 */
		BUG_ON(ret);
	}

	goto out;

error:
	/* rollback pgdat allocation and others */
	if (new_pgdat)
		rollback_node_hotadd(nid, pgdat);
	if (res)
		release_memory_resource(res);

out:
	unlock_system_sleep();
	return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
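
/*
 * Illustrative only: add_memory() is the entry point used by hotplug
 * drivers (e.g. the ACPI memory driver) once they know the physical
 * range of the new memory.  A hypothetical caller might look like the
 * sketch below; memory_add_physaddr_to_nid() is assumed to be how the
 * node is chosen, and the onlining itself still happens later through
 * the memory sysfs interface.
 */
#if 0
static int example_enable_new_memory(u64 start, u64 size)
{
	int nid = memory_add_physaddr_to_nid(start);

	/* register the resource, allocate the pgdat if needed, add pages */
	return add_memory(nid, start, size);
}
#endif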

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
 * set and the size of the free page is given by page_order(). Using this,
 * the function determines if the pageblock contains only free pages.
 * Due to buddy constraints, a free page at least the size of a pageblock will
 * be located at the start of the pageblock.
 */
static inline int pageblock_free(struct page *page)
{
	return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
	int pageblocks_stride;

	/* Ensure the starting page is pageblock-aligned */
	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));

	/* Move forward by at least 1 * pageblock_nr_pages */
	pageblocks_stride = 1;

	/* If the entire pageblock is free, move to the end of the free page */
	if (pageblock_free(page))
		pageblocks_stride += page_order(page) - pageblock_order;

	return page + (pageblocks_stride * pageblock_nr_pages);
}

/* Checks if this range of memory is likely to be hot-removable. */
int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
	int type;
	struct page *page = pfn_to_page(start_pfn);
	struct page *end_page = page + nr_pages;

	/* Check the starting page of each pageblock within the range */
	for (; page < end_page; page = next_active_pageblock(page)) {
		type = get_pageblock_migratetype(page);

		/*
		 * A pageblock containing MOVABLE or free pages is considered
		 * removable
		 */
		if (type != MIGRATE_MOVABLE && !pageblock_free(page))
			return 0;

		/*
		 * A pageblock starting with a PageReserved page is not
		 * considered removable.
		 */
		if (PageReserved(page))
			return 0;
	}

	/* All pageblocks in the memory block are likely to be hot-removable */
	return 1;
}

/*
 * Confirm that all pages in the range [start, end) belong to the same zone.
 */
static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct zone *zone = NULL;
	struct page *page;
	int i;
	for (pfn = start_pfn;
	     pfn < end_pfn;
	     pfn += MAX_ORDER_NR_PAGES) {
		i = 0;
		/* This is just a CONFIG_HOLES_IN_ZONE check. */
		while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
			i++;
		if (i == MAX_ORDER_NR_PAGES)
			continue;
		page = pfn_to_page(pfn + i);
		if (zone && page_zone(page) != zone)
			return 0;
		zone = page_zone(page);
	}
	return 1;
}

/*
 * Scanning pfns is much easier than scanning the lru list.
 * Scan pfns from start to end and return the first LRU page found.
 */
int scan_lru_pages(unsigned long start, unsigned long end)
{
	unsigned long pfn;
	struct page *page;
	for (pfn = start; pfn < end; pfn++) {
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageLRU(page))
				return pfn;
		}
	}
	return 0;
}

static struct page *
hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
{
	/* This should be improooooved!! */
	return alloc_page(GFP_HIGHUSER_MOVABLE);
}

#define NR_OFFLINE_AT_ONCE_PAGES	(256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page;
	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
	int not_managed = 0;
	int ret = 0;
	LIST_HEAD(source);

	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		if (!page_count(page))
			continue;
		/*
		 * We can skip free pages. And we can only deal with pages on
		 * the LRU.
		 */
		ret = isolate_lru_page(page);
		if (!ret) { /* Success */
			list_add_tail(&page->lru, &source);
			move_pages--;
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));

		} else {
			/* Because we don't have the big zone->lock, we should
			   check this again here. */
			if (page_count(page))
				not_managed++;
#ifdef CONFIG_DEBUG_VM
			printk(KERN_INFO "removing from LRU failed"
					 " %lx/%d/%lx\n",
				pfn, page_count(page), page->flags);
#endif
		}
	}
	ret = -EBUSY;
	if (not_managed) {
		if (!list_empty(&source))
			putback_lru_pages(&source);
		goto out;
	}
	ret = 0;
	if (list_empty(&source))
		goto out;
	/* this function returns the number of failed pages */
	ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);

out:
	return ret;
}

/*
 * remove from free_area[] and mark all as Reserved.
 */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
			  void *data)
{
	__offline_isolated_pages(start, start + nr_pages);
	return 0;
}

static void
offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
			      offline_isolated_pages_cb);
}

/*
 * Check that all pages in the range, recorded as a memory resource, are
 * isolated.
 */
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
			void *data)
{
	int ret;
	long offlined = *(long *)data;
	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
	offlined = nr_pages;
	if (!ret)
		*(long *)data += offlined;
	return ret;
}

static long
check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
	long offlined = 0;
	int ret;

	ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
				    check_pages_isolated_cb);
	if (ret < 0)
		offlined = (long)ret;
	return offlined;
}

static int offline_pages(unsigned long start_pfn,
			 unsigned long end_pfn, unsigned long timeout)
{
	unsigned long pfn, nr_pages, expire;
	long offlined_pages;
	int ret, drain, retry_max, node;
	struct zone *zone;
	struct memory_notify arg;

	BUG_ON(start_pfn >= end_pfn);
	/* at least, alignment against pageblock is necessary */
	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
		return -EINVAL;
	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
		return -EINVAL;
	/* This makes hotplug much easier (and readable);
	   we assume this for now. */
	if (!test_pages_in_a_zone(start_pfn, end_pfn))
		return -EINVAL;

	lock_system_sleep();

	zone = page_zone(pfn_to_page(start_pfn));
	node = zone_to_nid(zone);
	nr_pages = end_pfn - start_pfn;

	/* set the above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn);
	if (ret)
		goto out;

	arg.start_pfn = start_pfn;
	arg.nr_pages = nr_pages;
	arg.status_change_nid = -1;
	if (nr_pages >= node_present_pages(node))
		arg.status_change_nid = node;

	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_removal;

	pfn = start_pfn;
	expire = jiffies + timeout;
	drain = 0;
	retry_max = 5;
repeat:
	/* start memory hot removal */
	ret = -EAGAIN;
	if (time_after(jiffies, expire))
		goto failed_removal;
	ret = -EINTR;
	if (signal_pending(current))
		goto failed_removal;
	ret = 0;
	if (drain) {
		lru_add_drain_all();
		flush_scheduled_work();
		cond_resched();
		drain_all_pages();
	}

	pfn = scan_lru_pages(start_pfn, end_pfn);
	if (pfn) { /* We have pages on the LRU */
		ret = do_migrate_range(pfn, end_pfn);
		if (!ret) {
			drain = 1;
			goto repeat;
		} else {
			if (ret < 0)
				if (--retry_max == 0)
					goto failed_removal;
			yield();
			drain = 1;
			goto repeat;
		}
	}
	/* drain all zones' lru pagevecs; this is asynchronous... */
	lru_add_drain_all();
	flush_scheduled_work();
	yield();
	/* drain pcp pages; this is synchronous */
	drain_all_pages();
	/* check again */
	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
	if (offlined_pages < 0) {
		ret = -EBUSY;
		goto failed_removal;
	}
	printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
	/* OK, all of our target range is isolated.
	   We cannot do rollback at this point. */
	offline_isolated_pages(start_pfn, end_pfn);
	/* reset pageblock flags and make the migratetype MOVABLE again */
	undo_isolate_page_range(start_pfn, end_pfn);
	/* removal success */
	zone->present_pages -= offlined_pages;
	zone->zone_pgdat->node_present_pages -= offlined_pages;
	totalram_pages -= offlined_pages;

	setup_per_zone_wmarks();
	calculate_zone_inactive_ratio(zone);
	if (!node_present_pages(node)) {
		node_clear_state(node, N_HIGH_MEMORY);
		kswapd_stop(node);
	}

	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();

	memory_notify(MEM_OFFLINE, &arg);
	unlock_system_sleep();
	return 0;

failed_removal:
	printk(KERN_INFO "memory offlining %lx to %lx failed\n",
		start_pfn, end_pfn);
	memory_notify(MEM_CANCEL_OFFLINE, &arg);
	/* push back to the free area */
	undo_isolate_page_range(start_pfn, end_pfn);

out:
	unlock_system_sleep();
	return ret;
}

int remove_memory(u64 start, u64 size)
{
	unsigned long start_pfn, end_pfn;

	start_pfn = PFN_DOWN(start);
	end_pfn = start_pfn + PFN_DOWN(size);
	return offline_pages(start_pfn, end_pfn, 120 * HZ);
}
#else
int remove_memory(u64 start, u64 size)
{
	return -EINVAL;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
EXPORT_SYMBOL_GPL(remove_memory);
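
/*
 * Illustrative only: remove_memory() offlines a pageblock-aligned
 * physical range with a 120s timeout (see offline_pages() above).  A
 * hypothetical caller that offlines one sparsemem section could be
 * sketched as below; section_nr would come from whatever layer tracks
 * the memory block being removed.
 */
#if 0
static int example_offline_section(unsigned long section_nr)
{
	u64 start = (u64)section_nr_to_pfn(section_nr) << PAGE_SHIFT;
	u64 size = (u64)PAGES_PER_SECTION << PAGE_SHIFT;

	return remove_memory(start, size);
}
#endif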