/*
 * linux/mm/compaction.c
 *
 * Memory compaction for the reduction of external fragmentation. Note that
 * this heavily depends upon page migration to do all the real heavy
 * lifting
 *
 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
 */
#include <linux/cpu.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/sched/signal.h>
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
#include <linux/page-isolation.h>
#include <linux/kasan.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/page_owner.h>
#include "internal.h"

#ifdef CONFIG_COMPACTION
static inline void count_compact_event(enum vm_event_item item)
{
	count_vm_event(item);
}

static inline void count_compact_events(enum vm_event_item item, long delta)
{
	count_vm_events(item, delta);
}
#else
#define count_compact_event(item) do { } while (0)
#define count_compact_events(item, delta) do { } while (0)
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA

#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>

#define block_start_pfn(pfn, order)	round_down(pfn, 1UL << (order))
#define block_end_pfn(pfn, order)	ALIGN((pfn) + 1, 1UL << (order))
#define pageblock_start_pfn(pfn)	block_start_pfn(pfn, pageblock_order)
#define pageblock_end_pfn(pfn)		block_end_pfn(pfn, pageblock_order)

static unsigned long release_freepages(struct list_head *freelist)
{
	struct page *page, *next;
	unsigned long high_pfn = 0;

	list_for_each_entry_safe(page, next, freelist, lru) {
		unsigned long pfn = page_to_pfn(page);
		list_del(&page->lru);
		__free_page(page);
		if (pfn > high_pfn)
			high_pfn = pfn;
	}

	return high_pfn;
}

static void map_pages(struct list_head *list)
{
	unsigned int i, order, nr_pages;
	struct page *page, *next;
	LIST_HEAD(tmp_list);

	list_for_each_entry_safe(page, next, list, lru) {
		list_del(&page->lru);

		order = page_private(page);
		nr_pages = 1 << order;

		post_alloc_hook(page, order, __GFP_MOVABLE);
		if (order)
			split_page(page, order);

		for (i = 0; i < nr_pages; i++) {
			list_add(&page->lru, &tmp_list);
			page++;
		}
	}

	list_splice(&tmp_list, list);
}

#ifdef CONFIG_COMPACTION

int PageMovable(struct page *page)
{
	struct address_space *mapping;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	if (!__PageMovable(page))
		return 0;

	mapping = page_mapping(page);
	if (mapping && mapping->a_ops && mapping->a_ops->isolate_page)
		return 1;

	return 0;
}
EXPORT_SYMBOL(PageMovable);

void __SetPageMovable(struct page *page, struct address_space *mapping)
{
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page);
	page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE);
}
EXPORT_SYMBOL(__SetPageMovable);

void __ClearPageMovable(struct page *page)
{
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageMovable(page), page);
	/*
	 * Clear the registered address_space value while keeping the
	 * PAGE_MAPPING_MOVABLE flag, so that the VM can catch a page released
	 * by the driver after isolation. With it, VM migration doesn't try to
	 * put it back.
	 */
	page->mapping = (void *)((unsigned long)page->mapping &
				PAGE_MAPPING_MOVABLE);
}
EXPORT_SYMBOL(__ClearPageMovable);

/* Do not skip compaction more than 64 times */
#define COMPACT_MAX_DEFER_SHIFT 6

/*
 * Compaction is deferred when compaction fails to result in a page
 * allocation success. 1 << compact_defer_shift compactions are skipped up
 * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
 */
void defer_compaction(struct zone *zone, int order)
{
	zone->compact_considered = 0;
	zone->compact_defer_shift++;

	if (order < zone->compact_order_failed)
		zone->compact_order_failed = order;

	if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
		zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;

	trace_mm_compaction_defer_compaction(zone, order);
}

/* Returns true if compaction should be skipped this time */
bool compaction_deferred(struct zone *zone, int order)
{
	unsigned long defer_limit = 1UL << zone->compact_defer_shift;

	if (order < zone->compact_order_failed)
		return false;

	/* Avoid possible overflow */
	if (++zone->compact_considered > defer_limit)
		zone->compact_considered = defer_limit;

	if (zone->compact_considered >= defer_limit)
		return false;

	trace_mm_compaction_deferred(zone, order);

	return true;
}

/*
 * Update defer tracking counters after successful compaction of given order,
 * which means an allocation either succeeded (alloc_success == true) or is
 * expected to succeed.
 */
void compaction_defer_reset(struct zone *zone, int order,
		bool alloc_success)
{
	if (alloc_success) {
		zone->compact_considered = 0;
		zone->compact_defer_shift = 0;
	}
	if (order >= zone->compact_order_failed)
		zone->compact_order_failed = order + 1;

	trace_mm_compaction_defer_reset(zone, order);
}

/* Returns true if restarting compaction after many failures */
bool compaction_restarting(struct zone *zone, int order)
{
	if (order < zone->compact_order_failed)
		return false;

	return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
		zone->compact_considered >= 1UL << zone->compact_defer_shift;
}

/* Returns true if the pageblock should be scanned for pages to isolate. */
static inline bool isolation_suitable(struct compact_control *cc,
					struct page *page)
{
	if (cc->ignore_skip_hint)
		return true;

	return !get_pageblock_skip(page);
}

static void reset_cached_positions(struct zone *zone)
{
	zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
	zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
	zone->compact_cached_free_pfn =
				pageblock_start_pfn(zone_end_pfn(zone) - 1);
}

/*
 * This function is called to clear all cached information on pageblocks that
 * should be skipped for page isolation when the migrate and free page scanner
 * meet.
 */
static void __reset_isolation_suitable(struct zone *zone)
{
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	unsigned long pfn;

	zone->compact_blockskip_flush = false;

	/* Walk the zone and mark every pageblock as suitable for isolation */
	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		struct page *page;

		cond_resched();

		if (!pfn_valid(pfn))
			continue;

		page = pfn_to_page(pfn);
		if (zone != page_zone(page))
			continue;

		clear_pageblock_skip(page);
	}

	reset_cached_positions(zone);
}

void reset_isolation_suitable(pg_data_t *pgdat)
{
	int zoneid;

	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
		struct zone *zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		/* Only flush if a full compaction finished recently */
		if (zone->compact_blockskip_flush)
			__reset_isolation_suitable(zone);
	}
}

/*
 * If no pages were isolated then mark this pageblock to be skipped in the
 * future. The information is later cleared by __reset_isolation_suitable().
 */
static void update_pageblock_skip(struct compact_control *cc,
			struct page *page, unsigned long nr_isolated,
			bool migrate_scanner)
{
	struct zone *zone = cc->zone;
	unsigned long pfn;

	if (cc->ignore_skip_hint)
		return;

	if (!page)
		return;

	if (nr_isolated)
		return;

	set_pageblock_skip(page);

	pfn = page_to_pfn(page);

	/* Update where async and sync compaction should restart */
	if (migrate_scanner) {
		if (pfn > zone->compact_cached_migrate_pfn[0])
			zone->compact_cached_migrate_pfn[0] = pfn;
		if (cc->mode != MIGRATE_ASYNC &&
		    pfn > zone->compact_cached_migrate_pfn[1])
			zone->compact_cached_migrate_pfn[1] = pfn;
	} else {
		if (pfn < zone->compact_cached_free_pfn)
			zone->compact_cached_free_pfn = pfn;
	}
}
#else
static inline bool isolation_suitable(struct compact_control *cc,
					struct page *page)
{
	return true;
}

static void update_pageblock_skip(struct compact_control *cc,
			struct page *page, unsigned long nr_isolated,
			bool migrate_scanner)
{
}
#endif /* CONFIG_COMPACTION */

/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. For async compaction, back out if the lock cannot
 * be taken immediately. For sync compaction, spin on the lock if needed.
 *
 * Returns true if the lock is held
 * Returns false if the lock is not held and compaction should abort
 */
static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
						struct compact_control *cc)
{
	if (cc->mode == MIGRATE_ASYNC) {
		if (!spin_trylock_irqsave(lock, *flags)) {
			cc->contended = true;
			return false;
		}
	} else {
		spin_lock_irqsave(lock, *flags);
	}

	return true;
}

/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. The lock should be periodically unlocked to avoid
 * having disabled IRQs for a long time, even when there is nobody waiting on
 * the lock. It might also be that allowing the IRQs will result in
 * need_resched() becoming true. If scheduling is needed, async compaction
 * aborts. Sync compaction schedules.
 * Either compaction type will also abort if a fatal signal is pending.
 * In either case if the lock was locked, it is dropped and not regained.
 *
 * Returns true if compaction should abort due to fatal signal pending, or
 * async compaction due to need_resched()
 * Returns false when compaction can continue (sync compaction might have
 * scheduled)
 */
static bool compact_unlock_should_abort(spinlock_t *lock,
		unsigned long flags, bool *locked, struct compact_control *cc)
{
	if (*locked) {
		spin_unlock_irqrestore(lock, flags);
		*locked = false;
	}

	if (fatal_signal_pending(current)) {
		cc->contended = true;
		return true;
	}

	if (need_resched()) {
		if (cc->mode == MIGRATE_ASYNC) {
			cc->contended = true;
			return true;
		}
		cond_resched();
	}

	return false;
}

/*
 * Aside from avoiding lock contention, compaction also periodically checks
 * need_resched() and either schedules in sync compaction or aborts async
 * compaction. This is similar to what compact_unlock_should_abort() does, but
 * is used where no lock is concerned.
 *
 * Returns false when no scheduling was needed, or sync compaction scheduled.
 * Returns true when async compaction should abort.
 */
static inline bool compact_should_abort(struct compact_control *cc)
{
	/* async compaction aborts if contended */
	if (need_resched()) {
		if (cc->mode == MIGRATE_ASYNC) {
			cc->contended = true;
			return true;
		}

		cond_resched();
	}

	return false;
}

/*
 * Isolate free pages onto a private freelist. If @strict is true, will abort
 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
 * (even though it may still end up isolating some pages).
 */
static unsigned long isolate_freepages_block(struct compact_control *cc,
				unsigned long *start_pfn,
				unsigned long end_pfn,
				struct list_head *freelist,
				bool strict)
{
	int nr_scanned = 0, total_isolated = 0;
	struct page *cursor, *valid_page = NULL;
	unsigned long flags = 0;
	bool locked = false;
	unsigned long blockpfn = *start_pfn;
	unsigned int order;

	cursor = pfn_to_page(blockpfn);

	/* Isolate free pages. */
	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
		int isolated;
		struct page *page = cursor;

		/*
		 * Periodically drop the lock (if held) regardless of its
		 * contention, to give a chance to IRQs. Abort if fatal signal
		 * pending or async compaction detects need_resched()
		 */
		if (!(blockpfn % SWAP_CLUSTER_MAX)
		    && compact_unlock_should_abort(&cc->zone->lock, flags,
								&locked, cc))
			break;

		nr_scanned++;
		if (!pfn_valid_within(blockpfn))
			goto isolate_fail;

		if (!valid_page)
			valid_page = page;

		/*
		 * For compound pages such as THP and hugetlbfs, we can save
		 * potentially a lot of iterations if we skip them at once.
		 * The check is racy, but we can consider only valid values
		 * and the only danger is skipping too much.
		 */
		if (PageCompound(page)) {
			unsigned int comp_order = compound_order(page);

			if (likely(comp_order < MAX_ORDER)) {
				blockpfn += (1UL << comp_order) - 1;
				cursor += (1UL << comp_order) - 1;
			}

			goto isolate_fail;
		}

		if (!PageBuddy(page))
			goto isolate_fail;

		/*
		 * If we already hold the lock, we can skip some rechecking.
		 * Note that if we hold the lock now, checked_pageblock was
		 * already set in some previous iteration (or strict is true),
		 * so it is correct to skip the suitable migration target
		 * recheck as well.
		 */
		if (!locked) {
			/*
			 * The zone lock must be held to isolate freepages.
			 * Unfortunately this is a very coarse lock and can be
			 * heavily contended if there are parallel allocations
			 * or parallel compactions. For async compaction do not
			 * spin on the lock and we acquire the lock as late as
			 * possible.
			 */
			locked = compact_trylock_irqsave(&cc->zone->lock,
								&flags, cc);
			if (!locked)
				break;

			/* Recheck this is a buddy page under lock */
			if (!PageBuddy(page))
				goto isolate_fail;
		}

		/* Found a free page, will break it into order-0 pages */
		order = page_order(page);
		isolated = __isolate_free_page(page, order);
		if (!isolated)
			break;
		set_page_private(page, order);

		total_isolated += isolated;
		cc->nr_freepages += isolated;
		list_add_tail(&page->lru, freelist);

		if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
			blockpfn += isolated;
			break;
		}
		/* Advance to the end of split page */
		blockpfn += isolated - 1;
		cursor += isolated - 1;
		continue;

isolate_fail:
		if (strict)
			break;
		else
			continue;

	}

	if (locked)
		spin_unlock_irqrestore(&cc->zone->lock, flags);

	/*
	 * There is a tiny chance that we have read bogus compound_order(),
	 * so be careful to not go outside of the pageblock.
	 */
	if (unlikely(blockpfn > end_pfn))
		blockpfn = end_pfn;

	trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
					nr_scanned, total_isolated);

	/* Record how far we have got within the block */
	*start_pfn = blockpfn;

	/*
	 * If strict isolation is requested by CMA then check that all the
	 * pages requested were isolated. If there were any failures, 0 is
	 * returned and CMA will fail.
	 */
	if (strict && blockpfn < end_pfn)
		total_isolated = 0;

	/* Update the pageblock-skip if the whole pageblock was scanned */
	if (blockpfn == end_pfn)
		update_pageblock_skip(cc, valid_page, total_isolated, false);

	cc->total_free_scanned += nr_scanned;
	if (total_isolated)
		count_compact_events(COMPACTISOLATED, total_isolated);
	return total_isolated;
}

/**
 * isolate_freepages_range() - isolate free pages.
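 * @cc:        Compaction control structure.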
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn:   The one-past-last PFN.
 *
 * Non-free pages, invalid PFNs, or zone boundaries within the
 * [start_pfn, end_pfn) range are considered errors, causing the function to
 * undo its actions and return zero.
 *
 * Otherwise, function returns one-past-the-last PFN of isolated page
 * (which may be greater than end_pfn if end fell in the middle of
 * a free page).
 */
unsigned long
isolate_freepages_range(struct compact_control *cc,
			unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
	LIST_HEAD(freelist);

	pfn = start_pfn;
	block_start_pfn = pageblock_start_pfn(pfn);
	if (block_start_pfn < cc->zone->zone_start_pfn)
		block_start_pfn = cc->zone->zone_start_pfn;
	block_end_pfn = pageblock_end_pfn(pfn);

	for (; pfn < end_pfn; pfn += isolated,
				block_start_pfn = block_end_pfn,
				block_end_pfn += pageblock_nr_pages) {
		/* Protect pfn from changing by isolate_freepages_block */
		unsigned long isolate_start_pfn = pfn;

		block_end_pfn = min(block_end_pfn, end_pfn);

		/*
		 * pfn could pass the block_end_pfn if isolated freepage
		 * is more than pageblock order. In this case, we adjust
		 * scanning range to right one.
		 */
		if (pfn >= block_end_pfn) {
			block_start_pfn = pageblock_start_pfn(pfn);
			block_end_pfn = pageblock_end_pfn(pfn);
			block_end_pfn = min(block_end_pfn, end_pfn);
		}

		if (!pageblock_pfn_to_page(block_start_pfn,
					block_end_pfn, cc->zone))
			break;

		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
						block_end_pfn, &freelist, true);

		/*
		 * In strict mode, isolate_freepages_block() returns 0 if
		 * there are any holes in the block (ie. invalid PFNs or
		 * non-free pages).
		 */
		if (!isolated)
			break;

		/*
		 * If we managed to isolate pages, it is always (1 << n) *
		 * pageblock_nr_pages for some non-negative n. (Max order
		 * page may span two pageblocks).
		 */
	}

	/* __isolate_free_page() does not map the pages */
	map_pages(&freelist);

	if (pfn < end_pfn) {
		/* Loop terminated early, cleanup. */
		release_freepages(&freelist);
		return 0;
	}

	/* We don't use freelists for anything. */
	return pfn;
}

/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(struct zone *zone)
{
	unsigned long active, inactive, isolated;

	inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) +
			node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON);
	active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) +
			node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON);
	isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) +
			node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON);

	return isolated > (inactive + active) / 2;
}

/**
 * isolate_migratepages_block() - isolate all migrate-able pages within
 *				  a single pageblock
 * @cc:		Compaction control structure.
 * @low_pfn:	The first PFN to isolate
 * @end_pfn:	The one-past-the-last PFN to isolate, within the same pageblock
 * @isolate_mode: Isolation mode to be used.
 *
 * Isolate all pages that can be migrated from the range specified by
 * [low_pfn, end_pfn). The range is expected to be within the same pageblock.
 * Returns zero if there is a fatal signal pending, otherwise PFN of the
 * first page that was not scanned (which may be less than, equal to or more
 * than end_pfn).
 *
 * The pages are isolated on cc->migratepages list (not required to be empty),
 * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
 * is neither read nor updated.
 */
static unsigned long
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
			unsigned long end_pfn, isolate_mode_t isolate_mode)
{
	struct zone *zone = cc->zone;
	unsigned long nr_scanned = 0, nr_isolated = 0;
	struct lruvec *lruvec;
	unsigned long flags = 0;
	bool locked = false;
	struct page *page = NULL, *valid_page = NULL;
	unsigned long start_pfn = low_pfn;
	bool skip_on_failure = false;
	unsigned long next_skip_pfn = 0;

	/*
	 * Ensure that there are not too many pages isolated from the LRU
	 * list by either parallel reclaimers or compaction. If there are,
	 * delay for some time until fewer pages are isolated
	 */
	while (unlikely(too_many_isolated(zone))) {
		/* async migration should just abort */
		if (cc->mode == MIGRATE_ASYNC)
			return 0;

		congestion_wait(BLK_RW_ASYNC, HZ/10);

		if (fatal_signal_pending(current))
			return 0;
	}

	if (compact_should_abort(cc))
		return 0;

	if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
		skip_on_failure = true;
		next_skip_pfn = block_end_pfn(low_pfn, cc->order);
	}

	/* Time to isolate some pages for migration */
	for (; low_pfn < end_pfn; low_pfn++) {

		if (skip_on_failure && low_pfn >= next_skip_pfn) {
			/*
			 * We have isolated all migration candidates in the
			 * previous order-aligned block, and did not skip it due
			 * to failure. We should migrate the pages now and
			 * hopefully succeed compaction.
			 */
			if (nr_isolated)
				break;

			/*
			 * We failed to isolate in the previous order-aligned
			 * block. Set the new boundary to the end of the
			 * current block. Note we can't simply increase
			 * next_skip_pfn by 1 << order, as low_pfn might have
			 * been incremented by a higher number due to skipping
			 * a compound or a high-order buddy page in the
			 * previous loop iteration.
			 */
			next_skip_pfn = block_end_pfn(low_pfn, cc->order);
		}

		/*
		 * Periodically drop the lock (if held) regardless of its
		 * contention, to give a chance to IRQs. Abort async compaction
		 * if contended.
		 */
		if (!(low_pfn % SWAP_CLUSTER_MAX)
		    && compact_unlock_should_abort(zone_lru_lock(zone), flags,
								&locked, cc))
			break;

		if (!pfn_valid_within(low_pfn))
			goto isolate_fail;
		nr_scanned++;

		page = pfn_to_page(low_pfn);

		if (!valid_page)
			valid_page = page;

		/*
		 * Skip if free. We read page order here without zone lock
		 * which is generally unsafe, but the race window is small and
		 * the worst thing that can happen is that we skip some
		 * potential isolation targets.
		 */
		if (PageBuddy(page)) {
			unsigned long freepage_order = page_order_unsafe(page);

			/*
			 * Without lock, we cannot be sure that what we got is
			 * a valid page order. Consider only values in the
			 * valid order range to prevent low_pfn overflow.
			 */
			if (freepage_order > 0 && freepage_order < MAX_ORDER)
				low_pfn += (1UL << freepage_order) - 1;
			continue;
		}

		/*
		 * Regardless of being on LRU, compound pages such as THP and
		 * hugetlbfs are not to be compacted. We can potentially save
		 * a lot of iterations if we skip them at once. The check is
		 * racy, but we can consider only valid values and the only
		 * danger is skipping too much.
		 */
		if (PageCompound(page)) {
			unsigned int comp_order = compound_order(page);

			if (likely(comp_order < MAX_ORDER))
				low_pfn += (1UL << comp_order) - 1;

			goto isolate_fail;
		}

		/*
		 * Check may be lockless but that's ok as we recheck later.
		 * It's possible to migrate LRU and non-lru movable pages.
		 * Skip any other type of page
		 */
		if (!PageLRU(page)) {
			/*
			 * __PageMovable can return false positive so we need
			 * to verify it under page_lock.
			 */
			if (unlikely(__PageMovable(page)) &&
					!PageIsolated(page)) {
				if (locked) {
					spin_unlock_irqrestore(zone_lru_lock(zone),
									flags);
					locked = false;
				}

				if (!isolate_movable_page(page, isolate_mode))
					goto isolate_success;
			}

			goto isolate_fail;
		}

		/*
		 * Migration will fail if an anonymous page is pinned in memory,
		 * so avoid taking lru_lock and isolating it unnecessarily in an
		 * admittedly racy check.
		 */
		if (!page_mapping(page) &&
		    page_count(page) > page_mapcount(page))
			goto isolate_fail;

		/*
		 * Only allow to migrate anonymous pages in GFP_NOFS context
		 * because those do not depend on fs locks.
		 */
		if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
			goto isolate_fail;

		/* If we already hold the lock, we can skip some rechecking */
		if (!locked) {
			locked = compact_trylock_irqsave(zone_lru_lock(zone),
								&flags, cc);
			if (!locked)
				break;

			/* Recheck PageLRU and PageCompound under lock */
			if (!PageLRU(page))
				goto isolate_fail;

			/*
			 * Page became compound since the non-locked check,
			 * and it's on LRU. It can only be a THP so the order
			 * is safe to read and it's 0 for tail pages.
			 */
			if (unlikely(PageCompound(page))) {
				low_pfn += (1UL << compound_order(page)) - 1;
				goto isolate_fail;
			}
		}

		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);

		/* Try isolate the page */
		if (__isolate_lru_page(page, isolate_mode) != 0)
			goto isolate_fail;

		VM_BUG_ON_PAGE(PageCompound(page), page);

		/* Successfully isolated */
		del_page_from_lru_list(page, lruvec, page_lru(page));
		inc_node_page_state(page,
				NR_ISOLATED_ANON + page_is_file_cache(page));

isolate_success:
		list_add(&page->lru, &cc->migratepages);
		cc->nr_migratepages++;
		nr_isolated++;

		/*
		 * Record where we could have freed pages by migration and not
		 * yet flushed them to buddy allocator.
		 * - this is the lowest page that was isolated and is likely to
		 *   then be freed by migration.
		 */
		if (!cc->last_migrated_pfn)
			cc->last_migrated_pfn = low_pfn;

		/* Avoid isolating too much */
		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
			++low_pfn;
			break;
		}

		continue;
isolate_fail:
		if (!skip_on_failure)
			continue;

		/*
		 * We have isolated some pages, but then failed. Release them
		 * instead of migrating, as we cannot form the cc->order buddy
		 * page anyway.
		 */
		if (nr_isolated) {
			if (locked) {
				spin_unlock_irqrestore(zone_lru_lock(zone), flags);
				locked = false;
			}
			putback_movable_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
			cc->last_migrated_pfn = 0;
			nr_isolated = 0;
		}

		if (low_pfn < next_skip_pfn) {
			low_pfn = next_skip_pfn - 1;
			/*
			 * The check near the loop beginning would have updated
			 * next_skip_pfn too, but this is a bit simpler.
			 */
			next_skip_pfn += 1UL << cc->order;
		}
	}

	/*
	 * The PageBuddy() check could have potentially brought us outside
	 * the range to be scanned.
	 */
	if (unlikely(low_pfn > end_pfn))
		low_pfn = end_pfn;

	if (locked)
		spin_unlock_irqrestore(zone_lru_lock(zone), flags);

	/*
	 * Update the pageblock-skip information and cached scanner pfn,
	 * if the whole pageblock was scanned without isolating any page.
	 */
	if (low_pfn == end_pfn)
		update_pageblock_skip(cc, valid_page, nr_isolated, true);

	trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
						nr_scanned, nr_isolated);

	cc->total_migrate_scanned += nr_scanned;
	if (nr_isolated)
		count_compact_events(COMPACTISOLATED, nr_isolated);

	return low_pfn;
}

/**
 * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
 * @cc:        Compaction control structure.
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn:   The one-past-last PFN.
 *
 * Returns zero if isolation fails fatally due to e.g. pending signal.
 * Otherwise, function returns one-past-the-last PFN of isolated page
 * (which may be greater than end_pfn if end fell in the middle of a THP page).
 */
unsigned long
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
							unsigned long end_pfn)
{
	unsigned long pfn, block_start_pfn, block_end_pfn;

	/* Scan block by block. First and last block may be incomplete */
	pfn = start_pfn;
	block_start_pfn = pageblock_start_pfn(pfn);
	if (block_start_pfn < cc->zone->zone_start_pfn)
		block_start_pfn = cc->zone->zone_start_pfn;
	block_end_pfn = pageblock_end_pfn(pfn);

	for (; pfn < end_pfn; pfn = block_end_pfn,
				block_start_pfn = block_end_pfn,
				block_end_pfn += pageblock_nr_pages) {

		block_end_pfn = min(block_end_pfn, end_pfn);

		if (!pageblock_pfn_to_page(block_start_pfn,
					block_end_pfn, cc->zone))
			continue;

		pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
							ISOLATE_UNEVICTABLE);

		if (!pfn)
			break;

		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
			break;
	}

	return pfn;
}

#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION

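/* Returns true if the page is within a block suitable for migration from */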
static bool suitable_migration_source(struct compact_control *cc,
						struct page *page)
{
	int block_mt;

	if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
		return true;

	block_mt = get_pageblock_migratetype(page);

	if (cc->migratetype == MIGRATE_MOVABLE)
		return is_migrate_movable(block_mt);
	else
		return block_mt == cc->migratetype;
}

/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct compact_control *cc,
							struct page *page)
{
	/* If the page is a large free page, then disallow migration */
	if (PageBuddy(page)) {
		/*
		 * We are checking page_order without zone->lock taken. But
		 * the only small danger is that we skip a potentially suitable
		 * pageblock, so it's not worth to check order for valid range.
		 */
		if (page_order_unsafe(page) >= pageblock_order)
			return false;
	}

	if (cc->ignore_block_suitable)
		return true;

	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
	if (is_migrate_movable(get_pageblock_migratetype(page)))
		return true;

	/* Otherwise skip the block */
	return false;
}

/*
 * Test whether the free scanner has reached the same or lower pageblock than
 * the migration scanner, and compaction should thus terminate.
 */
static inline bool compact_scanners_met(struct compact_control *cc)
{
	return (cc->free_pfn >> pageblock_order)
		<= (cc->migrate_pfn >> pageblock_order);
}

/*
 * Based on information in the current compact_control, find blocks
 * suitable for isolating free pages from and then isolate them.
 */
static void isolate_freepages(struct compact_control *cc)
{
	struct zone *zone = cc->zone;
	struct page *page;
	unsigned long block_start_pfn;	/* start of current pageblock */
	unsigned long isolate_start_pfn; /* exact pfn we start at */
	unsigned long block_end_pfn;	/* end of current pageblock */
	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
	struct list_head *freelist = &cc->freepages;

	/*
	 * Initialise the free scanner. The starting point is where we last
	 * successfully isolated from, zone-cached value, or the end of the
	 * zone when isolating for the first time. For looping we also need
	 * this pfn aligned down to the pageblock boundary, because we do
	 * block_start_pfn -= pageblock_nr_pages in the for loop.
	 * For ending point, take care when isolating in the last pageblock
	 * of a zone which ends in the middle of a pageblock.
	 * The low boundary is the end of the pageblock the migration scanner
	 * is using.
	 */
	isolate_start_pfn = cc->free_pfn;
	block_start_pfn = pageblock_start_pfn(cc->free_pfn);
	block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
						zone_end_pfn(zone));
	low_pfn = pageblock_end_pfn(cc->migrate_pfn);

	/*
	 * Isolate free pages until enough are available to migrate the
	 * pages on cc->migratepages. We stop searching if the migrate
	 * and free page scanners meet or enough free pages are isolated.
	 */
	for (; block_start_pfn >= low_pfn;
				block_end_pfn = block_start_pfn,
				block_start_pfn -= pageblock_nr_pages,
				isolate_start_pfn = block_start_pfn) {
		/*
		 * This can iterate a massively long zone without finding any
		 * suitable migration targets, so periodically check if we need
		 * to schedule, or even abort async compaction.
		 */
		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
						&& compact_should_abort(cc))
			break;

		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
									zone);
		if (!page)
			continue;

		/* Check the block is suitable for migration */
		if (!suitable_migration_target(cc, page))
			continue;

		/* If isolation recently failed, do not retry */
		if (!isolation_suitable(cc, page))
			continue;

		/* Found a block suitable for isolating free pages from. */
		isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn,
					freelist, false);

		/*
		 * If we isolated enough freepages, or aborted due to lock
		 * contention, terminate.
		 */
		if ((cc->nr_freepages >= cc->nr_migratepages)
							|| cc->contended) {
			if (isolate_start_pfn >= block_end_pfn) {
				/*
				 * Restart at previous pageblock if more
				 * freepages can be isolated next time.
				 */
				isolate_start_pfn =
					block_start_pfn - pageblock_nr_pages;
			}
			break;
		} else if (isolate_start_pfn < block_end_pfn) {
			/*
			 * If isolation failed early, do not continue
			 * needlessly.
			 */
			break;
		}
	}

	/* __isolate_free_page() does not map the pages */
	map_pages(freelist);

	/*
	 * Record where the free scanner will restart next time. Either we
	 * broke from the loop and set isolate_start_pfn based on the last
	 * call to isolate_freepages_block(), or we met the migration scanner
	 * and the loop terminated due to isolate_start_pfn < low_pfn
	 */
	cc->free_pfn = isolate_start_pfn;
}

/*
 * This is a migrate-callback that "allocates" freepages by taking pages
 * from the isolated freelists in the block we are migrating to.
 */
static struct page *compaction_alloc(struct page *migratepage,
					unsigned long data,
					int **result)
{
	struct compact_control *cc = (struct compact_control *)data;
	struct page *freepage;

	/*
	 * Isolate free pages if necessary, and if we are not aborting due to
	 * contention.
	 */
	if (list_empty(&cc->freepages)) {
		if (!cc->contended)
			isolate_freepages(cc);

		if (list_empty(&cc->freepages))
			return NULL;
	}

	freepage = list_entry(cc->freepages.next, struct page, lru);
	list_del(&freepage->lru);
	cc->nr_freepages--;

	return freepage;
}

/*
 * This is a migrate-callback that "frees" freepages back to the isolated
 * freelist. All pages on the freelist are from the same zone, so there is no
 * special handling needed for NUMA.
 */
static void compaction_free(struct page *page, unsigned long data)
{
	struct compact_control *cc = (struct compact_control *)data;

	list_add(&page->lru, &cc->freepages);
	cc->nr_freepages++;
}

/* possible outcome of isolate_migratepages */
typedef enum {
	ISOLATE_ABORT,		/* Abort compaction now */
	ISOLATE_NONE,		/* No pages isolated, continue scanning */
	ISOLATE_SUCCESS,	/* Pages isolated, migrate */
} isolate_migrate_t;

/*
 * Allow userspace to control policy on scanning the unevictable LRU for
 * compactable pages.
 */
int sysctl_compact_unevictable_allowed __read_mostly = 1;

/*
 * Isolate all pages that can be migrated from the first suitable block,
 * starting at the block pointed to by the migrate scanner pfn within
 * compact_control.
 */
static isolate_migrate_t isolate_migratepages(struct zone *zone,
					struct compact_control *cc)
{
	unsigned long block_start_pfn;
	unsigned long block_end_pfn;
	unsigned long low_pfn;
	struct page *page;
	const isolate_mode_t isolate_mode =
		(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
		(cc->mode != MIGRATE_SYNC ?
			ISOLATE_ASYNC_MIGRATE : 0);

	/*
	 * Start at where we last stopped, or beginning of the zone as
	 * initialized by compact_zone()
	 */
	low_pfn = cc->migrate_pfn;
	block_start_pfn = pageblock_start_pfn(low_pfn);
	if (block_start_pfn < zone->zone_start_pfn)
		block_start_pfn = zone->zone_start_pfn;

	/* Only scan within a pageblock boundary */
	block_end_pfn = pageblock_end_pfn(low_pfn);

	/*
	 * Iterate over whole pageblocks until we find the first suitable.
	 * Do not cross the free scanner.
	 */
	for (; block_end_pfn <= cc->free_pfn;
			low_pfn = block_end_pfn,
			block_start_pfn = block_end_pfn,
			block_end_pfn += pageblock_nr_pages) {

		/*
		 * This can potentially iterate a massively long zone with
		 * many pageblocks unsuitable, so periodically check if we
		 * need to schedule, or even abort async compaction.
		 */
		if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
						&& compact_should_abort(cc))
			break;

		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
									zone);
		if (!page)
			continue;

		/* If isolation recently failed, do not retry */
		if (!isolation_suitable(cc, page))
			continue;

		/*
		 * For async compaction, also only scan in MOVABLE blocks.
		 * Async compaction is optimistic to see if the minimum amount
		 * of work satisfies the allocation.
		 */
		if (!suitable_migration_source(cc, page))
			continue;

		/* Perform the isolation */
		low_pfn = isolate_migratepages_block(cc, low_pfn,
						block_end_pfn, isolate_mode);

		if (!low_pfn || cc->contended)
			return ISOLATE_ABORT;

		/*
		 * Either we isolated something and proceed with migration. Or
		 * we failed and compact_zone should decide if we should
		 * continue or not.
		 */
		break;
	}

	/* Record where migration scanner will be restarted. */
	cc->migrate_pfn = low_pfn;

	return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}

/*
 * order == -1 is expected when compacting via
 * /proc/sys/vm/compact_memory
 */
static inline bool is_via_compact_memory(int order)
{
	return order == -1;
}

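/*
 * Check whether the compaction run can stop: the scanners have met, or a free
 * page of the requested order (or a suitable fallback) has become available.
 */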
static enum compact_result __compact_finished(struct zone *zone,
						struct compact_control *cc)
{
	unsigned int order;
	const int migratetype = cc->migratetype;

	if (cc->contended || fatal_signal_pending(current))
		return COMPACT_CONTENDED;

	/* Compaction run completes if the migrate and free scanner meet */
	if (compact_scanners_met(cc)) {
		/* Let the next compaction start anew. */
		reset_cached_positions(zone);

		/*
		 * Mark that the PG_migrate_skip information should be cleared
		 * by kswapd when it goes to sleep. kcompactd does not set the
		 * flag itself as the decision to clear it should be based
		 * directly on an allocation request.
		 */
		if (cc->direct_compaction)
			zone->compact_blockskip_flush = true;

		if (cc->whole_zone)
			return COMPACT_COMPLETE;
		else
			return COMPACT_PARTIAL_SKIPPED;
	}

	if (is_via_compact_memory(cc->order))
		return COMPACT_CONTINUE;

	if (cc->finishing_block) {
		/*
		 * We have finished the pageblock, but better check again that
		 * we really succeeded.
		 */
		if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
			cc->finishing_block = false;
		else
			return COMPACT_CONTINUE;
	}

	/* Direct compactor: Is a suitable page free? */
	for (order = cc->order; order < MAX_ORDER; order++) {
		struct free_area *area = &zone->free_area[order];
		bool can_steal;

		/* Job done if page is free of the right migratetype */
		if (!list_empty(&area->free_list[migratetype]))
			return COMPACT_SUCCESS;

#ifdef CONFIG_CMA
		/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
		if (migratetype == MIGRATE_MOVABLE &&
			!list_empty(&area->free_list[MIGRATE_CMA]))
			return COMPACT_SUCCESS;
#endif
		/*
		 * Job done if allocation would steal freepages from
		 * other migratetype buddy lists.
		 */
		if (find_suitable_fallback(area, order, migratetype,
						true, &can_steal) != -1) {

			/* movable pages are OK in any pageblock */
			if (migratetype == MIGRATE_MOVABLE)
				return COMPACT_SUCCESS;

			/*
			 * We are stealing for a non-movable allocation. Make
			 * sure we finish compacting the current pageblock
			 * first so it is as free as possible and we won't
			 * have to steal another one soon. This only applies
			 * to sync compaction, as async compaction operates
			 * on pageblocks of the same migratetype.
			 */
			if (cc->mode == MIGRATE_ASYNC ||
					IS_ALIGNED(cc->migrate_pfn,
							pageblock_nr_pages)) {
				return COMPACT_SUCCESS;
			}

			cc->finishing_block = true;
			return COMPACT_CONTINUE;
		}
	}

	return COMPACT_NO_SUITABLE_PAGE;
}

static enum compact_result compact_finished(struct zone *zone,
			struct compact_control *cc)
{
	int ret;

	ret = __compact_finished(zone, cc);
	trace_mm_compaction_finished(zone, cc->order, ret);
	if (ret == COMPACT_NO_SUITABLE_PAGE)
		ret = COMPACT_CONTINUE;

	return ret;
}

/*
 * compaction_suitable: Is this suitable to run compaction on this zone now?
 * Returns
 *   COMPACT_SKIPPED  - If there are too few free pages for compaction
 *   COMPACT_SUCCESS  - If the allocation would succeed without compaction
 *   COMPACT_CONTINUE - If compaction should run now
 */
static enum compact_result __compaction_suitable(struct zone *zone, int order,
					unsigned int alloc_flags,
					int classzone_idx,
					unsigned long wmark_target)
{
	unsigned long watermark;

	if (is_via_compact_memory(order))
		return COMPACT_CONTINUE;

	watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
	/*
	 * If watermarks for high-order allocation are already met, there
	 * should be no need for compaction at all.
	 */
	if (zone_watermark_ok(zone, order, watermark, classzone_idx,
								alloc_flags))
		return COMPACT_SUCCESS;

	/*
	 * Watermarks for order-0 must be met for compaction to be able to
	 * isolate free pages for migration targets. This means that the
	 * watermark and alloc_flags have to match, or be more pessimistic than
	 * the check in __isolate_free_page(). We don't use the direct
	 * compactor's alloc_flags, as they are not relevant for freepage
	 * isolation. We however do use the direct compactor's classzone_idx to
	 * skip over zones where lowmem reserves would prevent allocation even
	 * if compaction succeeds.
	 * For costly orders, we require low watermark instead of min for
	 * compaction to proceed to increase its chances.
	 * ALLOC_CMA is used, as pages in CMA pageblocks are considered
	 * suitable migration targets
	 */
	watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
				low_wmark_pages(zone) : min_wmark_pages(zone);
	watermark += compact_gap(order);
	if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
						ALLOC_CMA, wmark_target))
		return COMPACT_SKIPPED;

	return COMPACT_CONTINUE;
}

enum compact_result compaction_suitable(struct zone *zone, int order,
					unsigned int alloc_flags,
					int classzone_idx)
{
	enum compact_result ret;
	int fragindex;

	ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
				    zone_page_state(zone, NR_FREE_PAGES));
	/*
	 * fragmentation index determines if allocation failures are due to
	 * low memory or external fragmentation
	 *
	 * index of -1000 would imply allocations might succeed depending on
	 * watermarks, but we already failed the high-order watermark check
	 * index towards 0 implies failure is due to lack of memory
	 * index towards 1000 implies failure is due to fragmentation
	 *
	 * Only compact if a failure would be due to fragmentation. Also
	 * ignore fragindex for non-costly orders where the alternative to
	 * a successful reclaim/compaction is OOM. Fragindex and the
	 * vm.extfrag_threshold sysctl are meant as a heuristic to prevent
	 * excessive compaction for costly orders, but it should not be at the
	 * expense of system stability.
	 */
	if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) {
		fragindex = fragmentation_index(zone, order);
		if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
			ret = COMPACT_NOT_SUITABLE_ZONE;
	}

	trace_mm_compaction_suitable(zone, order, ret);
	if (ret == COMPACT_NOT_SUITABLE_ZONE)
		ret = COMPACT_SKIPPED;

	return ret;
}

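/*
 * Check whether at least one zone in the allocation's zonelist could satisfy
 * __compaction_suitable() if the currently reclaimable memory were freed,
 * i.e. whether retrying reclaim followed by compaction makes sense.
 */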
bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
		int alloc_flags)
{
	struct zone *zone;
	struct zoneref *z;

	/*
	 * Make sure at least one zone would pass __compaction_suitable if we
	 * continue retrying the reclaim.
	 */
	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
					ac->nodemask) {
		unsigned long available;
		enum compact_result compact_result;

		/*
		 * Do not consider all the reclaimable memory because we do not
		 * want to thrash just for a single high-order allocation which
		 * is not even guaranteed to appear even if __compaction_suitable
		 * is happy about the watermark check.
		 */
		available = zone_reclaimable_pages(zone) / order;
		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
		compact_result = __compaction_suitable(zone, order, alloc_flags,
				ac_classzone_idx(ac), available);
		if (compact_result != COMPACT_SKIPPED)
			return true;
	}

	return false;
}

static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
{
	enum compact_result ret;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	const bool sync = cc->mode != MIGRATE_ASYNC;

	cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
	ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
							cc->classzone_idx);
	/* Compaction is likely to fail */
	if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
		return ret;

	/* huh, compaction_suitable is returning something unexpected */
	VM_BUG_ON(ret != COMPACT_CONTINUE);

	/*
	 * Clear pageblock skip if there were failures recently and compaction
	 * is about to be retried after being deferred.
	 */
	if (compaction_restarting(zone, cc->order))
		__reset_isolation_suitable(zone);

	/*
	 * Setup to move all movable pages to the end of the zone. Use cached
	 * information on where the scanners should start (unless we explicitly
	 * want to compact the whole zone), but check that it is initialised
	 * by ensuring the values are within zone boundaries.
	 */
	if (cc->whole_zone) {
		cc->migrate_pfn = start_pfn;
		cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
	} else {
		cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
		cc->free_pfn = zone->compact_cached_free_pfn;
		if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
			cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
			zone->compact_cached_free_pfn = cc->free_pfn;
		}
		if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
			cc->migrate_pfn = start_pfn;
			zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
			zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
		}

		if (cc->migrate_pfn == start_pfn)
			cc->whole_zone = true;
	}

	cc->last_migrated_pfn = 0;

	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
				cc->free_pfn, end_pfn, sync);

	migrate_prep_local();

	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
		int err;

		switch (isolate_migratepages(zone, cc)) {
		case ISOLATE_ABORT:
			ret = COMPACT_CONTENDED;
			putback_movable_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
			goto out;
		case ISOLATE_NONE:
			/*
			 * We haven't isolated and migrated anything, but
			 * there might still be unflushed migrations from
			 * previous cc->order aligned block.
			 */
			goto check_drain;
		case ISOLATE_SUCCESS:
			;
		}

		err = migrate_pages(&cc->migratepages, compaction_alloc,
				compaction_free, (unsigned long)cc, cc->mode,
				MR_COMPACTION);

		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
							&cc->migratepages);

		/* All pages were either migrated or will be released */
		cc->nr_migratepages = 0;
		if (err) {
			putback_movable_pages(&cc->migratepages);
			/*
			 * migrate_pages() may return -ENOMEM when scanners meet
			 * and we want compact_finished() to detect it
			 */
			if (err == -ENOMEM && !compact_scanners_met(cc)) {
				ret = COMPACT_CONTENDED;
				goto out;
			}
			/*
			 * We failed to migrate at least one page in the current
			 * order-aligned block, so skip the rest of it.
			 */
			if (cc->direct_compaction &&
						(cc->mode == MIGRATE_ASYNC)) {
				cc->migrate_pfn = block_end_pfn(
						cc->migrate_pfn - 1, cc->order);
				/* Draining pcplists is useless in this case */
				cc->last_migrated_pfn = 0;

			}
		}

check_drain:
		/*
		 * Has the migration scanner moved away from the previous
		 * cc->order aligned block where we migrated from? If yes,
		 * flush the pages that were freed, so that they can merge and
		 * compact_finished() can detect immediately if allocation
		 * would succeed.
		 */
		if (cc->order > 0 && cc->last_migrated_pfn) {
			int cpu;
			unsigned long current_block_start =
				block_start_pfn(cc->migrate_pfn, cc->order);

			if (cc->last_migrated_pfn < current_block_start) {
				cpu = get_cpu();
				lru_add_drain_cpu(cpu);
				drain_local_pages(zone);
				put_cpu();
				/* No more flushing until we migrate again */
				cc->last_migrated_pfn = 0;
			}
		}

	}

out:
	/*
	 * Release free pages and update where the free scanner should restart,
	 * so we don't leave any returned pages behind in the next attempt.
	 */
	if (cc->nr_freepages > 0) {
		unsigned long free_pfn = release_freepages(&cc->freepages);

		cc->nr_freepages = 0;
		VM_BUG_ON(free_pfn == 0);
		/* The cached pfn is always the first in a pageblock */
		free_pfn = pageblock_start_pfn(free_pfn);
		/*
		 * Only go back, not forward. The cached pfn might have been
		 * already reset to zone end in compact_finished()
		 */
		if (free_pfn > zone->compact_cached_free_pfn)
			zone->compact_cached_free_pfn = free_pfn;
	}

	count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
	count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned);

	trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
				cc->free_pfn, end_pfn, sync, ret);

	return ret;
}

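/*
 * Set up a compact_control for a single zone on behalf of a direct compactor
 * and run compact_zone() on it.
 */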
static enum compact_result compact_zone_order(struct zone *zone, int order,
		gfp_t gfp_mask, enum compact_priority prio,
		unsigned int alloc_flags, int classzone_idx)
{
	enum compact_result ret;
	struct compact_control cc = {
		.nr_freepages = 0,
		.nr_migratepages = 0,
		.total_migrate_scanned = 0,
		.total_free_scanned = 0,
		.order = order,
		.gfp_mask = gfp_mask,
		.zone = zone,
		.mode = (prio == COMPACT_PRIO_ASYNC) ?
					MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT,
		.alloc_flags = alloc_flags,
		.classzone_idx = classzone_idx,
		.direct_compaction = true,
		.whole_zone = (prio == MIN_COMPACT_PRIORITY),
		.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
		.ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
	};
	INIT_LIST_HEAD(&cc.freepages);
	INIT_LIST_HEAD(&cc.migratepages);

	ret = compact_zone(zone, &cc);

	VM_BUG_ON(!list_empty(&cc.freepages));
	VM_BUG_ON(!list_empty(&cc.migratepages));

	return ret;
}

int sysctl_extfrag_threshold = 500;

/**
 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
 * @gfp_mask: The GFP mask of the current allocation
 * @order: The order of the current allocation
 * @alloc_flags: The allocation flags of the current allocation
 * @ac: The context of current allocation
 * @prio: Determines how hard direct compaction should try to succeed
 *
 * This is the main entry point for direct page compaction.
 */
enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
		unsigned int alloc_flags, const struct alloc_context *ac,
		enum compact_priority prio)
{
	int may_perform_io = gfp_mask & __GFP_IO;
	struct zoneref *z;
	struct zone *zone;
	enum compact_result rc = COMPACT_SKIPPED;

	/*
	 * Check if the GFP flags allow compaction - GFP_NOIO is really
	 * tricky context because the migration might require IO
	 */
	if (!may_perform_io)
		return COMPACT_SKIPPED;

	trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);

	/* Compact each zone in the list */
	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
								ac->nodemask) {
		enum compact_result status;

		if (prio > MIN_COMPACT_PRIORITY
					&& compaction_deferred(zone, order)) {
			rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
			continue;
		}

		status = compact_zone_order(zone, order, gfp_mask, prio,
					alloc_flags, ac_classzone_idx(ac));
		rc = max(status, rc);

		/* The allocation should succeed, stop compacting */
		if (status == COMPACT_SUCCESS) {
			/*
			 * We think the allocation will succeed in this zone,
			 * but it is not certain, hence the false. The caller
			 * will repeat this with true if allocation indeed
			 * succeeds in this zone.
			 */
			compaction_defer_reset(zone, order, false);

			break;
		}

		if (prio != COMPACT_PRIO_ASYNC && (status == COMPACT_COMPLETE ||
					status == COMPACT_PARTIAL_SKIPPED))
			/*
			 * We think that allocation won't succeed in this zone
			 * so we defer compaction there. If it ends up
			 * succeeding after all, it will be reset.
			 */
			defer_compaction(zone, order);

		/*
		 * We might have stopped compacting due to need_resched() in
		 * async compaction, or due to a fatal signal detected.
In that 1780 * case do not try further zones 1781 */ 1782 if ((prio == COMPACT_PRIO_ASYNC && need_resched()) 1783 || fatal_signal_pending(current)) 1784 break; 1785 } 1786 1787 return rc; 1788 } 1789 1790 1791 /* Compact all zones within a node */ 1792 static void compact_node(int nid) 1793 { 1794 pg_data_t *pgdat = NODE_DATA(nid); 1795 int zoneid; 1796 struct zone *zone; 1797 struct compact_control cc = { 1798 .order = -1, 1799 .total_migrate_scanned = 0, 1800 .total_free_scanned = 0, 1801 .mode = MIGRATE_SYNC, 1802 .ignore_skip_hint = true, 1803 .whole_zone = true, 1804 .gfp_mask = GFP_KERNEL, 1805 }; 1806 1807 1808 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { 1809 1810 zone = &pgdat->node_zones[zoneid]; 1811 if (!populated_zone(zone)) 1812 continue; 1813 1814 cc.nr_freepages = 0; 1815 cc.nr_migratepages = 0; 1816 cc.zone = zone; 1817 INIT_LIST_HEAD(&cc.freepages); 1818 INIT_LIST_HEAD(&cc.migratepages); 1819 1820 compact_zone(zone, &cc); 1821 1822 VM_BUG_ON(!list_empty(&cc.freepages)); 1823 VM_BUG_ON(!list_empty(&cc.migratepages)); 1824 } 1825 } 1826 1827 /* Compact all nodes in the system */ 1828 static void compact_nodes(void) 1829 { 1830 int nid; 1831 1832 /* Flush pending updates to the LRU lists */ 1833 lru_add_drain_all(); 1834 1835 for_each_online_node(nid) 1836 compact_node(nid); 1837 } 1838 1839 /* The written value is actually unused, all memory is compacted */ 1840 int sysctl_compact_memory; 1841 1842 /* 1843 * This is the entry point for compacting all nodes via 1844 * /proc/sys/vm/compact_memory 1845 */ 1846 int sysctl_compaction_handler(struct ctl_table *table, int write, 1847 void __user *buffer, size_t *length, loff_t *ppos) 1848 { 1849 if (write) 1850 compact_nodes(); 1851 1852 return 0; 1853 } 1854 1855 int sysctl_extfrag_handler(struct ctl_table *table, int write, 1856 void __user *buffer, size_t *length, loff_t *ppos) 1857 { 1858 proc_dointvec_minmax(table, write, buffer, length, ppos); 1859 1860 return 0; 1861 } 1862 1863 #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) 1864 static ssize_t sysfs_compact_node(struct device *dev, 1865 struct device_attribute *attr, 1866 const char *buf, size_t count) 1867 { 1868 int nid = dev->id; 1869 1870 if (nid >= 0 && nid < nr_node_ids && node_online(nid)) { 1871 /* Flush pending updates to the LRU lists */ 1872 lru_add_drain_all(); 1873 1874 compact_node(nid); 1875 } 1876 1877 return count; 1878 } 1879 static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node); 1880 1881 int compaction_register_node(struct node *node) 1882 { 1883 return device_create_file(&node->dev, &dev_attr_compact); 1884 } 1885 1886 void compaction_unregister_node(struct node *node) 1887 { 1888 return device_remove_file(&node->dev, &dev_attr_compact); 1889 } 1890 #endif /* CONFIG_SYSFS && CONFIG_NUMA */ 1891 1892 static inline bool kcompactd_work_requested(pg_data_t *pgdat) 1893 { 1894 return pgdat->kcompactd_max_order > 0 || kthread_should_stop(); 1895 } 1896 1897 static bool kcompactd_node_suitable(pg_data_t *pgdat) 1898 { 1899 int zoneid; 1900 struct zone *zone; 1901 enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx; 1902 1903 for (zoneid = 0; zoneid <= classzone_idx; zoneid++) { 1904 zone = &pgdat->node_zones[zoneid]; 1905 1906 if (!populated_zone(zone)) 1907 continue; 1908 1909 if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, 1910 classzone_idx) == COMPACT_CONTINUE) 1911 return true; 1912 } 1913 1914 return false; 1915 } 1916 1917 static void kcompactd_do_work(pg_data_t *pgdat) 1918 { 1919 /* 1920 * With no special 
static inline bool kcompactd_work_requested(pg_data_t *pgdat)
{
	return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
}

static bool kcompactd_node_suitable(pg_data_t *pgdat)
{
	int zoneid;
	struct zone *zone;
	enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;

	for (zoneid = 0; zoneid <= classzone_idx; zoneid++) {
		zone = &pgdat->node_zones[zoneid];

		if (!populated_zone(zone))
			continue;

		if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
					classzone_idx) == COMPACT_CONTINUE)
			return true;
	}

	return false;
}

static void kcompactd_do_work(pg_data_t *pgdat)
{
	/*
	 * With no special task, compact all zones so that a page of requested
	 * order is allocatable.
	 */
	int zoneid;
	struct zone *zone;
	struct compact_control cc = {
		.order = pgdat->kcompactd_max_order,
		.total_migrate_scanned = 0,
		.total_free_scanned = 0,
		.classzone_idx = pgdat->kcompactd_classzone_idx,
		.mode = MIGRATE_SYNC_LIGHT,
		.ignore_skip_hint = true,
		.gfp_mask = GFP_KERNEL,
	};
	trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
					cc.classzone_idx);
	count_compact_event(KCOMPACTD_WAKE);

	for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
		int status;

		zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		if (compaction_deferred(zone, cc.order))
			continue;

		if (compaction_suitable(zone, cc.order, 0, zoneid) !=
					COMPACT_CONTINUE)
			continue;

		cc.nr_freepages = 0;
		cc.nr_migratepages = 0;
		cc.total_migrate_scanned = 0;
		cc.total_free_scanned = 0;
		cc.zone = zone;
		INIT_LIST_HEAD(&cc.freepages);
		INIT_LIST_HEAD(&cc.migratepages);

		if (kthread_should_stop())
			return;
		status = compact_zone(zone, &cc);

		if (status == COMPACT_SUCCESS) {
			compaction_defer_reset(zone, cc.order, false);
		} else if (status == COMPACT_PARTIAL_SKIPPED ||
			   status == COMPACT_COMPLETE) {
			/*
			 * We use sync migration mode here, so we defer like
			 * sync direct compaction does.
			 */
			defer_compaction(zone, cc.order);
		}

		count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
				     cc.total_migrate_scanned);
		count_compact_events(KCOMPACTD_FREE_SCANNED,
				     cc.total_free_scanned);

		VM_BUG_ON(!list_empty(&cc.freepages));
		VM_BUG_ON(!list_empty(&cc.migratepages));
	}

	/*
	 * Regardless of success, we are done until woken up next. But remember
	 * the requested order/classzone_idx in case it was higher/tighter than
	 * our current ones
	 */
	if (pgdat->kcompactd_max_order <= cc.order)
		pgdat->kcompactd_max_order = 0;
	if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
		pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
}

void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
{
	if (!order)
		return;

	if (pgdat->kcompactd_max_order < order)
		pgdat->kcompactd_max_order = order;

	/*
	 * Pairs with implicit barrier in wait_event_freezable()
	 * such that wakeups are not missed in the lockless
	 * waitqueue_active() call.
	 */
	smp_acquire__after_ctrl_dep();

	if (pgdat->kcompactd_classzone_idx > classzone_idx)
		pgdat->kcompactd_classzone_idx = classzone_idx;

	if (!waitqueue_active(&pgdat->kcompactd_wait))
		return;

	if (!kcompactd_node_suitable(pgdat))
		return;

	trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
					classzone_idx);
	wake_up_interruptible(&pgdat->kcompactd_wait);
}

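/*
 * Summary of the wake/sleep handshake (descriptive only): a waker, typically
 * reclaim, records its request in kcompactd_max_order and
 * kcompactd_classzone_idx via wakeup_kcompactd() above; the kcompactd thread
 * below sleeps in wait_event_freezable() until kcompactd_work_requested()
 * sees a non-zero order (or the thread is being stopped), and then hands the
 * recorded request to kcompactd_do_work().
 */
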
/*
 * The background compaction daemon, started as a kernel thread
 * from the init process.
 */
static int kcompactd(void *p)
{
	pg_data_t *pgdat = (pg_data_t *)p;
	struct task_struct *tsk = current;

	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

	if (!cpumask_empty(cpumask))
		set_cpus_allowed_ptr(tsk, cpumask);

	set_freezable();

	pgdat->kcompactd_max_order = 0;
	pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;

	while (!kthread_should_stop()) {
		trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
		wait_event_freezable(pgdat->kcompactd_wait,
				kcompactd_work_requested(pgdat));

		kcompactd_do_work(pgdat);
	}

	return 0;
}

/*
 * This kcompactd start function will be called by init and node-hot-add.
 * On node-hot-add, kcompactd will be moved to the proper cpus if cpus are
 * hot-added.
 */
int kcompactd_run(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	int ret = 0;

	if (pgdat->kcompactd)
		return 0;

	pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
	if (IS_ERR(pgdat->kcompactd)) {
		pr_err("Failed to start kcompactd on node %d\n", nid);
		ret = PTR_ERR(pgdat->kcompactd);
		pgdat->kcompactd = NULL;
	}
	return ret;
}

/*
 * Called by memory hotplug when all memory in a node is offlined. Caller must
 * hold mem_hotplug_begin/end().
 */
void kcompactd_stop(int nid)
{
	struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;

	if (kcompactd) {
		kthread_stop(kcompactd);
		NODE_DATA(nid)->kcompactd = NULL;
	}
}

/*
 * It's optimal to keep kcompactd threads on the same CPUs as their memory,
 * but not required for correctness. So if the last cpu in a node goes away,
 * the threads are allowed to run anywhere; as the first one comes back,
 * their cpu bindings are restored.
 */
static int kcompactd_cpu_online(unsigned int cpu)
{
	int nid;

	for_each_node_state(nid, N_MEMORY) {
		pg_data_t *pgdat = NODE_DATA(nid);
		const struct cpumask *mask;

		mask = cpumask_of_node(pgdat->node_id);

		if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
			/* One of our CPUs online: restore mask */
			set_cpus_allowed_ptr(pgdat->kcompactd, mask);
	}
	return 0;
}

static int __init kcompactd_init(void)
{
	int nid;
	int ret;

	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					"mm/compaction:online",
					kcompactd_cpu_online, NULL);
	if (ret < 0) {
		pr_err("kcompactd: failed to register hotplug callbacks.\n");
		return ret;
	}

	for_each_node_state(nid, N_MEMORY)
		kcompactd_run(nid);
	return 0;
}
subsys_initcall(kcompactd_init)

#endif /* CONFIG_COMPACTION */