/*
 * linux/mm/compaction.c
 *
 * Memory compaction for the reduction of external fragmentation. Note that
 * this heavily depends upon page migration to do all the real heavy
 * lifting.
 *
 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
 */
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
#include <linux/balloon_compaction.h>
#include <linux/page-isolation.h>
#include <linux/kasan.h>
#include "internal.h"

#ifdef CONFIG_COMPACTION
static inline void count_compact_event(enum vm_event_item item)
{
	count_vm_event(item);
}

static inline void count_compact_events(enum vm_event_item item, long delta)
{
	count_vm_events(item, delta);
}
#else
#define count_compact_event(item) do { } while (0)
#define count_compact_events(item, delta) do { } while (0)
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
#ifdef CONFIG_TRACEPOINTS
static const char *const compaction_status_string[] = {
	"deferred",
	"skipped",
	"continue",
	"partial",
	"complete",
	"no_suitable_page",
	"not_suitable_zone",
};
#endif

#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>

static unsigned long release_freepages(struct list_head *freelist)
{
	struct page *page, *next;
	unsigned long high_pfn = 0;

	list_for_each_entry_safe(page, next, freelist, lru) {
		unsigned long pfn = page_to_pfn(page);
		list_del(&page->lru);
		__free_page(page);
		if (pfn > high_pfn)
			high_pfn = pfn;
	}

	return high_pfn;
}

static void map_pages(struct list_head *list)
{
	struct page *page;

	list_for_each_entry(page, list, lru) {
		arch_alloc_page(page, 0);
		kernel_map_pages(page, 1, 1);
		kasan_alloc_pages(page, 0);
	}
}

static inline bool migrate_async_suitable(int migratetype)
{
	return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
}

/*
 * Check that the whole (or subset of) a pageblock given by the interval of
 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
 * with the migration or free compaction scanner. The scanners then need to
 * use only pfn_valid_within() check for arches that allow holes within
 * pageblocks.
 *
 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
 *
 * It's possible on some configurations to have a setup like node0 node1 node0
 * i.e. it's possible that all pages within a zone's range of pages do not
 * belong to a single zone. We assume that a border between node0 and node1
 * can occur within a single pageblock, but not a node0 node1 node0
 * interleaving within a single pageblock. It is therefore sufficient to check
 * the first and last page of a pageblock and avoid checking each individual
 * page in a pageblock.
 */
static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
				unsigned long end_pfn, struct zone *zone)
{
	struct page *start_page;
	struct page *end_page;

	/* end_pfn is one past the range we are checking */
	end_pfn--;

	if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
		return NULL;

	start_page = pfn_to_page(start_pfn);

	if (page_zone(start_page) != zone)
		return NULL;

	end_page = pfn_to_page(end_pfn);

	/* This gives a shorter code than deriving page_zone(end_page) */
	if (page_zone_id(start_page) != page_zone_id(end_page))
		return NULL;

	return start_page;
}

#ifdef CONFIG_COMPACTION

/* Do not skip compaction more than 64 times */
#define COMPACT_MAX_DEFER_SHIFT 6

/*
 * Compaction is deferred when compaction fails to result in a page
 * allocation success. 1 << compact_defer_shift compactions are skipped up
 * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
 */
void defer_compaction(struct zone *zone, int order)
{
	zone->compact_considered = 0;
	zone->compact_defer_shift++;

	if (order < zone->compact_order_failed)
		zone->compact_order_failed = order;

	if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
		zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;

	trace_mm_compaction_defer_compaction(zone, order);
}

/* Returns true if compaction should be skipped this time */
bool compaction_deferred(struct zone *zone, int order)
{
	unsigned long defer_limit = 1UL << zone->compact_defer_shift;

	if (order < zone->compact_order_failed)
		return false;

	/* Avoid possible overflow */
	if (++zone->compact_considered > defer_limit)
		zone->compact_considered = defer_limit;

	if (zone->compact_considered >= defer_limit)
		return false;

	trace_mm_compaction_deferred(zone, order);

	return true;
}

/*
 * Update defer tracking counters after successful compaction of given order,
 * which means an allocation either succeeded (alloc_success == true) or is
 * expected to succeed.
 */
void compaction_defer_reset(struct zone *zone, int order,
		bool alloc_success)
{
	if (alloc_success) {
		zone->compact_considered = 0;
		zone->compact_defer_shift = 0;
	}
	if (order >= zone->compact_order_failed)
		zone->compact_order_failed = order + 1;

	trace_mm_compaction_defer_reset(zone, order);
}

/* Returns true if restarting compaction after many failures */
bool compaction_restarting(struct zone *zone, int order)
{
	if (order < zone->compact_order_failed)
		return false;

	return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
		zone->compact_considered >= 1UL << zone->compact_defer_shift;
}
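
/*
 * Worked example of the backoff above: after a failure that leaves
 * compact_defer_shift at 2, defer_limit is 4, so the next three calls to
 * compaction_deferred() return true (compact_considered counts 1..3) and
 * the fourth attempt is allowed to run. Each further failure doubles the
 * limit, capped at 1 << COMPACT_MAX_DEFER_SHIFT, until
 * compaction_defer_reset() clears the state when an allocation of that
 * order succeeds (or is expected to).
 */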

/* Returns true if the pageblock should be scanned for pages to isolate. */
static inline bool isolation_suitable(struct compact_control *cc,
					struct page *page)
{
	if (cc->ignore_skip_hint)
		return true;

	return !get_pageblock_skip(page);
}

/*
 * This function is called to clear all cached information on pageblocks that
 * should be skipped for page isolation when the migrate and free page scanner
 * meet.
 */
static void __reset_isolation_suitable(struct zone *zone)
{
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	unsigned long pfn;

	zone->compact_cached_migrate_pfn[0] = start_pfn;
	zone->compact_cached_migrate_pfn[1] = start_pfn;
	zone->compact_cached_free_pfn = end_pfn;
	zone->compact_blockskip_flush = false;

	/* Walk the zone and mark every pageblock as suitable for isolation */
	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		struct page *page;

		cond_resched();

		if (!pfn_valid(pfn))
			continue;

		page = pfn_to_page(pfn);
		if (zone != page_zone(page))
			continue;

		clear_pageblock_skip(page);
	}
}

void reset_isolation_suitable(pg_data_t *pgdat)
{
	int zoneid;

	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
		struct zone *zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		/* Only flush if a full compaction finished recently */
		if (zone->compact_blockskip_flush)
			__reset_isolation_suitable(zone);
	}
}

/*
 * If no pages were isolated then mark this pageblock to be skipped in the
 * future. The information is later cleared by __reset_isolation_suitable().
 */
static void update_pageblock_skip(struct compact_control *cc,
			struct page *page, unsigned long nr_isolated,
			bool migrate_scanner)
{
	struct zone *zone = cc->zone;
	unsigned long pfn;

	if (cc->ignore_skip_hint)
		return;

	if (!page)
		return;

	if (nr_isolated)
		return;

	set_pageblock_skip(page);

	pfn = page_to_pfn(page);

	/* Update where async and sync compaction should restart */
	if (migrate_scanner) {
		if (pfn > zone->compact_cached_migrate_pfn[0])
			zone->compact_cached_migrate_pfn[0] = pfn;
		if (cc->mode != MIGRATE_ASYNC &&
		    pfn > zone->compact_cached_migrate_pfn[1])
			zone->compact_cached_migrate_pfn[1] = pfn;
	} else {
		if (pfn < zone->compact_cached_free_pfn)
			zone->compact_cached_free_pfn = pfn;
	}
}
#else
static inline bool isolation_suitable(struct compact_control *cc,
					struct page *page)
{
	return true;
}

static void update_pageblock_skip(struct compact_control *cc,
			struct page *page, unsigned long nr_isolated,
			bool migrate_scanner)
{
}
#endif /* CONFIG_COMPACTION */

/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. For async compaction, back out if the lock cannot
 * be taken immediately. For sync compaction, spin on the lock if needed.
 *
 * Returns true if the lock is held
 * Returns false if the lock is not held and compaction should abort
 */
static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
						struct compact_control *cc)
{
	if (cc->mode == MIGRATE_ASYNC) {
		if (!spin_trylock_irqsave(lock, *flags)) {
			cc->contended = COMPACT_CONTENDED_LOCK;
			return false;
		}
	} else {
		spin_lock_irqsave(lock, *flags);
	}

	return true;
}
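
/*
 * cc->contended distinguishes the two abort reasons: COMPACT_CONTENDED_LOCK
 * is set above when an async scanner fails to take a lock, while
 * COMPACT_CONTENDED_SCHED is set by compact_unlock_should_abort() and
 * compact_should_abort() below when need_resched() or a fatal signal forces
 * an abort. try_to_compact_pages() reports this back to the page allocator
 * through its *contended argument.
 */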

/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. The lock should be periodically unlocked to avoid
 * having disabled IRQs for a long time, even when there is nobody waiting on
 * the lock. It might also be that allowing the IRQs will result in
 * need_resched() becoming true. If scheduling is needed, async compaction
 * aborts. Sync compaction schedules.
 * Either compaction type will also abort if a fatal signal is pending.
 * In either case if the lock was locked, it is dropped and not regained.
 *
 * Returns true if compaction should abort due to fatal signal pending, or
 * async compaction due to need_resched()
 * Returns false when compaction can continue (sync compaction might have
 * scheduled)
 */
static bool compact_unlock_should_abort(spinlock_t *lock,
		unsigned long flags, bool *locked, struct compact_control *cc)
{
	if (*locked) {
		spin_unlock_irqrestore(lock, flags);
		*locked = false;
	}

	if (fatal_signal_pending(current)) {
		cc->contended = COMPACT_CONTENDED_SCHED;
		return true;
	}

	if (need_resched()) {
		if (cc->mode == MIGRATE_ASYNC) {
			cc->contended = COMPACT_CONTENDED_SCHED;
			return true;
		}
		cond_resched();
	}

	return false;
}

/*
 * Aside from avoiding lock contention, compaction also periodically checks
 * need_resched() and either schedules in sync compaction or aborts async
 * compaction. This is similar to what compact_unlock_should_abort() does, but
 * is used where no lock is concerned.
 *
 * Returns false when no scheduling was needed, or sync compaction scheduled.
 * Returns true when async compaction should abort.
 */
static inline bool compact_should_abort(struct compact_control *cc)
{
	/* async compaction aborts if contended */
	if (need_resched()) {
		if (cc->mode == MIGRATE_ASYNC) {
			cc->contended = COMPACT_CONTENDED_SCHED;
			return true;
		}

		cond_resched();
	}

	return false;
}

/*
 * Isolate free pages onto a private freelist. If @strict is true, will abort
 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
 * (even though it may still end up isolating some pages).
 */
static unsigned long isolate_freepages_block(struct compact_control *cc,
				unsigned long *start_pfn,
				unsigned long end_pfn,
				struct list_head *freelist,
				bool strict)
{
	int nr_scanned = 0, total_isolated = 0;
	struct page *cursor, *valid_page = NULL;
	unsigned long flags = 0;
	bool locked = false;
	unsigned long blockpfn = *start_pfn;

	cursor = pfn_to_page(blockpfn);

	/* Isolate free pages. */
	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
		int isolated, i;
		struct page *page = cursor;

		/*
		 * Periodically drop the lock (if held) regardless of its
		 * contention, to give chance to IRQs. Abort if fatal signal
		 * pending or async compaction detects need_resched()
		 */
		if (!(blockpfn % SWAP_CLUSTER_MAX)
		    && compact_unlock_should_abort(&cc->zone->lock, flags,
								&locked, cc))
			break;

		nr_scanned++;
		if (!pfn_valid_within(blockpfn))
			goto isolate_fail;

		if (!valid_page)
			valid_page = page;
		if (!PageBuddy(page))
			goto isolate_fail;

		/*
		 * If we already hold the lock, we can skip some rechecking.
		 * Note that if we hold the lock now, checked_pageblock was
		 * already set in some previous iteration (or strict is true),
		 * so it is correct to skip the suitable migration target
		 * recheck as well.
		 */
		if (!locked) {
			/*
			 * The zone lock must be held to isolate freepages.
			 * Unfortunately this is a very coarse lock and can be
			 * heavily contended if there are parallel allocations
			 * or parallel compactions. For async compaction do not
			 * spin on the lock and we acquire the lock as late as
			 * possible.
			 */
			locked = compact_trylock_irqsave(&cc->zone->lock,
								&flags, cc);
			if (!locked)
				break;

			/* Recheck this is a buddy page under lock */
			if (!PageBuddy(page))
				goto isolate_fail;
		}

		/* Found a free page, break it into order-0 pages */
		isolated = split_free_page(page);
		total_isolated += isolated;
		for (i = 0; i < isolated; i++) {
			list_add(&page->lru, freelist);
			page++;
		}

		/* If a page was split, advance to the end of it */
		if (isolated) {
			cc->nr_freepages += isolated;
			if (!strict &&
				cc->nr_migratepages <= cc->nr_freepages) {
				blockpfn += isolated;
				break;
			}

			blockpfn += isolated - 1;
			cursor += isolated - 1;
			continue;
		}

isolate_fail:
		if (strict)
			break;
		else
			continue;

	}

	trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
					nr_scanned, total_isolated);

	/* Record how far we have got within the block */
	*start_pfn = blockpfn;

	/*
	 * If strict isolation is requested by CMA then check that all the
	 * pages requested were isolated. If there were any failures, 0 is
	 * returned and CMA will fail.
	 */
	if (strict && blockpfn < end_pfn)
		total_isolated = 0;

	if (locked)
		spin_unlock_irqrestore(&cc->zone->lock, flags);

	/* Update the pageblock-skip if the whole pageblock was scanned */
	if (blockpfn == end_pfn)
		update_pageblock_skip(cc, valid_page, total_isolated, false);

	count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
	if (total_isolated)
		count_compact_events(COMPACTISOLATED, total_isolated);
	return total_isolated;
}

/**
 * isolate_freepages_range() - isolate free pages.
 * @cc:        Compaction control structure.
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn:   The one-past-last PFN.
 *
 * Non-free pages, invalid PFNs, or zone boundaries within the
 * [start_pfn, end_pfn) range are considered errors, cause function to
 * undo its actions and return zero.
 *
 * Otherwise, function returns one-past-the-last PFN of isolated page
 * (which may be greater than end_pfn if end fell in the middle of
 * a free page).
 */
unsigned long
isolate_freepages_range(struct compact_control *cc,
			unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long isolated, pfn, block_end_pfn;
	LIST_HEAD(freelist);

	pfn = start_pfn;
	block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);

	for (; pfn < end_pfn; pfn += isolated,
				block_end_pfn += pageblock_nr_pages) {
		/* Protect pfn from changing by isolate_freepages_block */
		unsigned long isolate_start_pfn = pfn;

		block_end_pfn = min(block_end_pfn, end_pfn);

		/*
		 * pfn could pass the block_end_pfn if isolated freepage
		 * is more than pageblock order. In this case, we adjust
		 * the scanning range to the right one.
		 */
		if (pfn >= block_end_pfn) {
			block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
			block_end_pfn = min(block_end_pfn, end_pfn);
		}

		if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
			break;

		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
						block_end_pfn, &freelist, true);

		/*
		 * In strict mode, isolate_freepages_block() returns 0 if
		 * there are any holes in the block (ie. invalid PFNs or
		 * non-free pages).
		 */
		if (!isolated)
			break;

		/*
		 * If we managed to isolate pages, it is always (1 << n) *
		 * pageblock_nr_pages for some non-negative n. (Max order
		 * page may span two pageblocks).
		 */
	}

	/* split_free_page does not map the pages */
	map_pages(&freelist);

	if (pfn < end_pfn) {
		/* Loop terminated early, cleanup. */
		release_freepages(&freelist);
		return 0;
	}

	/* We don't use freelists for anything. */
	return pfn;
}

/* Update the number of anon and file isolated pages in the zone */
static void acct_isolated(struct zone *zone, struct compact_control *cc)
{
	struct page *page;
	unsigned int count[2] = { 0, };

	if (list_empty(&cc->migratepages))
		return;

	list_for_each_entry(page, &cc->migratepages, lru)
		count[!!page_is_file_cache(page)]++;

	mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
	mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
}

/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(struct zone *zone)
{
	unsigned long active, inactive, isolated;

	inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
					zone_page_state(zone, NR_INACTIVE_ANON);
	active = zone_page_state(zone, NR_ACTIVE_FILE) +
					zone_page_state(zone, NR_ACTIVE_ANON);
	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
					zone_page_state(zone, NR_ISOLATED_ANON);

	return isolated > (inactive + active) / 2;
}
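
/*
 * Example of the check above: with 600 inactive and 200 active LRU pages
 * in the zone, too_many_isolated() starts returning true once more than
 * 400 pages sit on the NR_ISOLATED counters, which makes the migrate
 * scanner below wait (or abort, for async compaction) until parallel
 * reclaim or compaction puts pages back on the LRU lists.
 */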

/**
 * isolate_migratepages_block() - isolate all migrate-able pages within
 *				  a single pageblock
 * @cc:		Compaction control structure.
 * @low_pfn:	The first PFN to isolate
 * @end_pfn:	The one-past-the-last PFN to isolate, within the same pageblock
 * @isolate_mode: Isolation mode to be used.
 *
 * Isolate all pages that can be migrated from the range specified by
 * [low_pfn, end_pfn). The range is expected to be within the same pageblock.
 * Returns zero if there is a fatal signal pending, otherwise PFN of the
 * first page that was not scanned (which may be less, equal to or more
 * than end_pfn).
 *
 * The pages are isolated on cc->migratepages list (not required to be empty),
 * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
 * is neither read nor updated.
 */
static unsigned long
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
			unsigned long end_pfn, isolate_mode_t isolate_mode)
{
	struct zone *zone = cc->zone;
	unsigned long nr_scanned = 0, nr_isolated = 0;
	struct list_head *migratelist = &cc->migratepages;
	struct lruvec *lruvec;
	unsigned long flags = 0;
	bool locked = false;
	struct page *page = NULL, *valid_page = NULL;
	unsigned long start_pfn = low_pfn;

	/*
	 * Ensure that there are not too many pages isolated from the LRU
	 * list by either parallel reclaimers or compaction. If there are,
	 * delay for some time until fewer pages are isolated
	 */
	while (unlikely(too_many_isolated(zone))) {
		/* async migration should just abort */
		if (cc->mode == MIGRATE_ASYNC)
			return 0;

		congestion_wait(BLK_RW_ASYNC, HZ/10);

		if (fatal_signal_pending(current))
			return 0;
	}

	if (compact_should_abort(cc))
		return 0;

	/* Time to isolate some pages for migration */
	for (; low_pfn < end_pfn; low_pfn++) {
		/*
		 * Periodically drop the lock (if held) regardless of its
		 * contention, to give chance to IRQs. Abort async compaction
		 * if contended.
		 */
		if (!(low_pfn % SWAP_CLUSTER_MAX)
		    && compact_unlock_should_abort(&zone->lru_lock, flags,
								&locked, cc))
			break;

		if (!pfn_valid_within(low_pfn))
			continue;
		nr_scanned++;

		page = pfn_to_page(low_pfn);

		if (!valid_page)
			valid_page = page;

		/*
		 * Skip if free. We read page order here without zone lock
		 * which is generally unsafe, but the race window is small and
		 * the worst thing that can happen is that we skip some
		 * potential isolation targets.
		 */
		if (PageBuddy(page)) {
			unsigned long freepage_order = page_order_unsafe(page);

			/*
			 * Without lock, we cannot be sure that what we got is
			 * a valid page order. Consider only values in the
			 * valid order range to prevent low_pfn overflow.
			 */
			if (freepage_order > 0 && freepage_order < MAX_ORDER)
				low_pfn += (1UL << freepage_order) - 1;
			continue;
		}

		/*
		 * Check may be lockless but that's ok as we recheck later.
		 * It's possible to migrate LRU pages and balloon pages;
		 * skip any other type of page.
		 */
		if (!PageLRU(page)) {
			if (unlikely(balloon_page_movable(page))) {
				if (balloon_page_isolate(page)) {
					/* Successfully isolated */
					goto isolate_success;
				}
			}
			continue;
		}

		/*
		 * PageLRU is set. lru_lock normally excludes isolation
		 * splitting and collapsing (collapsing has already happened
		 * if PageLRU is set) but the lock is not necessarily taken
		 * here and it is wasteful to take it just to check transhuge.
		 * Check TransHuge without lock and skip the whole pageblock if
		 * it's either a transhuge or hugetlbfs page, as calling
		 * compound_order() without preventing THP from splitting the
		 * page underneath us may return surprising results.
		 */
		if (PageTransHuge(page)) {
			if (!locked)
				low_pfn = ALIGN(low_pfn + 1,
						pageblock_nr_pages) - 1;
			else
				low_pfn += (1 << compound_order(page)) - 1;

			continue;
		}

		/*
		 * Migration will fail if an anonymous page is pinned in memory,
		 * so avoid taking lru_lock and isolating it unnecessarily in an
		 * admittedly racy check.
		 */
		if (!page_mapping(page) &&
				page_count(page) > page_mapcount(page))
			continue;

		/* If we already hold the lock, we can skip some rechecking */
		if (!locked) {
			locked = compact_trylock_irqsave(&zone->lru_lock,
								&flags, cc);
			if (!locked)
				break;

			/* Recheck PageLRU and PageTransHuge under lock */
			if (!PageLRU(page))
				continue;
			if (PageTransHuge(page)) {
				low_pfn += (1 << compound_order(page)) - 1;
				continue;
			}
		}

		lruvec = mem_cgroup_page_lruvec(page, zone);

		/* Try to isolate the page */
		if (__isolate_lru_page(page, isolate_mode) != 0)
			continue;

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		/* Successfully isolated */
		del_page_from_lru_list(page, lruvec, page_lru(page));

isolate_success:
		list_add(&page->lru, migratelist);
		cc->nr_migratepages++;
		nr_isolated++;

		/* Avoid isolating too much */
		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
			++low_pfn;
			break;
		}
	}

	/*
	 * The PageBuddy() check could have potentially brought us outside
	 * the range to be scanned.
	 */
	if (unlikely(low_pfn > end_pfn))
		low_pfn = end_pfn;

	if (locked)
		spin_unlock_irqrestore(&zone->lru_lock, flags);

	/*
	 * Update the pageblock-skip information and cached scanner pfn,
	 * if the whole pageblock was scanned without isolating any page.
	 */
	if (low_pfn == end_pfn)
		update_pageblock_skip(cc, valid_page, nr_isolated, true);

	trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
						nr_scanned, nr_isolated);

	count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
	if (nr_isolated)
		count_compact_events(COMPACTISOLATED, nr_isolated);

	return low_pfn;
}

/**
 * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
 * @cc:        Compaction control structure.
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn:   The one-past-last PFN.
 *
 * Returns zero if isolation fails fatally due to e.g. pending signal.
 * Otherwise, function returns one-past-the-last PFN of isolated page
 * (which may be greater than end_pfn if end fell in the middle of a THP page).
 */
unsigned long
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
							unsigned long end_pfn)
{
	unsigned long pfn, block_end_pfn;

	/* Scan block by block. First and last block may be incomplete */
	pfn = start_pfn;
	block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);

	for (; pfn < end_pfn; pfn = block_end_pfn,
				block_end_pfn += pageblock_nr_pages) {

		block_end_pfn = min(block_end_pfn, end_pfn);

		if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
			continue;

		pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
							ISOLATE_UNEVICTABLE);

		/*
		 * In case of fatal failure, release everything that might
		 * have been isolated in the previous iteration, and signal
		 * the failure back to caller.
		 */
		if (!pfn) {
			putback_movable_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
			break;
		}

		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
			break;
	}
	acct_isolated(cc->zone, cc);

	return pfn;
}

#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION

/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct page *page)
{
	/* If the page is a large free page, then disallow migration */
	if (PageBuddy(page)) {
		/*
		 * We are checking page_order without zone->lock taken. But
		 * the only small danger is that we skip a potentially suitable
		 * pageblock, so it's not worth checking order for a valid range.
		 */
		if (page_order_unsafe(page) >= pageblock_order)
			return false;
	}

	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
	if (migrate_async_suitable(get_pageblock_migratetype(page)))
		return true;

	/* Otherwise skip the block */
	return false;
}

/*
 * Based on information in the current compact_control, find blocks
 * suitable for isolating free pages from and then isolate them.
 */
static void isolate_freepages(struct compact_control *cc)
{
	struct zone *zone = cc->zone;
	struct page *page;
	unsigned long block_start_pfn;	/* start of current pageblock */
	unsigned long isolate_start_pfn; /* exact pfn we start at */
	unsigned long block_end_pfn;	/* end of current pageblock */
	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
	struct list_head *freelist = &cc->freepages;

	/*
	 * Initialise the free scanner. The starting point is where we last
	 * successfully isolated from, zone-cached value, or the end of the
	 * zone when isolating for the first time. For looping we also need
	 * this pfn aligned down to the pageblock boundary, because we do
	 * block_start_pfn -= pageblock_nr_pages in the for loop.
	 * For ending point, take care when isolating in the last pageblock
	 * of a zone which ends in the middle of a pageblock.
	 * The low boundary is the end of the pageblock the migration scanner
	 * is using.
	 */
	isolate_start_pfn = cc->free_pfn;
	block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
	block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
						zone_end_pfn(zone));
	low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);

	/*
	 * Isolate free pages until enough are available to migrate the
	 * pages on cc->migratepages. We stop searching if the migrate
	 * and free page scanners meet or enough free pages are isolated.
	 */
	for (; block_start_pfn >= low_pfn &&
			cc->nr_migratepages > cc->nr_freepages;
				block_end_pfn = block_start_pfn,
				block_start_pfn -= pageblock_nr_pages,
				isolate_start_pfn = block_start_pfn) {

		/*
		 * This can iterate a massively long zone without finding any
		 * suitable migration targets, so periodically check if we need
		 * to schedule, or even abort async compaction.
		 */
		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
						&& compact_should_abort(cc))
			break;

		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
									zone);
		if (!page)
			continue;

		/* Check the block is suitable for migration */
		if (!suitable_migration_target(page))
			continue;

		/* If isolation recently failed, do not retry */
		if (!isolation_suitable(cc, page))
			continue;

		/* Found a block suitable for isolating free pages from. */
		isolate_freepages_block(cc, &isolate_start_pfn,
					block_end_pfn, freelist, false);

		/*
		 * Remember where the free scanner should restart next time,
		 * which is where isolate_freepages_block() left off.
		 * But if it scanned the whole pageblock, isolate_start_pfn
		 * now points at block_end_pfn, which is the start of the next
		 * pageblock.
		 * In that case we will however want to restart at the start
		 * of the previous pageblock.
		 */
		cc->free_pfn = (isolate_start_pfn < block_end_pfn) ?
				isolate_start_pfn :
				block_start_pfn - pageblock_nr_pages;

		/*
		 * isolate_freepages_block() might have aborted due to async
		 * compaction being contended
		 */
		if (cc->contended)
			break;
	}

	/* split_free_page does not map the pages */
	map_pages(freelist);

	/*
	 * If we crossed the migrate scanner, we want to keep it that way
	 * so that compact_finished() may detect this
	 */
	if (block_start_pfn < low_pfn)
		cc->free_pfn = cc->migrate_pfn;
}

/*
 * This is a migrate-callback that "allocates" freepages by taking pages
 * from the isolated freelists in the block we are migrating to.
 */
static struct page *compaction_alloc(struct page *migratepage,
					unsigned long data,
					int **result)
{
	struct compact_control *cc = (struct compact_control *)data;
	struct page *freepage;

	/*
	 * Isolate free pages if necessary, and if we are not aborting due to
	 * contention.
	 */
	if (list_empty(&cc->freepages)) {
		if (!cc->contended)
			isolate_freepages(cc);

		if (list_empty(&cc->freepages))
			return NULL;
	}

	freepage = list_entry(cc->freepages.next, struct page, lru);
	list_del(&freepage->lru);
	cc->nr_freepages--;

	return freepage;
}

/*
 * This is a migrate-callback that "frees" freepages back to the isolated
 * freelist. All pages on the freelist are from the same zone, so there is no
 * special handling needed for NUMA.
 */
static void compaction_free(struct page *page, unsigned long data)
{
	struct compact_control *cc = (struct compact_control *)data;

	list_add(&page->lru, &cc->freepages);
	cc->nr_freepages++;
}
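
/*
 * compaction_alloc() and compaction_free() above are handed to
 * migrate_pages() by compact_zone(): the former supplies migration targets
 * from cc->freepages (topping the list up via isolate_freepages() when it
 * runs dry), the latter puts a target page back on the list when migration
 * of its source page fails.
 */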

/* possible outcome of isolate_migratepages */
typedef enum {
	ISOLATE_ABORT,		/* Abort compaction now */
	ISOLATE_NONE,		/* No pages isolated, continue scanning */
	ISOLATE_SUCCESS,	/* Pages isolated, migrate */
} isolate_migrate_t;

/*
 * Allow userspace to control policy on scanning the unevictable LRU for
 * compactable pages.
 */
int sysctl_compact_unevictable_allowed __read_mostly = 1;

/*
 * Isolate all pages that can be migrated from the first suitable block,
 * starting at the block pointed to by the migrate scanner pfn within
 * compact_control.
 */
static isolate_migrate_t isolate_migratepages(struct zone *zone,
					struct compact_control *cc)
{
	unsigned long low_pfn, end_pfn;
	struct page *page;
	const isolate_mode_t isolate_mode =
		(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
		(cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);

	/*
	 * Start at where we last stopped, or beginning of the zone as
	 * initialized by compact_zone()
	 */
	low_pfn = cc->migrate_pfn;

	/* Only scan within a pageblock boundary */
	end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);

	/*
	 * Iterate over whole pageblocks until we find the first suitable.
	 * Do not cross the free scanner.
	 */
	for (; end_pfn <= cc->free_pfn;
			low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {

		/*
		 * This can potentially iterate a massively long zone with
		 * many pageblocks unsuitable, so periodically check if we
		 * need to schedule, or even abort async compaction.
		 */
		if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
						&& compact_should_abort(cc))
			break;

		page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
		if (!page)
			continue;

		/* If isolation recently failed, do not retry */
		if (!isolation_suitable(cc, page))
			continue;

		/*
		 * For async compaction, also only scan in MOVABLE blocks.
		 * Async compaction is optimistic to see if the minimum amount
		 * of work satisfies the allocation.
		 */
		if (cc->mode == MIGRATE_ASYNC &&
		    !migrate_async_suitable(get_pageblock_migratetype(page)))
			continue;

		/* Perform the isolation */
		low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
								isolate_mode);

		if (!low_pfn || cc->contended) {
			acct_isolated(zone, cc);
			return ISOLATE_ABORT;
		}

		/*
		 * Either we isolated something and proceed with migration. Or
		 * we failed and compact_zone should decide if we should
		 * continue or not.
		 */
		break;
	}

	acct_isolated(zone, cc);
	/*
	 * Record where migration scanner will be restarted. If we end up in
	 * the same pageblock as the free scanner, make the scanners fully
	 * meet so that compact_finished() terminates compaction.
	 */
	cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;

	return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}

static int __compact_finished(struct zone *zone, struct compact_control *cc,
			    const int migratetype)
{
	unsigned int order;
	unsigned long watermark;

	if (cc->contended || fatal_signal_pending(current))
		return COMPACT_PARTIAL;

	/* Compaction run completes if the migrate and free scanner meet */
	if (cc->free_pfn <= cc->migrate_pfn) {
		/* Let the next compaction start anew. */
		zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
		zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
		zone->compact_cached_free_pfn = zone_end_pfn(zone);

		/*
		 * Mark that the PG_migrate_skip information should be cleared
		 * by kswapd when it goes to sleep. kswapd does not set the
		 * flag itself as the decision to clear it should be based
		 * directly on an allocation request.
		 */
		if (!current_is_kswapd())
			zone->compact_blockskip_flush = true;

		return COMPACT_COMPLETE;
	}

	/*
	 * order == -1 is expected when compacting via
	 * /proc/sys/vm/compact_memory
	 */
	if (cc->order == -1)
		return COMPACT_CONTINUE;

	/* Compaction run is not finished if the watermark is not met */
	watermark = low_wmark_pages(zone);

	if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
							cc->alloc_flags))
		return COMPACT_CONTINUE;

	/* Direct compactor: Is a suitable page free? */
	for (order = cc->order; order < MAX_ORDER; order++) {
		struct free_area *area = &zone->free_area[order];
		bool can_steal;

		/* Job done if page is free of the right migratetype */
		if (!list_empty(&area->free_list[migratetype]))
			return COMPACT_PARTIAL;

#ifdef CONFIG_CMA
		/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
		if (migratetype == MIGRATE_MOVABLE &&
			!list_empty(&area->free_list[MIGRATE_CMA]))
			return COMPACT_PARTIAL;
#endif
		/*
		 * Job done if allocation would steal freepages from
		 * other migratetype buddy lists.
		 */
		if (find_suitable_fallback(area, order, migratetype,
						true, &can_steal) != -1)
			return COMPACT_PARTIAL;
	}

	return COMPACT_NO_SUITABLE_PAGE;
}

static int compact_finished(struct zone *zone, struct compact_control *cc,
			    const int migratetype)
{
	int ret;

	ret = __compact_finished(zone, cc, migratetype);
	trace_mm_compaction_finished(zone, cc->order, ret);
	if (ret == COMPACT_NO_SUITABLE_PAGE)
		ret = COMPACT_CONTINUE;

	return ret;
}

/*
 * compaction_suitable: Is this suitable to run compaction on this zone now?
 * Returns
 *   COMPACT_SKIPPED  - If there are too few free pages for compaction
 *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
 *   COMPACT_CONTINUE - If compaction should run now
 */
static unsigned long __compaction_suitable(struct zone *zone, int order,
					int alloc_flags, int classzone_idx)
{
	int fragindex;
	unsigned long watermark;

	/*
	 * order == -1 is expected when compacting via
	 * /proc/sys/vm/compact_memory
	 */
	if (order == -1)
		return COMPACT_CONTINUE;

	watermark = low_wmark_pages(zone);
	/*
	 * If watermarks for high-order allocation are already met, there
	 * should be no need for compaction at all.
	 */
	if (zone_watermark_ok(zone, order, watermark, classzone_idx,
								alloc_flags))
		return COMPACT_PARTIAL;

	/*
	 * Watermarks for order-0 must be met for compaction. Note the 2UL.
	 * This is because during migration, copies of pages need to be
	 * allocated and for a short time, the footprint is higher
	 */
	watermark += (2UL << order);
	if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags))
		return COMPACT_SKIPPED;

	/*
	 * fragmentation index determines if allocation failures are due to
	 * low memory or external fragmentation
	 *
	 * index of -1000 would imply allocations might succeed depending on
	 * watermarks, but we already failed the high-order watermark check
	 * index towards 0 implies failure is due to lack of memory
	 * index towards 1000 implies failure is due to fragmentation
	 *
	 * Only compact if a failure would be due to fragmentation.
	 */
	fragindex = fragmentation_index(zone, order);
	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
		return COMPACT_NOT_SUITABLE_ZONE;

	return COMPACT_CONTINUE;
}
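
/*
 * With the default sysctl_extfrag_threshold of 500 (defined below), the
 * check above skips compaction whenever the fragmentation index falls
 * between 0 and 500, i.e. when the high-order failure looks more like a
 * genuine shortage of free memory than like external fragmentation that
 * compaction could repair.
 */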

unsigned long compaction_suitable(struct zone *zone, int order,
					int alloc_flags, int classzone_idx)
{
	unsigned long ret;

	ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx);
	trace_mm_compaction_suitable(zone, order, ret);
	if (ret == COMPACT_NOT_SUITABLE_ZONE)
		ret = COMPACT_SKIPPED;

	return ret;
}

static int compact_zone(struct zone *zone, struct compact_control *cc)
{
	int ret;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
	const bool sync = cc->mode != MIGRATE_ASYNC;
	unsigned long last_migrated_pfn = 0;

	ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
							cc->classzone_idx);
	switch (ret) {
	case COMPACT_PARTIAL:
	case COMPACT_SKIPPED:
		/* Compaction is likely to fail */
		return ret;
	case COMPACT_CONTINUE:
		/* Fall through to compaction */
		;
	}

	/*
	 * Clear pageblock skip if there were failures recently and compaction
	 * is about to be retried after being deferred. kswapd does not do
	 * this reset as it'll reset the cached information when going to sleep.
	 */
	if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
		__reset_isolation_suitable(zone);

	/*
	 * Setup to move all movable pages to the end of the zone. Use cached
	 * information on where the scanners should start but check that it
	 * is initialised by ensuring the values are within zone boundaries.
	 */
	cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
	cc->free_pfn = zone->compact_cached_free_pfn;
	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
		zone->compact_cached_free_pfn = cc->free_pfn;
	}
	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
		cc->migrate_pfn = start_pfn;
		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
	}

	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
				cc->free_pfn, end_pfn, sync);

	migrate_prep_local();

	while ((ret = compact_finished(zone, cc, migratetype)) ==
						COMPACT_CONTINUE) {
		int err;
		unsigned long isolate_start_pfn = cc->migrate_pfn;

		switch (isolate_migratepages(zone, cc)) {
		case ISOLATE_ABORT:
			ret = COMPACT_PARTIAL;
			putback_movable_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
			goto out;
		case ISOLATE_NONE:
			/*
			 * We haven't isolated and migrated anything, but
			 * there might still be unflushed migrations from
			 * previous cc->order aligned block.
			 */
			goto check_drain;
		case ISOLATE_SUCCESS:
			;
		}

		err = migrate_pages(&cc->migratepages, compaction_alloc,
				compaction_free, (unsigned long)cc, cc->mode,
				MR_COMPACTION);

		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
							&cc->migratepages);

		/* All pages were either migrated or will be released */
		cc->nr_migratepages = 0;
		if (err) {
			putback_movable_pages(&cc->migratepages);
			/*
			 * migrate_pages() may return -ENOMEM when scanners meet
			 * and we want compact_finished() to detect it
			 */
			if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
				ret = COMPACT_PARTIAL;
				goto out;
			}
		}

		/*
		 * Record where we could have freed pages by migration and not
		 * yet flushed them to buddy allocator. We use the pfn that
		 * isolate_migratepages() started from in this loop iteration
		 * - this is the lowest page that could have been isolated and
		 * then freed by migration.
		 */
		if (!last_migrated_pfn)
			last_migrated_pfn = isolate_start_pfn;

check_drain:
		/*
		 * Has the migration scanner moved away from the previous
		 * cc->order aligned block where we migrated from? If yes,
		 * flush the pages that were freed, so that they can merge and
		 * compact_finished() can detect immediately if allocation
		 * would succeed.
		 */
		if (cc->order > 0 && last_migrated_pfn) {
			int cpu;
			unsigned long current_block_start =
				cc->migrate_pfn & ~((1UL << cc->order) - 1);

			if (last_migrated_pfn < current_block_start) {
				cpu = get_cpu();
				lru_add_drain_cpu(cpu);
				drain_local_pages(zone);
				put_cpu();
				/* No more flushing until we migrate again */
				last_migrated_pfn = 0;
			}
		}

	}

out:
	/*
	 * Release free pages and update where the free scanner should restart,
	 * so we don't leave any returned pages behind in the next attempt.
	 */
	if (cc->nr_freepages > 0) {
		unsigned long free_pfn = release_freepages(&cc->freepages);

		cc->nr_freepages = 0;
		VM_BUG_ON(free_pfn == 0);
		/* The cached pfn is always the first in a pageblock */
		free_pfn &= ~(pageblock_nr_pages-1);
		/*
		 * Only go back, not forward. The cached pfn might have been
		 * already reset to zone end in compact_finished()
		 */
		if (free_pfn > zone->compact_cached_free_pfn)
			zone->compact_cached_free_pfn = free_pfn;
	}

	trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
				cc->free_pfn, end_pfn, sync, ret);

	return ret;
}
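
/*
 * Run compaction on a single zone on behalf of try_to_compact_pages():
 * build a compact_control on the stack for the requested order and mode,
 * call compact_zone() and report the contention status back through
 * *contended so the page allocator can tell a lock-contended or
 * need_resched() abort apart from an ordinary completion.
 */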

static unsigned long compact_zone_order(struct zone *zone, int order,
		gfp_t gfp_mask, enum migrate_mode mode, int *contended,
		int alloc_flags, int classzone_idx)
{
	unsigned long ret;
	struct compact_control cc = {
		.nr_freepages = 0,
		.nr_migratepages = 0,
		.order = order,
		.gfp_mask = gfp_mask,
		.zone = zone,
		.mode = mode,
		.alloc_flags = alloc_flags,
		.classzone_idx = classzone_idx,
	};
	INIT_LIST_HEAD(&cc.freepages);
	INIT_LIST_HEAD(&cc.migratepages);

	ret = compact_zone(zone, &cc);

	VM_BUG_ON(!list_empty(&cc.freepages));
	VM_BUG_ON(!list_empty(&cc.migratepages));

	*contended = cc.contended;
	return ret;
}

int sysctl_extfrag_threshold = 500;

/**
 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
 * @gfp_mask: The GFP mask of the current allocation
 * @order: The order of the current allocation
 * @alloc_flags: The allocation flags of the current allocation
 * @ac: The context of current allocation
 * @mode: The migration mode for async, sync light, or sync migration
 * @contended: Return value that determines if compaction was aborted due to
 *	       need_resched() or lock contention
 *
 * This is the main entry point for direct page compaction.
 */
unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
			int alloc_flags, const struct alloc_context *ac,
			enum migrate_mode mode, int *contended)
{
	int may_enter_fs = gfp_mask & __GFP_FS;
	int may_perform_io = gfp_mask & __GFP_IO;
	struct zoneref *z;
	struct zone *zone;
	int rc = COMPACT_DEFERRED;
	int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */

	*contended = COMPACT_CONTENDED_NONE;

	/* Check if the GFP flags allow compaction */
	if (!order || !may_enter_fs || !may_perform_io)
		return COMPACT_SKIPPED;

	trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode);

	/* Compact each zone in the list */
	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
								ac->nodemask) {
		int status;
		int zone_contended;

		if (compaction_deferred(zone, order))
			continue;

		status = compact_zone_order(zone, order, gfp_mask, mode,
				&zone_contended, alloc_flags,
				ac->classzone_idx);
		rc = max(status, rc);
		/*
		 * It takes at least one zone that wasn't lock contended
		 * to clear all_zones_contended.
		 */
		all_zones_contended &= zone_contended;

		/* If a normal allocation would succeed, stop compacting */
		if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
					ac->classzone_idx, alloc_flags)) {
			/*
			 * We think the allocation will succeed in this zone,
			 * but it is not certain, hence the false. The caller
			 * will repeat this with true if allocation indeed
			 * succeeds in this zone.
			 */
			compaction_defer_reset(zone, order, false);
			/*
			 * It is possible that async compaction aborted due to
			 * need_resched() and the watermarks were ok thanks to
			 * somebody else freeing memory. The allocation can
			 * however still fail so we better signal the
			 * need_resched() contention anyway (this will not
			 * prevent the allocation attempt).
			 */
			if (zone_contended == COMPACT_CONTENDED_SCHED)
				*contended = COMPACT_CONTENDED_SCHED;

			goto break_loop;
		}

		if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) {
			/*
			 * We think that allocation won't succeed in this zone
			 * so we defer compaction there. If it ends up
			 * succeeding after all, it will be reset.
			 */
			defer_compaction(zone, order);
		}

		/*
		 * We might have stopped compacting due to need_resched() in
		 * async compaction, or due to a fatal signal detected. In that
		 * case do not try further zones and signal need_resched()
		 * contention.
		 */
		if ((zone_contended == COMPACT_CONTENDED_SCHED)
					|| fatal_signal_pending(current)) {
			*contended = COMPACT_CONTENDED_SCHED;
			goto break_loop;
		}

		continue;
break_loop:
		/*
		 * We might not have tried all the zones, so be conservative
		 * and assume they are not all lock contended.
		 */
		all_zones_contended = 0;
		break;
	}

	/*
	 * If at least one zone wasn't deferred or skipped, we report if all
	 * zones that were tried were lock contended.
	 */
	if (rc > COMPACT_SKIPPED && all_zones_contended)
		*contended = COMPACT_CONTENDED_LOCK;

	return rc;
}


/* Compact all zones within a node */
static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
{
	int zoneid;
	struct zone *zone;

	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {

		zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		cc->nr_freepages = 0;
		cc->nr_migratepages = 0;
		cc->zone = zone;
		INIT_LIST_HEAD(&cc->freepages);
		INIT_LIST_HEAD(&cc->migratepages);

		/*
		 * When called via /proc/sys/vm/compact_memory
		 * this makes sure we compact the whole zone regardless of
		 * cached scanner positions.
		 */
		if (cc->order == -1)
			__reset_isolation_suitable(zone);

		if (cc->order == -1 || !compaction_deferred(zone, cc->order))
			compact_zone(zone, cc);

		if (cc->order > 0) {
			if (zone_watermark_ok(zone, cc->order,
						low_wmark_pages(zone), 0, 0))
				compaction_defer_reset(zone, cc->order, false);
		}

		VM_BUG_ON(!list_empty(&cc->freepages));
		VM_BUG_ON(!list_empty(&cc->migratepages));
	}
}

void compact_pgdat(pg_data_t *pgdat, int order)
{
	struct compact_control cc = {
		.order = order,
		.mode = MIGRATE_ASYNC,
	};

	if (!order)
		return;

	__compact_pgdat(pgdat, &cc);
}

static void compact_node(int nid)
{
	struct compact_control cc = {
		.order = -1,
		.mode = MIGRATE_SYNC,
		.ignore_skip_hint = true,
	};

	__compact_pgdat(NODE_DATA(nid), &cc);
}

/* Compact all nodes in the system */
static void compact_nodes(void)
{
	int nid;

	/* Flush pending updates to the LRU lists */
	lru_add_drain_all();

	for_each_online_node(nid)
		compact_node(nid);
}

/* The written value is actually unused, all memory is compacted */
int sysctl_compact_memory;

/* This is the entry point for compacting all nodes via /proc/sys/vm */
int sysctl_compaction_handler(struct ctl_table *table, int write,
			void __user *buffer, size_t *length, loff_t *ppos)
{
	if (write)
		compact_nodes();

	return 0;
}

int sysctl_extfrag_handler(struct ctl_table *table, int write,
			void __user *buffer, size_t *length, loff_t *ppos)
{
	proc_dointvec_minmax(table, write, buffer, length, ppos);

	return 0;
}

#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
static ssize_t sysfs_compact_node(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int nid = dev->id;

	if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
		/* Flush pending updates to the LRU lists */
		lru_add_drain_all();

		compact_node(nid);
	}

	return count;
}
static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);

int compaction_register_node(struct node *node)
{
	return device_create_file(&node->dev, &dev_attr_compact);
}

void compaction_unregister_node(struct node *node)
{
	return device_remove_file(&node->dev, &dev_attr_compact);
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */

#endif /* CONFIG_COMPACTION */