/*
 * linux/mm/compaction.c
 *
 * Memory compaction for the reduction of external fragmentation. Note that
 * this heavily depends upon page migration to do all the real heavy
 * lifting
 *
 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
 */
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
#include <linux/balloon_compaction.h>
#include <linux/page-isolation.h>
#include "internal.h"

#ifdef CONFIG_COMPACTION
static inline void count_compact_event(enum vm_event_item item)
{
	count_vm_event(item);
}

static inline void count_compact_events(enum vm_event_item item, long delta)
{
	count_vm_events(item, delta);
}
#else
#define count_compact_event(item) do { } while (0)
#define count_compact_events(item, delta) do { } while (0)
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA

#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>

static unsigned long release_freepages(struct list_head *freelist)
{
	struct page *page, *next;
	unsigned long count = 0;

	list_for_each_entry_safe(page, next, freelist, lru) {
		list_del(&page->lru);
		__free_page(page);
		count++;
	}

	return count;
}

static void map_pages(struct list_head *list)
{
	struct page *page;

	list_for_each_entry(page, list, lru) {
		arch_alloc_page(page, 0);
		kernel_map_pages(page, 1, 1);
	}
}

static inline bool migrate_async_suitable(int migratetype)
{
	return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
}

#ifdef CONFIG_COMPACTION
/* Returns true if the pageblock should be scanned for pages to isolate. */
static inline bool isolation_suitable(struct compact_control *cc,
					struct page *page)
{
	if (cc->ignore_skip_hint)
		return true;

	return !get_pageblock_skip(page);
}

/*
 * This function is called to clear all cached information on pageblocks that
 * should be skipped for page isolation when the migrate and free page scanner
 * meet.
 */
static void __reset_isolation_suitable(struct zone *zone)
{
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	unsigned long pfn;

	zone->compact_cached_migrate_pfn[0] = start_pfn;
	zone->compact_cached_migrate_pfn[1] = start_pfn;
	zone->compact_cached_free_pfn = end_pfn;
	zone->compact_blockskip_flush = false;

	/* Walk the zone and mark every pageblock as suitable for isolation */
	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		struct page *page;

		cond_resched();

		if (!pfn_valid(pfn))
			continue;

		page = pfn_to_page(pfn);
		if (zone != page_zone(page))
			continue;

		clear_pageblock_skip(page);
	}
}

void reset_isolation_suitable(pg_data_t *pgdat)
{
	int zoneid;

	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
		struct zone *zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		/* Only flush if a full compaction finished recently */
		if (zone->compact_blockskip_flush)
			__reset_isolation_suitable(zone);
	}
}

/*
 * If no pages were isolated then mark this pageblock to be skipped in the
 * future. The information is later cleared by __reset_isolation_suitable().
 */
static void update_pageblock_skip(struct compact_control *cc,
			struct page *page, unsigned long nr_isolated,
			bool set_unsuitable, bool migrate_scanner)
{
	struct zone *zone = cc->zone;
	unsigned long pfn;

	if (cc->ignore_skip_hint)
		return;

	if (!page)
		return;

	if (nr_isolated)
		return;

	/*
	 * Only skip pageblocks when all forms of compaction will be known to
	 * fail in the near future.
	 */
	if (set_unsuitable)
		set_pageblock_skip(page);

	pfn = page_to_pfn(page);

	/* Update where async and sync compaction should restart */
	if (migrate_scanner) {
		if (cc->finished_update_migrate)
			return;
		if (pfn > zone->compact_cached_migrate_pfn[0])
			zone->compact_cached_migrate_pfn[0] = pfn;
		if (cc->mode != MIGRATE_ASYNC &&
		    pfn > zone->compact_cached_migrate_pfn[1])
			zone->compact_cached_migrate_pfn[1] = pfn;
	} else {
		if (cc->finished_update_free)
			return;
		if (pfn < zone->compact_cached_free_pfn)
			zone->compact_cached_free_pfn = pfn;
	}
}
#else
static inline bool isolation_suitable(struct compact_control *cc,
					struct page *page)
{
	return true;
}

static void update_pageblock_skip(struct compact_control *cc,
			struct page *page, unsigned long nr_isolated,
			bool set_unsuitable, bool migrate_scanner)
{
}
#endif /* CONFIG_COMPACTION */

static inline bool should_release_lock(spinlock_t *lock)
{
	return need_resched() || spin_is_contended(lock);
}

/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. Check if the process needs to be scheduled or
 * if the lock is contended. For async compaction, back out if contention
 * is severe. For sync compaction, schedule.
 *
 * Returns true if the lock is held.
 * Returns false if the lock is released and compaction should abort.
 */
static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
				      bool locked, struct compact_control *cc)
{
	if (should_release_lock(lock)) {
		if (locked) {
			spin_unlock_irqrestore(lock, *flags);
			locked = false;
		}

		/* async aborts if taking too long or contended */
		if (cc->mode == MIGRATE_ASYNC) {
			cc->contended = true;
			return false;
		}

		cond_resched();
	}

	if (!locked)
		spin_lock_irqsave(lock, *flags);
	return true;
}

/*
 * Aside from avoiding lock contention, compaction also periodically checks
 * need_resched() and either schedules in sync compaction or aborts async
 * compaction. This is similar to what compact_checklock_irqsave() does, but
 * is used where no lock is concerned.
 *
 * Returns false when no scheduling was needed, or sync compaction scheduled.
 * Returns true when async compaction should abort.
 */
static inline bool compact_should_abort(struct compact_control *cc)
{
	/* async compaction aborts if contended */
	if (need_resched()) {
		if (cc->mode == MIGRATE_ASYNC) {
			cc->contended = true;
			return true;
		}

		cond_resched();
	}

	return false;
}

/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct page *page)
{
	/* If the page is a large free page, then disallow migration */
	if (PageBuddy(page) && page_order(page) >= pageblock_order)
		return false;

	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
	if (migrate_async_suitable(get_pageblock_migratetype(page)))
		return true;

	/* Otherwise skip the block */
	return false;
}

/*
 * Isolate free pages onto a private freelist. If @strict is true, abort and
 * return 0 on any invalid PFNs or non-free pages inside the pageblock
 * (even though it may still end up isolating some pages).
 */
static unsigned long isolate_freepages_block(struct compact_control *cc,
				unsigned long blockpfn,
				unsigned long end_pfn,
				struct list_head *freelist,
				bool strict)
{
	int nr_scanned = 0, total_isolated = 0;
	struct page *cursor, *valid_page = NULL;
	unsigned long flags;
	bool locked = false;
	bool checked_pageblock = false;

	cursor = pfn_to_page(blockpfn);

	/* Isolate free pages. */
	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
		int isolated, i;
		struct page *page = cursor;

		nr_scanned++;
		if (!pfn_valid_within(blockpfn))
			goto isolate_fail;

		if (!valid_page)
			valid_page = page;
		if (!PageBuddy(page))
			goto isolate_fail;

		/*
		 * The zone lock must be held to isolate freepages.
		 * Unfortunately this is a very coarse lock and can be
		 * heavily contended if there are parallel allocations
		 * or parallel compactions. For async compaction, do not
		 * spin on the lock and acquire it as late as possible.
		 */
		locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
								locked, cc);
		if (!locked)
			break;

		/* Recheck this is a suitable migration target under lock */
		if (!strict && !checked_pageblock) {
			/*
			 * We need to check the suitability of the pageblock
			 * only once, and isolate_freepages_block() is called
			 * within a single pageblock, so checking once is
			 * sufficient.
			 */
			checked_pageblock = true;
			if (!suitable_migration_target(page))
				break;
		}

		/* Recheck this is a buddy page under lock */
		if (!PageBuddy(page))
			goto isolate_fail;

		/* Found a free page, break it into order-0 pages */
		isolated = split_free_page(page);
		total_isolated += isolated;
		for (i = 0; i < isolated; i++) {
			list_add(&page->lru, freelist);
			page++;
		}

		/* If a page was split, advance to the end of it */
		if (isolated) {
			blockpfn += isolated - 1;
			cursor += isolated - 1;
			continue;
		}

isolate_fail:
		if (strict)
			break;
		else
			continue;

	}

	trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);

	/*
	 * If strict isolation is requested by CMA then check that all the
	 * pages requested were isolated. If there were any failures, 0 is
	 * returned and CMA will fail.
	 */
	if (strict && blockpfn < end_pfn)
		total_isolated = 0;

	if (locked)
		spin_unlock_irqrestore(&cc->zone->lock, flags);

	/* Update the pageblock-skip if the whole pageblock was scanned */
	if (blockpfn == end_pfn)
		update_pageblock_skip(cc, valid_page, total_isolated, true,
				      false);

	count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
	if (total_isolated)
		count_compact_events(COMPACTISOLATED, total_isolated);
	return total_isolated;
}

/**
 * isolate_freepages_range() - isolate free pages.
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn: The one-past-last PFN.
 *
 * Non-free pages, invalid PFNs, or zone boundaries within the
 * [start_pfn, end_pfn) range are considered errors, cause function to
 * undo its actions and return zero.
 *
 * Otherwise, function returns one-past-the-last PFN of isolated page
 * (which may be greater than end_pfn if end fell in the middle of
 * a free page).
 */
unsigned long
isolate_freepages_range(struct compact_control *cc,
			unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long isolated, pfn, block_end_pfn;
	LIST_HEAD(freelist);

	for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
		if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn)))
			break;

		/*
		 * On subsequent iterations ALIGN() is actually not needed,
		 * but we keep it so as not to complicate the code.
		 */
		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
		block_end_pfn = min(block_end_pfn, end_pfn);

		isolated = isolate_freepages_block(cc, pfn, block_end_pfn,
						   &freelist, true);

		/*
		 * In strict mode, isolate_freepages_block() returns 0 if
		 * there are any holes in the block (ie. invalid PFNs or
		 * non-free pages).
		 */
		if (!isolated)
			break;

		/*
		 * If we managed to isolate pages, it is always (1 << n) *
		 * pageblock_nr_pages for some non-negative n. (Max order
		 * page may span two pageblocks).
		 */
	}

	/* split_free_page does not map the pages */
	map_pages(&freelist);

	if (pfn < end_pfn) {
		/* Loop terminated early, cleanup. */
		release_freepages(&freelist);
		return 0;
	}

	/* We don't use freelists for anything. */
	return pfn;
}

/* Update the number of anon and file isolated pages in the zone */
static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc)
{
	struct page *page;
	unsigned int count[2] = { 0, };

	list_for_each_entry(page, &cc->migratepages, lru)
		count[!!page_is_file_cache(page)]++;

	/* If locked we can use the interrupt unsafe versions */
	if (locked) {
		__mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
		__mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
	} else {
		mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
		mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
	}
}

/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(struct zone *zone)
{
	unsigned long active, inactive, isolated;

	inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
					zone_page_state(zone, NR_INACTIVE_ANON);
	active = zone_page_state(zone, NR_ACTIVE_FILE) +
					zone_page_state(zone, NR_ACTIVE_ANON);
	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
					zone_page_state(zone, NR_ISOLATED_ANON);

	return isolated > (inactive + active) / 2;
}

/**
 * isolate_migratepages_range() - isolate all migrate-able pages in range.
 * @zone: Zone pages are in.
 * @cc: Compaction control structure.
 * @low_pfn: The first PFN of the range.
 * @end_pfn: The one-past-the-last PFN of the range.
 * @unevictable: true if unevictable pages may be isolated
 *
 * Isolate all pages that can be migrated from the range specified by
 * [low_pfn, end_pfn). Returns zero if there is a fatal signal
 * pending, otherwise the PFN of the first page that was not scanned
 * (which may be less than, equal to or greater than end_pfn).
 *
 * Assumes that cc->migratepages is empty and cc->nr_migratepages is
 * zero.
 *
 * Apart from cc->migratepages and cc->nr_migratepages this function
 * does not modify any of cc's fields, in particular it does not modify
 * (or read for that matter) cc->migrate_pfn.
 */
unsigned long
isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
		unsigned long low_pfn, unsigned long end_pfn, bool unevictable)
{
	unsigned long last_pageblock_nr = 0, pageblock_nr;
	unsigned long nr_scanned = 0, nr_isolated = 0;
	struct list_head *migratelist = &cc->migratepages;
	struct lruvec *lruvec;
	unsigned long flags;
	bool locked = false;
	struct page *page = NULL, *valid_page = NULL;
	bool set_unsuitable = true;
	const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
					ISOLATE_ASYNC_MIGRATE : 0) |
				    (unevictable ? ISOLATE_UNEVICTABLE : 0);

	/*
	 * Ensure that there are not too many pages isolated from the LRU
	 * list by either parallel reclaimers or compaction. If there are,
	 * delay for some time until fewer pages are isolated
	 */
	while (unlikely(too_many_isolated(zone))) {
		/* async migration should just abort */
		if (cc->mode == MIGRATE_ASYNC)
			return 0;

		congestion_wait(BLK_RW_ASYNC, HZ/10);

		if (fatal_signal_pending(current))
			return 0;
	}

	if (compact_should_abort(cc))
		return 0;

	/* Time to isolate some pages for migration */
	for (; low_pfn < end_pfn; low_pfn++) {
		/* give a chance to irqs before checking need_resched() */
		if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
			if (should_release_lock(&zone->lru_lock)) {
				spin_unlock_irqrestore(&zone->lru_lock, flags);
				locked = false;
			}
		}

		/*
		 * migrate_pfn does not necessarily start aligned to a
		 * pageblock. Ensure that pfn_valid is called when moving
		 * into a new MAX_ORDER_NR_PAGES range in case of large
		 * memory holes within the zone
		 */
		if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
			if (!pfn_valid(low_pfn)) {
				low_pfn += MAX_ORDER_NR_PAGES - 1;
				continue;
			}
		}

		if (!pfn_valid_within(low_pfn))
			continue;
		nr_scanned++;

		/*
		 * Get the page and ensure the page is within the same zone.
		 * See the comment in isolate_freepages about overlapping
		 * nodes. It is deliberate that the new zone lock is not taken
		 * as memory compaction should not move pages between nodes.
		 */
		page = pfn_to_page(low_pfn);
		if (page_zone(page) != zone)
			continue;

		if (!valid_page)
			valid_page = page;

		/* If isolation recently failed, do not retry */
		pageblock_nr = low_pfn >> pageblock_order;
		if (last_pageblock_nr != pageblock_nr) {
			int mt;

			last_pageblock_nr = pageblock_nr;
			if (!isolation_suitable(cc, page))
				goto next_pageblock;

			/*
			 * For async migration, also only scan in MOVABLE
			 * blocks. Async migration is optimistic to see if
			 * the minimum amount of work satisfies the allocation
			 */
			mt = get_pageblock_migratetype(page);
			if (cc->mode == MIGRATE_ASYNC &&
			    !migrate_async_suitable(mt)) {
				set_unsuitable = false;
				goto next_pageblock;
			}
		}

		/*
		 * Skip if free. page_order cannot be used without zone->lock
		 * as nothing prevents parallel allocations or buddy merging.
		 */
		if (PageBuddy(page))
			continue;

		/*
		 * Check may be lockless but that's ok as we recheck later.
		 * It's possible to migrate LRU pages and balloon pages
		 * Skip any other type of page
		 */
		if (!PageLRU(page)) {
			if (unlikely(balloon_page_movable(page))) {
				if (locked && balloon_page_isolate(page)) {
					/* Successfully isolated */
					goto isolate_success;
				}
			}
			continue;
		}

		/*
		 * PageLRU is set. lru_lock normally excludes isolation
		 * splitting and collapsing (collapsing has already happened
		 * if PageLRU is set) but the lock is not necessarily taken
		 * here and it is wasteful to take it just to check transhuge.
		 * Check TransHuge without lock and skip the whole pageblock if
		 * it's either a transhuge or hugetlbfs page, as calling
		 * compound_order() without preventing THP from splitting the
		 * page underneath us may return surprising results.
		 */
		if (PageTransHuge(page)) {
			if (!locked)
				goto next_pageblock;
			low_pfn += (1 << compound_order(page)) - 1;
			continue;
		}

		/*
		 * Migration will fail if an anonymous page is pinned in memory,
		 * so avoid taking lru_lock and isolating it unnecessarily in an
		 * admittedly racy check.
		 */
		if (!page_mapping(page) &&
		    page_count(page) > page_mapcount(page))
			continue;

		/* Check if it is ok to still hold the lock */
		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
								locked, cc);
		if (!locked || fatal_signal_pending(current))
			break;

		/* Recheck PageLRU and PageTransHuge under lock */
		if (!PageLRU(page))
			continue;
		if (PageTransHuge(page)) {
			low_pfn += (1 << compound_order(page)) - 1;
			continue;
		}

		lruvec = mem_cgroup_page_lruvec(page, zone);

		/* Try isolate the page */
		if (__isolate_lru_page(page, mode) != 0)
			continue;

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		/* Successfully isolated */
		del_page_from_lru_list(page, lruvec, page_lru(page));

isolate_success:
		cc->finished_update_migrate = true;
		list_add(&page->lru, migratelist);
		cc->nr_migratepages++;
		nr_isolated++;

		/* Avoid isolating too much */
		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
			++low_pfn;
			break;
		}

		continue;

next_pageblock:
		low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
	}

	acct_isolated(zone, locked, cc);

	if (locked)
		spin_unlock_irqrestore(&zone->lru_lock, flags);

	/*
	 * Update the pageblock-skip information and cached scanner pfn,
	 * if the whole pageblock was scanned without isolating any page.
	 */
	if (low_pfn == end_pfn)
		update_pageblock_skip(cc, valid_page, nr_isolated,
				      set_unsuitable, true);

	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);

	count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
	if (nr_isolated)
		count_compact_events(COMPACTISOLATED, nr_isolated);

	return low_pfn;
}

#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION
/*
 * Based on information in the current compact_control, find blocks
 * suitable for isolating free pages from and then isolate them.
 */
static void isolate_freepages(struct zone *zone,
				struct compact_control *cc)
{
	struct page *page;
	unsigned long block_start_pfn;	/* start of current pageblock */
	unsigned long block_end_pfn;	/* end of current pageblock */
	unsigned long low_pfn;	/* lowest pfn scanner is able to scan */
	int nr_freepages = cc->nr_freepages;
	struct list_head *freelist = &cc->freepages;

	/*
	 * Initialise the free scanner. The starting point is where we last
	 * successfully isolated from, zone-cached value, or the end of the
	 * zone when isolating for the first time. We need this aligned to
	 * the pageblock boundary, because we do
	 * block_start_pfn -= pageblock_nr_pages in the for loop.
	 * For ending point, take care when isolating in the last pageblock
	 * of a zone which ends in the middle of a pageblock.
	 * The low boundary is the end of the pageblock the migration scanner
	 * is using.
	 */
	block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
	block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
						zone_end_pfn(zone));
	low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);

	/*
	 * Isolate free pages until enough are available to migrate the
	 * pages on cc->migratepages. We stop searching if the migrate
	 * and free page scanners meet or enough free pages are isolated.
	 */
	for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
				block_end_pfn = block_start_pfn,
				block_start_pfn -= pageblock_nr_pages) {
		unsigned long isolated;

		/*
		 * This can iterate a massively long zone without finding any
		 * suitable migration targets, so periodically check if we need
		 * to schedule, or even abort async compaction.
		 */
		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
						&& compact_should_abort(cc))
			break;

		if (!pfn_valid(block_start_pfn))
			continue;

		/*
		 * Check for overlapping nodes/zones. It's possible on some
		 * configurations to have a setup like
		 * node0 node1 node0
		 * i.e. it's possible that all pages within a zone's range of
		 * pages do not belong to a single zone.
		 */
		page = pfn_to_page(block_start_pfn);
		if (page_zone(page) != zone)
			continue;

		/* Check the block is suitable for migration */
		if (!suitable_migration_target(page))
			continue;

		/* If isolation recently failed, do not retry */
		if (!isolation_suitable(cc, page))
			continue;

		/* Found a block suitable for isolating free pages from */
		cc->free_pfn = block_start_pfn;
		isolated = isolate_freepages_block(cc, block_start_pfn,
					block_end_pfn, freelist, false);
		nr_freepages += isolated;

		/*
		 * Set a flag that we successfully isolated in this pageblock.
		 * In the next loop iteration, zone->compact_cached_free_pfn
		 * will not be updated and thus it will effectively contain the
		 * highest pageblock we isolated pages from.
		 */
		if (isolated)
			cc->finished_update_free = true;

		/*
		 * isolate_freepages_block() might have aborted due to async
		 * compaction being contended
		 */
		if (cc->contended)
			break;
	}

	/* split_free_page does not map the pages */
	map_pages(freelist);

	/*
	 * If we crossed the migrate scanner, we want to keep it that way
	 * so that compact_finished() may detect this
	 */
	if (block_start_pfn < low_pfn)
		cc->free_pfn = cc->migrate_pfn;

	cc->nr_freepages = nr_freepages;
}

/*
 * This is a migrate-callback that "allocates" freepages by taking pages
 * from the isolated freelists in the block we are migrating to.
 */
static struct page *compaction_alloc(struct page *migratepage,
					unsigned long data,
					int **result)
{
	struct compact_control *cc = (struct compact_control *)data;
	struct page *freepage;

	/*
	 * Isolate free pages if necessary, and if we are not aborting due to
	 * contention.
	 */
	if (list_empty(&cc->freepages)) {
		if (!cc->contended)
			isolate_freepages(cc->zone, cc);

		if (list_empty(&cc->freepages))
			return NULL;
	}

	freepage = list_entry(cc->freepages.next, struct page, lru);
	list_del(&freepage->lru);
	cc->nr_freepages--;

	return freepage;
}

/*
 * This is a migrate-callback that "frees" freepages back to the isolated
 * freelist.
 * All pages on the freelist are from the same zone, so there is no
 * special handling needed for NUMA.
 */
static void compaction_free(struct page *page, unsigned long data)
{
	struct compact_control *cc = (struct compact_control *)data;

	list_add(&page->lru, &cc->freepages);
	cc->nr_freepages++;
}

/* possible outcome of isolate_migratepages */
typedef enum {
	ISOLATE_ABORT,		/* Abort compaction now */
	ISOLATE_NONE,		/* No pages isolated, continue scanning */
	ISOLATE_SUCCESS,	/* Pages isolated, migrate */
} isolate_migrate_t;

/*
 * Isolate all pages that can be migrated from the block pointed to by
 * the migrate scanner within compact_control.
 */
static isolate_migrate_t isolate_migratepages(struct zone *zone,
					struct compact_control *cc)
{
	unsigned long low_pfn, end_pfn;

	/* Do not scan outside zone boundaries */
	low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);

	/* Only scan within a pageblock boundary */
	end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);

	/* Do not cross the free scanner or scan within a memory hole */
	if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
		cc->migrate_pfn = end_pfn;
		return ISOLATE_NONE;
	}

	/* Perform the isolation */
	low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false);
	if (!low_pfn || cc->contended)
		return ISOLATE_ABORT;

	cc->migrate_pfn = low_pfn;

	return ISOLATE_SUCCESS;
}

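/*
 * Decide whether a compaction run can stop. Returns COMPACT_COMPLETE when
 * the migrate and free scanners have met, COMPACT_PARTIAL when compaction
 * was aborted (contention, fatal signal) or a page of the requested order
 * appears to be free already, and COMPACT_CONTINUE otherwise.
 */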
static int compact_finished(struct zone *zone,
			    struct compact_control *cc)
{
	unsigned int order;
	unsigned long watermark;

	if (cc->contended || fatal_signal_pending(current))
		return COMPACT_PARTIAL;

	/* Compaction run completes if the migrate and free scanner meet */
	if (cc->free_pfn <= cc->migrate_pfn) {
		/* Let the next compaction start anew. */
		zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
		zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
		zone->compact_cached_free_pfn = zone_end_pfn(zone);

		/*
		 * Mark that the PG_migrate_skip information should be cleared
		 * by kswapd when it goes to sleep. kswapd does not set the
		 * flag itself as the decision to clear it should be based
		 * directly on an allocation request.
		 */
		if (!current_is_kswapd())
			zone->compact_blockskip_flush = true;

		return COMPACT_COMPLETE;
	}

	/*
	 * order == -1 is expected when compacting via
	 * /proc/sys/vm/compact_memory
	 */
	if (cc->order == -1)
		return COMPACT_CONTINUE;

	/* Compaction run is not finished if the watermark is not met */
	watermark = low_wmark_pages(zone);
	watermark += (1 << cc->order);

	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
		return COMPACT_CONTINUE;

	/* Direct compactor: Is a suitable page free? */
	for (order = cc->order; order < MAX_ORDER; order++) {
		struct free_area *area = &zone->free_area[order];

		/* Job done if page is free of the right migratetype */
		if (!list_empty(&area->free_list[cc->migratetype]))
			return COMPACT_PARTIAL;

		/* Job done if allocation would set block type */
		if (cc->order >= pageblock_order && area->nr_free)
			return COMPACT_PARTIAL;
	}

	return COMPACT_CONTINUE;
}

/*
 * compaction_suitable: Is this suitable to run compaction on this zone now?
 * Returns
 *   COMPACT_SKIPPED  - If there are too few free pages for compaction
 *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
 *   COMPACT_CONTINUE - If compaction should run now
 */
unsigned long compaction_suitable(struct zone *zone, int order)
{
	int fragindex;
	unsigned long watermark;

	/*
	 * order == -1 is expected when compacting via
	 * /proc/sys/vm/compact_memory
	 */
	if (order == -1)
		return COMPACT_CONTINUE;

	/*
	 * Watermarks for order-0 must be met for compaction. Note the 2UL.
	 * This is because during migration, copies of pages need to be
	 * allocated and for a short time, the footprint is higher
	 */
	watermark = low_wmark_pages(zone) + (2UL << order);
	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
		return COMPACT_SKIPPED;

	/*
	 * fragmentation index determines if allocation failures are due to
	 * low memory or external fragmentation
	 *
	 * index of -1000 implies allocations might succeed depending on
	 * watermarks
	 * index towards 0 implies failure is due to lack of memory
	 * index towards 1000 implies failure is due to fragmentation
	 *
	 * Only compact if a failure would be due to fragmentation.
	 */
	fragindex = fragmentation_index(zone, order);
	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
		return COMPACT_SKIPPED;

	if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
	    0, 0))
		return COMPACT_PARTIAL;

	return COMPACT_CONTINUE;
}

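/*
 * Compact a single zone: position the migrate and free scanners (from the
 * zone's cached values when those are within zone boundaries), then
 * repeatedly isolate and migrate pages until compact_finished() reports a
 * result or an error or contention forces an early exit.
 */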
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
	int ret;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	const bool sync = cc->mode != MIGRATE_ASYNC;

	ret = compaction_suitable(zone, cc->order);
	switch (ret) {
	case COMPACT_PARTIAL:
	case COMPACT_SKIPPED:
		/* Compaction is likely to fail */
		return ret;
	case COMPACT_CONTINUE:
		/* Fall through to compaction */
		;
	}

	/*
	 * Clear pageblock skip if there were failures recently and compaction
	 * is about to be retried after being deferred. kswapd does not do
	 * this reset as it'll reset the cached information when going to sleep.
	 */
	if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
		__reset_isolation_suitable(zone);

	/*
	 * Setup to move all movable pages to the end of the zone. Use cached
	 * information on where the scanners should start but check that it
	 * is initialised by ensuring the values are within zone boundaries.
	 */
	cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
	cc->free_pfn = zone->compact_cached_free_pfn;
	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
		zone->compact_cached_free_pfn = cc->free_pfn;
	}
	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
		cc->migrate_pfn = start_pfn;
		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
	}

	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);

	migrate_prep_local();

	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
		int err;

		switch (isolate_migratepages(zone, cc)) {
		case ISOLATE_ABORT:
			ret = COMPACT_PARTIAL;
			putback_movable_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
			goto out;
		case ISOLATE_NONE:
			continue;
		case ISOLATE_SUCCESS:
			;
		}

		if (!cc->nr_migratepages)
			continue;

		err = migrate_pages(&cc->migratepages, compaction_alloc,
				compaction_free, (unsigned long)cc, cc->mode,
				MR_COMPACTION);

		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
							&cc->migratepages);

		/* All pages were either migrated or will be released */
		cc->nr_migratepages = 0;
		if (err) {
			putback_movable_pages(&cc->migratepages);
			/*
			 * migrate_pages() may return -ENOMEM when scanners meet
			 * and we want compact_finished() to detect it
			 */
			if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
				ret = COMPACT_PARTIAL;
				goto out;
			}
		}
	}

out:
	/* Release free pages and check accounting */
	cc->nr_freepages -= release_freepages(&cc->freepages);
	VM_BUG_ON(cc->nr_freepages != 0);

	trace_mm_compaction_end(ret);

	return ret;
}

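/*
 * Run compact_zone() on a single zone on behalf of a direct compactor,
 * building a temporary compact_control for the request and reporting back
 * whether the attempt was aborted because of lock contention.
 */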
static unsigned long compact_zone_order(struct zone *zone, int order,
		gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
{
	unsigned long ret;
	struct compact_control cc = {
		.nr_freepages = 0,
		.nr_migratepages = 0,
		.order = order,
		.migratetype = allocflags_to_migratetype(gfp_mask),
		.zone = zone,
		.mode = mode,
	};
	INIT_LIST_HEAD(&cc.freepages);
	INIT_LIST_HEAD(&cc.migratepages);

	ret = compact_zone(zone, &cc);

	VM_BUG_ON(!list_empty(&cc.freepages));
	VM_BUG_ON(!list_empty(&cc.migratepages));

	*contended = cc.contended;
	return ret;
}

int sysctl_extfrag_threshold = 500;

/**
 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
 * @zonelist: The zonelist used for the current allocation
 * @order: The order of the current allocation
 * @gfp_mask: The GFP mask of the current allocation
 * @nodemask: The allowed nodes to allocate from
 * @mode: The migration mode for async, sync light, or sync migration
 * @contended: Return value that is true if compaction was aborted due to lock contention
 *
 * This is the main entry point for direct page compaction.
 */
unsigned long try_to_compact_pages(struct zonelist *zonelist,
			int order, gfp_t gfp_mask, nodemask_t *nodemask,
			enum migrate_mode mode, bool *contended)
{
	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
	int may_enter_fs = gfp_mask & __GFP_FS;
	int may_perform_io = gfp_mask & __GFP_IO;
	struct zoneref *z;
	struct zone *zone;
	int rc = COMPACT_SKIPPED;
	int alloc_flags = 0;

	/* Check if the GFP flags allow compaction */
	if (!order || !may_enter_fs || !may_perform_io)
		return rc;

	count_compact_event(COMPACTSTALL);

#ifdef CONFIG_CMA
	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
		alloc_flags |= ALLOC_CMA;
#endif
	/* Compact each zone in the list */
	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
								nodemask) {
		int status;

		status = compact_zone_order(zone, order, gfp_mask, mode,
						contended);
		rc = max(status, rc);

		/* If a normal allocation would succeed, stop compacting */
		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
				      alloc_flags))
			break;
	}

	return rc;
}


/* Compact all zones within a node */
static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
{
	int zoneid;
	struct zone *zone;

	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {

		zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		cc->nr_freepages = 0;
		cc->nr_migratepages = 0;
		cc->zone = zone;
		INIT_LIST_HEAD(&cc->freepages);
		INIT_LIST_HEAD(&cc->migratepages);

		if (cc->order == -1 || !compaction_deferred(zone, cc->order))
			compact_zone(zone, cc);

		if (cc->order > 0) {
			if (zone_watermark_ok(zone, cc->order,
						low_wmark_pages(zone), 0, 0))
				compaction_defer_reset(zone, cc->order, false);
		}

		VM_BUG_ON(!list_empty(&cc->freepages));
		VM_BUG_ON(!list_empty(&cc->migratepages));
	}
}

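/*
 * Compact all populated zones of a node with asynchronous migration,
 * skipping zones where compaction is currently deferred. Order-0 requests
 * are a no-op.
 */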
void compact_pgdat(pg_data_t *pgdat, int order)
{
	struct compact_control cc = {
		.order = order,
		.mode = MIGRATE_ASYNC,
	};

	if (!order)
		return;

	__compact_pgdat(pgdat, &cc);
}

static void compact_node(int nid)
{
	struct compact_control cc = {
		.order = -1,
		.mode = MIGRATE_SYNC,
		.ignore_skip_hint = true,
	};

	__compact_pgdat(NODE_DATA(nid), &cc);
}

/* Compact all nodes in the system */
static void compact_nodes(void)
{
	int nid;

	/* Flush pending updates to the LRU lists */
	lru_add_drain_all();

	for_each_online_node(nid)
		compact_node(nid);
}

/* The written value is actually unused, all memory is compacted */
int sysctl_compact_memory;

/* This is the entry point for compacting all nodes via /proc/sys/vm */
int sysctl_compaction_handler(struct ctl_table *table, int write,
			void __user *buffer, size_t *length, loff_t *ppos)
{
	if (write)
		compact_nodes();

	return 0;
}

int sysctl_extfrag_handler(struct ctl_table *table, int write,
			void __user *buffer, size_t *length, loff_t *ppos)
{
	proc_dointvec_minmax(table, write, buffer, length, ppos);

	return 0;
}

#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
static ssize_t sysfs_compact_node(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int nid = dev->id;

	if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
		/* Flush pending updates to the LRU lists */
		lru_add_drain_all();

		compact_node(nid);
	}

	return count;
}
static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);

int compaction_register_node(struct node *node)
{
	return device_create_file(&node->dev, &dev_attr_compact);
}

void compaction_unregister_node(struct node *node)
{
	return device_remove_file(&node->dev, &dev_attr_compact);
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */

#endif /* CONFIG_COMPACTION */