/*
 * linux/mm/vmscan.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 * Swap reorganised 29.12.95, Stephen Tweedie.
 * kswapd added: 7.1.96  sct
 * Removed kswapd_ctl limits, and swap out as many pages as needed
 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 * Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>  /* for try_to_release_page(),
                                   buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>

#include "internal.h"

struct scan_control {
        /* Incremented by the number of inactive pages that were scanned */
        unsigned long nr_scanned;

        /* This context's GFP mask */
        gfp_t gfp_mask;

        int may_writepage;

        /* Can pages be swapped as part of reclaim? */
        int may_swap;

        /* This context's SWAP_CLUSTER_MAX. If freeing memory for
         * suspend, we effectively ignore SWAP_CLUSTER_MAX.
         * In this context, it doesn't matter that we scan the
         * whole list at once. */
        int swap_cluster_max;

        int swappiness;

        int all_unreclaimable;
};

/*
 * The list of shrinker callbacks used to apply pressure to
 * ageable caches.
 */
struct shrinker {
        shrinker_t shrinker;
        struct list_head list;
        int seeks;      /* seeks to recreate an obj */
        long nr;        /* objs pending delete */
};

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)                    \
        do {                                                            \
                if ((_page)->lru.prev != _base) {                       \
                        struct page *prev;                              \
                                                                        \
                        prev = lru_to_page(&(_page->lru));              \
                        prefetch(&prev->_field);                        \
                }                                                       \
        } while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)                   \
        do {                                                            \
                if ((_page)->lru.prev != _base) {                       \
                        struct page *prev;                              \
                                                                        \
                        prev = lru_to_page(&(_page->lru));              \
                        prefetchw(&prev->_field);                       \
                }                                                       \
        } while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

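/*
 * Note that lru_to_page() returns the entry at the *tail* of the list
 * ((_head)->prev), so reclaim always works from the oldest end of an LRU.
 * The prefetch helpers above merely warm the cache with the flags word of
 * the page that will be looked at next while the current one is processed;
 * on architectures without prefetch support they compile away to nothing.
 */
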
/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
long vm_total_pages;    /* The total number of pages which the VM controls */

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

/*
 * Add a shrinker callback to be called from the vm
 */
struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
{
        struct shrinker *shrinker;

        shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
        if (shrinker) {
                shrinker->shrinker = theshrinker;
                shrinker->seeks = seeks;
                shrinker->nr = 0;
                down_write(&shrinker_rwsem);
                list_add_tail(&shrinker->list, &shrinker_list);
                up_write(&shrinker_rwsem);
        }
        return shrinker;
}
EXPORT_SYMBOL(set_shrinker);

/*
 * Remove one
 */
void remove_shrinker(struct shrinker *shrinker)
{
        down_write(&shrinker_rwsem);
        list_del(&shrinker->list);
        up_write(&shrinker_rwsem);
        kfree(shrinker);
}
EXPORT_SYMBOL(remove_shrinker);

#define SHRINK_BATCH 128
/*
 * Call the shrink functions to age shrinkable caches
 *
 * Here we assume it costs one seek to replace a lru page and that it also
 * takes a seek to recreate a cache object.  With this in mind we age equal
 * percentages of the lru and ageable caches.  This should balance the seeks
 * generated by these structures.
 *
 * If the vm encountered mapped pages on the LRU it increases the pressure on
 * slab to avoid swapping.
 *
 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
 *
 * `lru_pages' represents the number of on-LRU pages in all the zones which
 * are eligible for the caller's allocation attempt.  It is used for balancing
 * slab reclaim versus page reclaim.
 *
 * Returns the number of slab objects which we shrunk.
 */
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
                        unsigned long lru_pages)
{
        struct shrinker *shrinker;
        unsigned long ret = 0;

        if (scanned == 0)
                scanned = SWAP_CLUSTER_MAX;

        if (!down_read_trylock(&shrinker_rwsem))
                return 1;       /* Assume we'll be able to shrink next time */

        list_for_each_entry(shrinker, &shrinker_list, list) {
                unsigned long long delta;
                unsigned long total_scan;
                unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);

                delta = (4 * scanned) / shrinker->seeks;
                delta *= max_pass;
                do_div(delta, lru_pages + 1);
                shrinker->nr += delta;
                if (shrinker->nr < 0) {
                        printk(KERN_ERR "%s: nr=%ld\n",
                                        __FUNCTION__, shrinker->nr);
                        shrinker->nr = max_pass;
                }

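                /*
                 * A rough worked example of the proportional pressure applied
                 * above (illustrative numbers only): if reclaim scanned
                 * scanned = 1024 LRU pages out of lru_pages = 100000 and this
                 * cache reports max_pass = 50000 freeable objects with
                 * seeks = 2, then delta = (4 * 1024 / 2) * 50000 / 100001,
                 * i.e. about 1024 objects - the cache is asked to shed
                 * roughly 2% of its entries after roughly 1% of the LRU was
                 * scanned, the factor of two coming from 4 / seeks.
                 */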
                /*
                 * Avoid risking looping forever due to too large nr value:
                 * never try to free more than twice the estimated number of
                 * freeable entries.
                 */
                if (shrinker->nr > max_pass * 2)
                        shrinker->nr = max_pass * 2;

                total_scan = shrinker->nr;
                shrinker->nr = 0;

                while (total_scan >= SHRINK_BATCH) {
                        long this_scan = SHRINK_BATCH;
                        int shrink_ret;
                        int nr_before;

                        nr_before = (*shrinker->shrinker)(0, gfp_mask);
                        shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
                        if (shrink_ret == -1)
                                break;
                        if (shrink_ret < nr_before)
                                ret += nr_before - shrink_ret;
                        count_vm_events(SLABS_SCANNED, this_scan);
                        total_scan -= this_scan;

                        cond_resched();
                }

                shrinker->nr += total_scan;
        }
        up_read(&shrinker_rwsem);
        return ret;
}

/* Called without lock on whether page is mapped, so answer is unstable */
static inline int page_mapping_inuse(struct page *page)
{
        struct address_space *mapping;

        /* Page is in somebody's page tables. */
        if (page_mapped(page))
                return 1;

        /* Be more reluctant to reclaim swapcache than pagecache */
        if (PageSwapCache(page))
                return 1;

        mapping = page_mapping(page);
        if (!mapping)
                return 0;

        /* File is mmap'd by somebody? */
        return mapping_mapped(mapping);
}

static inline int is_page_cache_freeable(struct page *page)
{
        return page_count(page) - !!PagePrivate(page) == 2;
}

static int may_write_to_queue(struct backing_dev_info *bdi)
{
        if (current->flags & PF_SWAPWRITE)
                return 1;
        if (!bdi_write_congested(bdi))
                return 1;
        if (bdi == current->backing_dev_info)
                return 1;
        return 0;
}

/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller has
 * __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
                                struct page *page, int error)
{
        lock_page(page);
        if (page_mapping(page) == mapping)
                mapping_set_error(mapping, error);
        unlock_page(page);
}

/* possible outcome of pageout() */
typedef enum {
        /* failed to write page out, page is locked */
        PAGE_KEEP,
        /* move page to the active list, page is locked */
        PAGE_ACTIVATE,
        /* page has been sent to the disk successfully, page is unlocked */
        PAGE_SUCCESS,
        /* page is clean and locked */
        PAGE_CLEAN,
} pageout_t;

/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
static pageout_t pageout(struct page *page, struct address_space *mapping)
{
        /*
         * If the page is dirty, only perform writeback if that write
         * will be non-blocking, to prevent this allocation from being
         * stalled by pagecache activity.  But note that there may be
         * stalls if we need to run get_block().  We could test
         * PagePrivate for that.
         *
         * If this process is currently in generic_file_write() against
         * this page's queue, we can perform writeback even if that
         * will block.
         *
         * If the page is swapcache, write it back even if that would
         * block, for some throttling.
         * This happens by accident, because swap_backing_dev_info is bust:
         * it doesn't reflect the congestion state of the swapdevs.  Easy to
         * fix, if needed.  See swapfile.c:page_queue_congested().
         */
        if (!is_page_cache_freeable(page))
                return PAGE_KEEP;
        if (!mapping) {
                /*
                 * Some data journaling orphaned pages can have
                 * page->mapping == NULL while being dirty with clean buffers.
                 */
                if (PagePrivate(page)) {
                        if (try_to_free_buffers(page)) {
                                ClearPageDirty(page);
                                printk("%s: orphaned page\n", __FUNCTION__);
                                return PAGE_CLEAN;
                        }
                }
                return PAGE_KEEP;
        }
        if (mapping->a_ops->writepage == NULL)
                return PAGE_ACTIVATE;
        if (!may_write_to_queue(mapping->backing_dev_info))
                return PAGE_KEEP;

        if (clear_page_dirty_for_io(page)) {
                int res;
                struct writeback_control wbc = {
                        .sync_mode = WB_SYNC_NONE,
                        .nr_to_write = SWAP_CLUSTER_MAX,
                        .range_start = 0,
                        .range_end = LLONG_MAX,
                        .nonblocking = 1,
                        .for_reclaim = 1,
                };

                SetPageReclaim(page);
                res = mapping->a_ops->writepage(page, &wbc);
                if (res < 0)
                        handle_write_error(mapping, page, res);
                if (res == AOP_WRITEPAGE_ACTIVATE) {
                        ClearPageReclaim(page);
                        return PAGE_ACTIVATE;
                }
                if (!PageWriteback(page)) {
                        /* synchronous write or broken a_ops? */
                        ClearPageReclaim(page);
                }
                inc_zone_page_state(page, NR_VMSCAN_WRITE);
                return PAGE_SUCCESS;
        }

        return PAGE_CLEAN;
}

/*
 * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
 * someone else has a ref on the page, abort and return 0.  If it was
 * successfully detached, return 1.  Assumes the caller has a single ref on
 * this page.
 */
int remove_mapping(struct address_space *mapping, struct page *page)
{
        BUG_ON(!PageLocked(page));
        BUG_ON(mapping != page_mapping(page));

        write_lock_irq(&mapping->tree_lock);
        /*
         * The non racy check for a busy page.
         *
         * Must be careful with the order of the tests. When someone has
         * a ref to the page, it may be possible that they dirty it then
         * drop the reference. So if PageDirty is tested before page_count
         * here, then the following race may occur:
         *
         * get_user_pages(&page);
         * [user mapping goes away]
         * write_to(page);
         *                              !PageDirty(page)    [good]
         * SetPageDirty(page);
         * put_page(page);
         *                              !page_count(page)   [good, discard it]
         *
         * [oops, our write_to data is lost]
         *
         * Reversing the order of the tests ensures such a situation cannot
         * escape unnoticed.  The smp_rmb is needed to ensure the page->flags
         * load is not satisfied before that of page->_count.
         *
         * Note that if SetPageDirty is always performed via set_page_dirty,
         * and thus under tree_lock, then this ordering is not required.
         */
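        /*
         * A page count of exactly 2 means one reference from the caller (see
         * the "single ref" assumption above) plus the one held by the page
         * cache or swap cache itself; anything more means someone else may
         * still be using the page and it cannot be freed here.
         */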
        if (unlikely(page_count(page) != 2))
                goto cannot_free;
        smp_rmb();
        if (unlikely(PageDirty(page)))
                goto cannot_free;

        if (PageSwapCache(page)) {
                swp_entry_t swap = { .val = page_private(page) };
                __delete_from_swap_cache(page);
                write_unlock_irq(&mapping->tree_lock);
                swap_free(swap);
                __put_page(page);       /* The pagecache ref */
                return 1;
        }

        __remove_from_page_cache(page);
        write_unlock_irq(&mapping->tree_lock);
        __put_page(page);
        return 1;

cannot_free:
        write_unlock_irq(&mapping->tree_lock);
        return 0;
}

/*
 * shrink_page_list() returns the number of reclaimed pages
 */
static unsigned long shrink_page_list(struct list_head *page_list,
                                        struct scan_control *sc)
{
        LIST_HEAD(ret_pages);
        struct pagevec freed_pvec;
        int pgactivate = 0;
        unsigned long nr_reclaimed = 0;

        cond_resched();

        pagevec_init(&freed_pvec, 1);
        while (!list_empty(page_list)) {
                struct address_space *mapping;
                struct page *page;
                int may_enter_fs;
                int referenced;

                cond_resched();

                page = lru_to_page(page_list);
                list_del(&page->lru);

                if (TestSetPageLocked(page))
                        goto keep;

                VM_BUG_ON(PageActive(page));

                sc->nr_scanned++;

                if (!sc->may_swap && page_mapped(page))
                        goto keep_locked;

                /* Double the slab pressure for mapped and swapcache pages */
                if (page_mapped(page) || PageSwapCache(page))
                        sc->nr_scanned++;

                if (PageWriteback(page))
                        goto keep_locked;

                referenced = page_referenced(page, 1);
                /* In active use or really unfreeable? Activate it. */
                if (referenced && page_mapping_inuse(page))
                        goto activate_locked;

#ifdef CONFIG_SWAP
                /*
                 * Anonymous process memory has backing store?
                 * Try to allocate it some swap space here.
                 */
                if (PageAnon(page) && !PageSwapCache(page))
                        if (!add_to_swap(page, GFP_ATOMIC))
                                goto activate_locked;
#endif /* CONFIG_SWAP */

                mapping = page_mapping(page);
                may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
                        (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

                /*
                 * The page is mapped into the page tables of one or more
                 * processes. Try to unmap it here.
                 */
                if (page_mapped(page) && mapping) {
                        switch (try_to_unmap(page, 0)) {
                        case SWAP_FAIL:
                                goto activate_locked;
                        case SWAP_AGAIN:
                                goto keep_locked;
                        case SWAP_SUCCESS:
                                ; /* try to free the page below */
                        }
                }

                if (PageDirty(page)) {
                        if (referenced)
                                goto keep_locked;
                        if (!may_enter_fs)
                                goto keep_locked;
                        if (!sc->may_writepage)
                                goto keep_locked;

                        /* Page is dirty, try to write it out here */
                        switch (pageout(page, mapping)) {
                        case PAGE_KEEP:
                                goto keep_locked;
                        case PAGE_ACTIVATE:
                                goto activate_locked;
                        case PAGE_SUCCESS:
                                if (PageWriteback(page) || PageDirty(page))
                                        goto keep;
                                /*
                                 * A synchronous write - probably a ramdisk.  Go
                                 * ahead and try to reclaim the page.
                                 */
                                if (TestSetPageLocked(page))
                                        goto keep;
                                if (PageDirty(page) || PageWriteback(page))
                                        goto keep_locked;
                                mapping = page_mapping(page);
                        case PAGE_CLEAN:
                                ; /* try to free the page below */
                        }
                }

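                /*
                 * At this point the page is locked and clean (or its
                 * synchronous writeback has just completed), so it is worth
                 * trying to strip its buffers and detach it from its mapping.
                 */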
                /*
                 * If the page has buffers, try to free the buffer mappings
                 * associated with this page. If we succeed we try to free
                 * the page as well.
                 *
                 * We do this even if the page is PageDirty().
                 * try_to_release_page() does not perform I/O, but it is
                 * possible for a page to have PageDirty set, but it is actually
                 * clean (all its buffers are clean).  This happens if the
                 * buffers were written out directly, with submit_bh(). ext3
                 * will do this, as well as the blockdev mapping.
                 * try_to_release_page() will discover that cleanness and will
                 * drop the buffers and mark the page clean - it can be freed.
                 *
                 * Rarely, pages can have buffers and no ->mapping.  These are
                 * the pages which were not successfully invalidated in
                 * truncate_complete_page().  We try to drop those buffers here
                 * and if that worked, and the page is no longer mapped into
                 * process address space (page_count == 1) it can be freed.
                 * Otherwise, leave the page on the LRU so it is swappable.
                 */
                if (PagePrivate(page)) {
                        if (!try_to_release_page(page, sc->gfp_mask))
                                goto activate_locked;
                        if (!mapping && page_count(page) == 1)
                                goto free_it;
                }

                if (!mapping || !remove_mapping(mapping, page))
                        goto keep_locked;

free_it:
                unlock_page(page);
                nr_reclaimed++;
                if (!pagevec_add(&freed_pvec, page))
                        __pagevec_release_nonlru(&freed_pvec);
                continue;

activate_locked:
                SetPageActive(page);
                pgactivate++;
keep_locked:
                unlock_page(page);
keep:
                list_add(&page->lru, &ret_pages);
                VM_BUG_ON(PageLRU(page));
        }
        list_splice(&ret_pages, page_list);
        if (pagevec_count(&freed_pvec))
                __pagevec_release_nonlru(&freed_pvec);
        count_vm_events(PGACTIVATE, pgactivate);
        return nr_reclaimed;
}

/*
 * zone->lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function.
 *
 * @nr_to_scan: The number of pages to look through on the list.
 * @src:        The LRU list to pull pages off.
 * @dst:        The temp list to put pages on to.
 * @scanned:    The number of pages that were scanned.
 *
 * returns how many pages were moved onto *@dst.
 */
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                struct list_head *src, struct list_head *dst,
                unsigned long *scanned)
{
        unsigned long nr_taken = 0;
        struct page *page;
        unsigned long scan;

        for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
                struct list_head *target;
                page = lru_to_page(src);
                prefetchw_prev_lru_page(page, src, flags);

                VM_BUG_ON(!PageLRU(page));

                list_del(&page->lru);
                target = src;
                if (likely(get_page_unless_zero(page))) {
                        /*
                         * Be careful not to clear PageLRU until after we're
                         * sure the page is not being freed elsewhere -- the
                         * page release code relies on it.
                         */
                        ClearPageLRU(page);
                        target = dst;
                        nr_taken++;
                } /* else it is being freed elsewhere */

                list_add(&page->lru, target);
        }

        *scanned = scan;
        return nr_taken;
}

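/*
 * Pages moved onto @dst by isolate_lru_pages() have been taken off the LRU
 * (PG_lru cleared) and carry an extra reference from get_page_unless_zero();
 * the callers below either free them or put them back on an LRU list and
 * drop that reference via a pagevec.
 */
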
/*
 * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
 * of reclaimed pages
 */
static unsigned long shrink_inactive_list(unsigned long max_scan,
                                struct zone *zone, struct scan_control *sc)
{
        LIST_HEAD(page_list);
        struct pagevec pvec;
        unsigned long nr_scanned = 0;
        unsigned long nr_reclaimed = 0;

        pagevec_init(&pvec, 1);

        lru_add_drain();
        spin_lock_irq(&zone->lru_lock);
        do {
                struct page *page;
                unsigned long nr_taken;
                unsigned long nr_scan;
                unsigned long nr_freed;

                nr_taken = isolate_lru_pages(sc->swap_cluster_max,
                                             &zone->inactive_list,
                                             &page_list, &nr_scan);
                __mod_zone_page_state(zone, NR_INACTIVE, -nr_taken);
                zone->pages_scanned += nr_scan;
                spin_unlock_irq(&zone->lru_lock);

                nr_scanned += nr_scan;
                nr_freed = shrink_page_list(&page_list, sc);
                nr_reclaimed += nr_freed;
                local_irq_disable();
                if (current_is_kswapd()) {
                        __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
                        __count_vm_events(KSWAPD_STEAL, nr_freed);
                } else
                        __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
                __count_zone_vm_events(PGSTEAL, zone, nr_freed);

                if (nr_taken == 0)
                        goto done;

                spin_lock(&zone->lru_lock);
                /*
                 * Put back any unfreeable pages.
                 */
                while (!list_empty(&page_list)) {
                        page = lru_to_page(&page_list);
                        VM_BUG_ON(PageLRU(page));
                        SetPageLRU(page);
                        list_del(&page->lru);
                        if (PageActive(page))
                                add_page_to_active_list(zone, page);
                        else
                                add_page_to_inactive_list(zone, page);
                        if (!pagevec_add(&pvec, page)) {
                                spin_unlock_irq(&zone->lru_lock);
                                __pagevec_release(&pvec);
                                spin_lock_irq(&zone->lru_lock);
                        }
                }
        } while (nr_scanned < max_scan);
        spin_unlock(&zone->lru_lock);
done:
        local_irq_enable();
        pagevec_release(&pvec);
        return nr_reclaimed;
}

/*
 * We are about to scan this zone at a certain priority level.  If that priority
 * level is smaller (ie: more urgent) than the previous priority, then note
 * that priority level within the zone.  This is done so that when the next
 * process comes in to scan this zone, it will immediately start out at this
 * priority level rather than having to build up its own scanning priority.
 * Here, this priority affects only the reclaim-mapped threshold.
 */
static inline void note_zone_scanning_priority(struct zone *zone, int priority)
{
        if (priority < zone->prev_priority)
                zone->prev_priority = priority;
}

static inline int zone_is_near_oom(struct zone *zone)
{
        return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE)
                                + zone_page_state(zone, NR_INACTIVE))*3;
}

/*
 * This moves pages from the active list to the inactive list.
 *
 * We move them the other way if the page is referenced by one or more
 * processes, from rmap.
 *
 * If the pages are mostly unmapped, the processing is fast and it is
 * appropriate to hold zone->lru_lock across the whole operation.  But if
 * the pages are mapped, the processing is slow (page_referenced()) so we
 * should drop zone->lru_lock around each page.  It's impossible to balance
 * this, so instead we remove the pages from the LRU while processing them.
 * It is safe to rely on PG_active against the non-LRU pages in here because
 * nobody will play with that bit on a non-LRU page.
 *
 * The downside is that we have to touch page->_count against each page.
 * But we had to alter page->flags anyway.
 */
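/*
 * A rough feel for the reclaim_mapped heuristic below (illustrative numbers
 * only): with half of memory mapped (mapped_ratio = 50), no reclaim trouble
 * yet (distress = 0) and the default vm_swappiness of 60, swap_tendency is
 * 50/2 + 0 + 60 = 85, so only unmapped pagecache is deactivated.  Once the
 * scanning priority has dropped to 2 or below, distress becomes
 * 100 >> 2 = 25 and the sum reaches 110, at which point mapped pages are
 * deactivated as well.  A vm_swappiness of 100 crosses the threshold on
 * its own.
 */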
static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                                struct scan_control *sc, int priority)
{
        unsigned long pgmoved;
        int pgdeactivate = 0;
        unsigned long pgscanned;
        LIST_HEAD(l_hold);      /* The pages which were snipped off */
        LIST_HEAD(l_inactive);  /* Pages to go onto the inactive_list */
        LIST_HEAD(l_active);    /* Pages to go onto the active_list */
        struct page *page;
        struct pagevec pvec;
        int reclaim_mapped = 0;

        if (sc->may_swap) {
                long mapped_ratio;
                long distress;
                long swap_tendency;

                if (zone_is_near_oom(zone))
                        goto force_reclaim_mapped;

                /*
                 * `distress' is a measure of how much trouble we're having
                 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
                 */
                distress = 100 >> min(zone->prev_priority, priority);

                /*
                 * The point of this algorithm is to decide when to start
                 * reclaiming mapped memory instead of just pagecache.  Work
                 * out how much memory is mapped.
                 */
                mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
                                global_page_state(NR_ANON_PAGES)) * 100) /
                                        vm_total_pages;

                /*
                 * Now decide how much we really want to unmap some pages.  The
                 * mapped ratio is downgraded - just because there's a lot of
                 * mapped memory doesn't necessarily mean that page reclaim
                 * isn't succeeding.
                 *
                 * The distress ratio is important - we don't want to start
                 * going oom.
                 *
                 * A 100% value of vm_swappiness overrides this algorithm
                 * altogether.
                 */
                swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;

                /*
                 * Now use this metric to decide whether to start moving mapped
                 * memory onto the inactive list.
                 */
                if (swap_tendency >= 100)
force_reclaim_mapped:
                        reclaim_mapped = 1;
        }

        lru_add_drain();
        spin_lock_irq(&zone->lru_lock);
        pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
                                    &l_hold, &pgscanned);
        zone->pages_scanned += pgscanned;
        __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
        spin_unlock_irq(&zone->lru_lock);

        while (!list_empty(&l_hold)) {
                cond_resched();
                page = lru_to_page(&l_hold);
                list_del(&page->lru);
                if (page_mapped(page)) {
                        if (!reclaim_mapped ||
                            (total_swap_pages == 0 && PageAnon(page)) ||
                            page_referenced(page, 0)) {
                                list_add(&page->lru, &l_active);
                                continue;
                        }
                }
                list_add(&page->lru, &l_inactive);
        }

        pagevec_init(&pvec, 1);
        pgmoved = 0;
        spin_lock_irq(&zone->lru_lock);
        while (!list_empty(&l_inactive)) {
                page = lru_to_page(&l_inactive);
                prefetchw_prev_lru_page(page, &l_inactive, flags);
                VM_BUG_ON(PageLRU(page));
                SetPageLRU(page);
                VM_BUG_ON(!PageActive(page));
                ClearPageActive(page);

                list_move(&page->lru, &zone->inactive_list);
                pgmoved++;
                if (!pagevec_add(&pvec, page)) {
                        __mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
                        spin_unlock_irq(&zone->lru_lock);
                        pgdeactivate += pgmoved;
                        pgmoved = 0;
                        if (buffer_heads_over_limit)
                                pagevec_strip(&pvec);
                        __pagevec_release(&pvec);
                        spin_lock_irq(&zone->lru_lock);
                }
        }
        __mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
        pgdeactivate += pgmoved;
        if (buffer_heads_over_limit) {
                spin_unlock_irq(&zone->lru_lock);
                pagevec_strip(&pvec);
                spin_lock_irq(&zone->lru_lock);
        }

        pgmoved = 0;
        while (!list_empty(&l_active)) {
                page = lru_to_page(&l_active);
                prefetchw_prev_lru_page(page, &l_active, flags);
                VM_BUG_ON(PageLRU(page));
                SetPageLRU(page);
                VM_BUG_ON(!PageActive(page));
                list_move(&page->lru, &zone->active_list);
                pgmoved++;
                if (!pagevec_add(&pvec, page)) {
                        __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
                        pgmoved = 0;
                        spin_unlock_irq(&zone->lru_lock);
                        __pagevec_release(&pvec);
                        spin_lock_irq(&zone->lru_lock);
                }
        }
        __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);

        __count_zone_vm_events(PGREFILL, zone, pgscanned);
        __count_vm_events(PGDEACTIVATE, pgdeactivate);
        spin_unlock_irq(&zone->lru_lock);

        pagevec_release(&pvec);
}

/*
 * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
 */
static unsigned long shrink_zone(int priority, struct zone *zone,
                                struct scan_control *sc)
{
        unsigned long nr_active;
        unsigned long nr_inactive;
        unsigned long nr_to_scan;
        unsigned long nr_reclaimed = 0;

        atomic_inc(&zone->reclaim_in_progress);

        /*
         * Add one to `nr_to_scan' just to make sure that the kernel will
         * slowly sift through the active list.
         */
        zone->nr_scan_active +=
                (zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
        nr_active = zone->nr_scan_active;
        if (nr_active >= sc->swap_cluster_max)
                zone->nr_scan_active = 0;
        else
                nr_active = 0;

        zone->nr_scan_inactive +=
                (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
        nr_inactive = zone->nr_scan_inactive;
        if (nr_inactive >= sc->swap_cluster_max)
                zone->nr_scan_inactive = 0;
        else
                nr_inactive = 0;

        while (nr_active || nr_inactive) {
                if (nr_active) {
                        nr_to_scan = min(nr_active,
                                        (unsigned long)sc->swap_cluster_max);
                        nr_active -= nr_to_scan;
                        shrink_active_list(nr_to_scan, zone, sc, priority);
                }

                if (nr_inactive) {
                        nr_to_scan = min(nr_inactive,
                                        (unsigned long)sc->swap_cluster_max);
                        nr_inactive -= nr_to_scan;
                        nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
                                                                sc);
                }
        }

        throttle_vm_writeout(sc->gfp_mask);

        atomic_dec(&zone->reclaim_in_progress);
        return nr_reclaimed;
}

/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * We reclaim from a zone even if that zone is over pages_high.  Because:
 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
 *    allocation or
 * b) The zones may be over pages_high but they must go *over* pages_high to
 *    satisfy the `incremental min' zone defense algorithm.
 *
 * Returns the number of reclaimed pages.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 */
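/*
 * For a sense of scale (illustrative only): shrink_zone() above nudges each
 * list by roughly (list size >> priority) pages per call, so at DEF_PRIORITY
 * (12) only about 1/4096th of a zone's LRU is considered, while at priority 0
 * the whole list is eligible.  The accumulated count is acted upon only once
 * it reaches sc->swap_cluster_max, which is what batches the work.
 */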
static unsigned long shrink_zones(int priority, struct zone **zones,
                                        struct scan_control *sc)
{
        unsigned long nr_reclaimed = 0;
        int i;

        sc->all_unreclaimable = 1;
        for (i = 0; zones[i] != NULL; i++) {
                struct zone *zone = zones[i];

                if (!populated_zone(zone))
                        continue;

                if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                        continue;

                note_zone_scanning_priority(zone, priority);

                if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                        continue;       /* Let kswapd poll it */

                sc->all_unreclaimable = 0;

                nr_reclaimed += shrink_zone(priority, zone, sc);
        }
        return nr_reclaimed;
}

/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick pdflush and take explicit naps in the
 * hope that some of these pages can be written.  But if the allocating task
 * holds filesystem locks which prevent writeout this might not work, and the
 * allocation attempt will fail.
 */
unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
{
        int priority;
        int ret = 0;
        unsigned long total_scanned = 0;
        unsigned long nr_reclaimed = 0;
        struct reclaim_state *reclaim_state = current->reclaim_state;
        unsigned long lru_pages = 0;
        int i;
        struct scan_control sc = {
                .gfp_mask = gfp_mask,
                .may_writepage = !laptop_mode,
                .swap_cluster_max = SWAP_CLUSTER_MAX,
                .may_swap = 1,
                .swappiness = vm_swappiness,
        };

        count_vm_event(ALLOCSTALL);

        for (i = 0; zones[i] != NULL; i++) {
                struct zone *zone = zones[i];

                if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                        continue;

                lru_pages += zone_page_state(zone, NR_ACTIVE)
                                + zone_page_state(zone, NR_INACTIVE);
        }

        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
                sc.nr_scanned = 0;
                if (!priority)
                        disable_swap_token();
                nr_reclaimed += shrink_zones(priority, zones, &sc);
                shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
                if (reclaim_state) {
                        nr_reclaimed += reclaim_state->reclaimed_slab;
                        reclaim_state->reclaimed_slab = 0;
                }
                total_scanned += sc.nr_scanned;
                if (nr_reclaimed >= sc.swap_cluster_max) {
                        ret = 1;
                        goto out;
                }

                /*
                 * Try to write back as many pages as we just scanned.  This
                 * tends to cause slow streaming writers to write data to the
                 * disk smoothly, at the dirtying rate, which is nice.  But
                 * that's undesirable in laptop mode, where we *want* lumpy
                 * writeout.  So in laptop mode, write out the whole world.
                 */
                if (total_scanned > sc.swap_cluster_max +
                                        sc.swap_cluster_max / 2) {
                        wakeup_pdflush(laptop_mode ? 0 : total_scanned);
                        sc.may_writepage = 1;
                }

                /* Take a nap, wait for some writeback to complete */
                if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
                        congestion_wait(WRITE, HZ/10);
        }
        /* top priority shrink_zones still had more to do? don't OOM, then */
        if (!sc.all_unreclaimable)
                ret = 1;
out:
        /*
         * Now that we've scanned all the zones at this priority level, note
         * that level within the zone so that the next thread which performs
         * scanning of this zone will immediately start out at this priority
         * level.  This affects only the decision whether or not to bring
         * mapped pages onto the inactive list.
         */
        if (priority < 0)
                priority = 0;
        for (i = 0; zones[i] != 0; i++) {
                struct zone *zone = zones[i];

                if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                        continue;

                zone->prev_priority = priority;
        }
        return ret;
}

/*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at pages_high.
 *
 * Returns the number of pages which were actually freed.
 *
 * There is special handling here for zones which are full of pinned pages.
 * This can happen if the pages are all mlocked, or if they are all used by
 * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
 * What we do is to detect the case where all pages in the zone have been
 * scanned twice and there has been zero successful reclaim.  Mark the zone as
 * dead and from now on, only perform a short scan.  Basically we're polling
 * the zone for when the problem goes away.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > pages_high, but once a zone is found to have
 * free_pages <= pages_high, we scan that zone and the lower zones regardless
 * of the number of free pages in the lower zones.  This interoperates with
 * the page allocator fallback scheme to ensure that aging of pages is balanced
 * across the zones.
 */
static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
{
        int all_zones_ok;
        int priority;
        int i;
        unsigned long total_scanned;
        unsigned long nr_reclaimed;
        struct reclaim_state *reclaim_state = current->reclaim_state;
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
                .may_swap = 1,
                .swap_cluster_max = SWAP_CLUSTER_MAX,
                .swappiness = vm_swappiness,
        };
        /*
         * temp_priority is used to remember the scanning priority at which
         * this zone was successfully refilled to free_pages == pages_high.
         */
        int temp_priority[MAX_NR_ZONES];

loop_again:
        total_scanned = 0;
        nr_reclaimed = 0;
        sc.may_writepage = !laptop_mode;
        count_vm_event(PAGEOUTRUN);

        for (i = 0; i < pgdat->nr_zones; i++)
                temp_priority[i] = DEF_PRIORITY;

        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
                int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
                unsigned long lru_pages = 0;

                /* The swap token gets in the way of swapout... */
                if (!priority)
                        disable_swap_token();

                all_zones_ok = 1;

                /*
                 * Scan in the highmem->dma direction for the highest
                 * zone which needs scanning
                 */
                for (i = pgdat->nr_zones - 1; i >= 0; i--) {
                        struct zone *zone = pgdat->node_zones + i;

                        if (!populated_zone(zone))
                                continue;

                        if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                                continue;

                        if (!zone_watermark_ok(zone, order, zone->pages_high,
                                               0, 0)) {
                                end_zone = i;
                                break;
                        }
                }
                if (i < 0)
                        goto out;

                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;

                        lru_pages += zone_page_state(zone, NR_ACTIVE)
                                        + zone_page_state(zone, NR_INACTIVE);
                }

                /*
                 * Now scan the zone in the dma->highmem direction, stopping
                 * at the last zone which needs scanning.
                 *
                 * We do this because the page allocator works in the opposite
                 * direction.  This prevents the page allocator from allocating
                 * pages behind kswapd's direction of progress, which would
                 * cause too much scanning of the lower zones.
                 */
                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;
                        int nr_slab;

                        if (!populated_zone(zone))
                                continue;

                        if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                                continue;

                        if (!zone_watermark_ok(zone, order, zone->pages_high,
                                               end_zone, 0))
                                all_zones_ok = 0;
                        temp_priority[i] = priority;
                        sc.nr_scanned = 0;
                        note_zone_scanning_priority(zone, priority);
                        nr_reclaimed += shrink_zone(priority, zone, &sc);
                        reclaim_state->reclaimed_slab = 0;
                        nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
                                                lru_pages);
                        nr_reclaimed += reclaim_state->reclaimed_slab;
                        total_scanned += sc.nr_scanned;
                        if (zone->all_unreclaimable)
                                continue;
                        if (nr_slab == 0 && zone->pages_scanned >=
                                    (zone_page_state(zone, NR_ACTIVE)
                                    + zone_page_state(zone, NR_INACTIVE)) * 6)
                                zone->all_unreclaimable = 1;
                        /*
                         * If we've done a decent amount of scanning and
                         * the reclaim ratio is low, start doing writepage
                         * even in laptop mode
                         */
                        if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
                            total_scanned > nr_reclaimed + nr_reclaimed / 2)
                                sc.may_writepage = 1;
                }
                if (all_zones_ok)
                        break;          /* kswapd: all done */
                /*
                 * OK, kswapd is getting into trouble.  Take a nap, then take
                 * another pass across the zones.
                 */
                if (total_scanned && priority < DEF_PRIORITY - 2)
                        congestion_wait(WRITE, HZ/10);

                /*
                 * We do this so kswapd doesn't build up large priorities for
                 * example when it is freeing in parallel with allocators.  It
                 * matches the direct reclaim path behaviour in terms of impact
                 * on zone->*_priority.
                 */
                if (nr_reclaimed >= SWAP_CLUSTER_MAX)
                        break;
        }
out:
        /*
         * Note within each zone the priority level at which this zone was
         * brought into a happy state.  So that the next thread which scans
         * this zone will start out at that priority level.
         */
        for (i = 0; i < pgdat->nr_zones; i++) {
                struct zone *zone = pgdat->node_zones + i;

                zone->prev_priority = temp_priority[i];
        }
        if (!all_zones_ok) {
                cond_resched();

                try_to_freeze();

                goto loop_again;
        }

        return nr_reclaimed;
}

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up.  This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
static int kswapd(void *p)
{
        unsigned long order;
        pg_data_t *pgdat = (pg_data_t *)p;
        struct task_struct *tsk = current;
        DEFINE_WAIT(wait);
        struct reclaim_state reclaim_state = {
                .reclaimed_slab = 0,
        };
        cpumask_t cpumask;

        cpumask = node_to_cpumask(pgdat->node_id);
        if (!cpus_empty(cpumask))
                set_cpus_allowed(tsk, cpumask);
        current->reclaim_state = &reclaim_state;

        /*
         * Tell the memory management that we're a "memory allocator",
         * and that if we need more memory we should get access to it
         * regardless (see "__alloc_pages()"). "kswapd" should
         * never get caught in the normal page freeing logic.
         *
         * (Kswapd normally doesn't need memory anyway, but sometimes
         * you need a small amount of memory in order to be able to
         * page out something else, and this flag essentially protects
         * us from recursively trying to free more memory as we're
         * trying to free the first piece of memory in the first place).
         */
        tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;

        order = 0;
        for ( ; ; ) {
                unsigned long new_order;

                prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
                new_order = pgdat->kswapd_max_order;
                pgdat->kswapd_max_order = 0;
                if (order < new_order) {
                        /*
                         * Don't sleep if someone wants a larger 'order'
                         * allocation
                         */
                        order = new_order;
                } else {
                        if (!freezing(current))
                                schedule();

                        order = pgdat->kswapd_max_order;
                }
                finish_wait(&pgdat->kswapd_wait, &wait);

                if (!try_to_freeze()) {
                        /* We can speed up thawing tasks if we don't call
                         * balance_pgdat after returning from the refrigerator
                         */
                        balance_pgdat(pgdat, order);
                }
        }
        return 0;
}

/*
 * A zone is low on free memory, so wake its kswapd task to service it.
 */
void wakeup_kswapd(struct zone *zone, int order)
{
        pg_data_t *pgdat;

        if (!populated_zone(zone))
                return;

        pgdat = zone->zone_pgdat;
        if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
                return;
        if (pgdat->kswapd_max_order < order)
                pgdat->kswapd_max_order = order;
        if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                return;
        if (!waitqueue_active(&pgdat->kswapd_wait))
                return;
        wake_up_interruptible(&pgdat->kswapd_wait);
}

#ifdef CONFIG_PM
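/*
 * The CONFIG_PM block below provides shrink_all_memory(), used when memory is
 * freed for suspend.  In that context sc.swap_cluster_max is set to the whole
 * request, so SWAP_CLUSTER_MAX batching is effectively ignored (see the
 * comment in struct scan_control above).
 */
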
/*
 * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
 * from LRU lists system-wide, for given pass and priority, and returns the
 * number of reclaimed pages
 *
 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
 */
static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
                                      int pass, struct scan_control *sc)
{
        struct zone *zone;
        unsigned long nr_to_scan, ret = 0;

        for_each_zone(zone) {

                if (!populated_zone(zone))
                        continue;

                if (zone->all_unreclaimable && prio != DEF_PRIORITY)
                        continue;

                /* For pass = 0 we don't shrink the active list */
                if (pass > 0) {
                        zone->nr_scan_active +=
                                (zone_page_state(zone, NR_ACTIVE) >> prio) + 1;
                        if (zone->nr_scan_active >= nr_pages || pass > 3) {
                                zone->nr_scan_active = 0;
                                nr_to_scan = min(nr_pages,
                                        zone_page_state(zone, NR_ACTIVE));
                                shrink_active_list(nr_to_scan, zone, sc, prio);
                        }
                }

                zone->nr_scan_inactive +=
                        (zone_page_state(zone, NR_INACTIVE) >> prio) + 1;
                if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
                        zone->nr_scan_inactive = 0;
                        nr_to_scan = min(nr_pages,
                                zone_page_state(zone, NR_INACTIVE));
                        ret += shrink_inactive_list(nr_to_scan, zone, sc);
                        if (ret >= nr_pages)
                                return ret;
                }
        }

        return ret;
}

static unsigned long count_lru_pages(void)
{
        return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE);
}

/*
 * Try to free `nr_pages' of memory, system-wide, and return the number of
 * freed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * LRU order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */
unsigned long shrink_all_memory(unsigned long nr_pages)
{
        unsigned long lru_pages, nr_slab;
        unsigned long ret = 0;
        int pass;
        struct reclaim_state reclaim_state;
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
                .may_swap = 0,
                .swap_cluster_max = nr_pages,
                .may_writepage = 1,
                .swappiness = vm_swappiness,
        };

        current->reclaim_state = &reclaim_state;

        lru_pages = count_lru_pages();
        nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
        /* If slab caches are huge, it's better to hit them first */
        while (nr_slab >= lru_pages) {
                reclaim_state.reclaimed_slab = 0;
                shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
                if (!reclaim_state.reclaimed_slab)
                        break;

                ret += reclaim_state.reclaimed_slab;
                if (ret >= nr_pages)
                        goto out;

                nr_slab -= reclaim_state.reclaimed_slab;
        }

        /*
         * We try to shrink LRUs in 5 passes:
         * 0 = Reclaim from inactive_list only
         * 1 = Reclaim from active list but don't reclaim mapped
         * 2 = 2nd pass of type 1
         * 3 = Reclaim mapped (normal reclaim)
         * 4 = 2nd pass of type 3
         */
        for (pass = 0; pass < 5; pass++) {
                int prio;

                /* Force reclaiming mapped pages in the passes #3 and #4 */
                if (pass > 2) {
                        sc.may_swap = 1;
                        sc.swappiness = 100;
                }

                for (prio = DEF_PRIORITY; prio >= 0; prio--) {
                        unsigned long nr_to_scan = nr_pages - ret;

                        sc.nr_scanned = 0;
                        ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
                        if (ret >= nr_pages)
                                goto out;

                        reclaim_state.reclaimed_slab = 0;
                        shrink_slab(sc.nr_scanned, sc.gfp_mask,
                                        count_lru_pages());
                        ret += reclaim_state.reclaimed_slab;
                        if (ret >= nr_pages)
                                goto out;

                        if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
                                congestion_wait(WRITE, HZ / 10);
                }
        }

        /*
         * If ret = 0, we could not shrink LRUs, but there may be something
         * in slab caches
         */
        if (!ret) {
                do {
                        reclaim_state.reclaimed_slab = 0;
                        shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
                        ret += reclaim_state.reclaimed_slab;
                } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
        }

out:
        current->reclaim_state = NULL;

        return ret;
}
#endif

/* It's optimal to keep kswapds on the same CPUs as their memory, but
   not required for correctness.  So if the last cpu in a node goes
   away, we get changed to run anywhere: as the first one comes back,
   restore their cpu bindings. */
static int __devinit cpu_callback(struct notifier_block *nfb,
                                  unsigned long action, void *hcpu)
{
        pg_data_t *pgdat;
        cpumask_t mask;

        if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
                for_each_online_pgdat(pgdat) {
                        mask = node_to_cpumask(pgdat->node_id);
                        if (any_online_cpu(mask) != NR_CPUS)
                                /* One of our CPUs online: restore mask */
                                set_cpus_allowed(pgdat->kswapd, mask);
                }
        }
        return NOTIFY_OK;
}

/*
 * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will be moved to the proper cpus if cpus are
 * hot-added.
 */
int kswapd_run(int nid)
{
        pg_data_t *pgdat = NODE_DATA(nid);
        int ret = 0;

        if (pgdat->kswapd)
                return 0;

        pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
        if (IS_ERR(pgdat->kswapd)) {
                /* failure at boot is fatal */
                BUG_ON(system_state == SYSTEM_BOOTING);
                printk("Failed to start kswapd on node %d\n", nid);
                ret = -1;
        }
        return ret;
}

static int __init kswapd_init(void)
{
        int nid;

        swap_setup();
        for_each_online_node(nid)
                kswapd_run(nid);
        hotcpu_notifier(cpu_callback, 0);
        return 0;
}

module_init(kswapd_init)

#ifdef CONFIG_NUMA
/*
 * Zone reclaim mode
 *
 * If non-zero call zone_reclaim when the number of free pages falls below
 * the watermarks.
 */
int zone_reclaim_mode __read_mostly;

#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0)     /* Run shrink_cache on the zone */
#define RECLAIM_WRITE (1<<1)    /* Writeout pages during reclaim */
#define RECLAIM_SWAP (1<<2)     /* Swap pages out during reclaim */

/*
 * Priority for ZONE_RECLAIM. This determines the fraction of pages
 * of a node considered for each zone_reclaim. 4 scans 1/16th of
 * a zone.
 */
#define ZONE_RECLAIM_PRIORITY 4

/*
 * Percentage of pages in a zone that must be unmapped for zone_reclaim to
 * occur.
 */
int sysctl_min_unmapped_ratio = 1;

/*
 * If the number of slab pages in a zone grows beyond this percentage then
 * slab reclaim needs to occur.
 */
int sysctl_min_slab_ratio = 5;

/*
 * Try to free up some pages from this zone through reclaim.
 */
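/*
 * For a sense of scale (illustrative only): with the default
 * sysctl_min_unmapped_ratio of 1 and sysctl_min_slab_ratio of 5, zone_reclaim()
 * below only kicks in once more than 1% of a zone's pages are unmapped
 * file-backed pagecache or more than 5% of the zone is reclaimable slab.
 */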
static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
        /* Minimum pages needed in order to stay on node */
        const unsigned long nr_pages = 1 << order;
        struct task_struct *p = current;
        struct reclaim_state reclaim_state;
        int priority;
        unsigned long nr_reclaimed = 0;
        struct scan_control sc = {
                .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
                .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
                .swap_cluster_max = max_t(unsigned long, nr_pages,
                                        SWAP_CLUSTER_MAX),
                .gfp_mask = gfp_mask,
                .swappiness = vm_swappiness,
        };
        unsigned long slab_reclaimable;

        disable_swap_token();
        cond_resched();
        /*
         * We need to be able to allocate from the reserves for RECLAIM_SWAP
         * and we also need to be able to write out pages for RECLAIM_WRITE
         * and RECLAIM_SWAP.
         */
        p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;

        if (zone_page_state(zone, NR_FILE_PAGES) -
                zone_page_state(zone, NR_FILE_MAPPED) >
                zone->min_unmapped_pages) {
                /*
                 * Free memory by calling shrink zone with increasing
                 * priorities until we have enough memory freed.
                 */
                priority = ZONE_RECLAIM_PRIORITY;
                do {
                        note_zone_scanning_priority(zone, priority);
                        nr_reclaimed += shrink_zone(priority, zone, &sc);
                        priority--;
                } while (priority >= 0 && nr_reclaimed < nr_pages);
        }

        slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
        if (slab_reclaimable > zone->min_slab_pages) {
                /*
                 * shrink_slab() does not currently allow us to determine how
                 * many pages were freed in this zone. So we take the current
                 * number of slab pages and shake the slab until it is reduced
                 * by the same nr_pages that we used for reclaiming unmapped
                 * pages.
                 *
                 * Note that shrink_slab will free memory on all zones and may
                 * take a long time.
                 */
                while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
                        zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
                                slab_reclaimable - nr_pages)
                        ;

                /*
                 * Update nr_reclaimed by the number of slab pages we
                 * reclaimed from this zone.
                 */
                nr_reclaimed += slab_reclaimable -
                        zone_page_state(zone, NR_SLAB_RECLAIMABLE);
        }

        p->reclaim_state = NULL;
        current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
        return nr_reclaimed >= nr_pages;
}

int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
        cpumask_t mask;
        int node_id;

        /*
         * Zone reclaim reclaims unmapped file backed pages and
         * slab pages if we are over the defined limits.
         *
         * A small portion of unmapped file backed pages is needed for
         * file I/O otherwise pages read by file I/O will be immediately
         * thrown out if the zone is overallocated. So we do not reclaim
         * if less than a specified percentage of the zone is used by
         * unmapped file backed pages.
         */
        if (zone_page_state(zone, NR_FILE_PAGES) -
            zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
            && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
                        <= zone->min_slab_pages)
                return 0;

        /*
         * Avoid concurrent zone reclaims, do not reclaim in a zone that does
         * not have reclaimable pages and if we should not delay the allocation
         * then do not scan.
         */
        if (!(gfp_mask & __GFP_WAIT) ||
                zone->all_unreclaimable ||
                atomic_read(&zone->reclaim_in_progress) > 0 ||
                (current->flags & PF_MEMALLOC))
                return 0;

        /*
         * Only run zone reclaim on the local zone or on zones that do not
         * have associated processors. This will favor the local processor
         * over remote processors and spread off node memory allocations
         * as wide as possible.
         */
        node_id = zone_to_nid(zone);
        mask = node_to_cpumask(node_id);
        if (!cpus_empty(mask) && node_id != numa_node_id())
                return 0;
        return __zone_reclaim(zone, gfp_mask, order);
}
#endif