/*
 * linux/mm/swap.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs. But it gets used by networking.
 */
static void __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);
		struct lruvec *lruvec;
		unsigned long flags;

		spin_lock_irqsave(&zone->lru_lock, flags);
		lruvec = mem_cgroup_page_lruvec(page, zone);
		VM_BUG_ON_PAGE(!PageLRU(page), page);
		__ClearPageLRU(page);
		del_page_from_lru_list(page, lruvec, page_off_lru(page));
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}
	mem_cgroup_uncharge(page);
}

static void __put_single_page(struct page *page)
{
	__page_cache_release(page);
	free_hot_cold_page(page, false);
}

static void __put_compound_page(struct page *page)
{
	compound_page_dtor *dtor;

	/*
	 * __page_cache_release() is supposed to be called for thp, not for
	 * hugetlb. This is because a hugetlb page never has PageLRU set
	 * (it is never put on any LRU list) and no memcg routines should
	 * be called for hugetlb (it has a separate hugetlb_cgroup.)
	 */
	if (!PageHuge(page))
		__page_cache_release(page);
	dtor = get_compound_page_dtor(page);
	(*dtor)(page);
}
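/*
 * Illustrative sketch (editor's addition, not part of this file): the
 * destructor called above is whatever the allocating side installed with
 * set_compound_page_dtor(). For ordinary __GFP_COMP allocations that is
 * free_compound_page(); hugetlb overrides it, conceptually like this:
 *
 *	struct page *page = alloc_pages(GFP_KERNEL | __GFP_COMP, order);
 *
 *	if (page)
 *		set_compound_page_dtor(page, free_huge_page);
 *
 * which is why __put_compound_page() skips the LRU/memcg work for
 * PageHuge() pages and just invokes the hugetlb destructor. "order" and
 * the destructor choice are simplified stand-ins for what mm/hugetlb.c
 * actually does.
 */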

/**
 * Two special cases here: we could avoid taking compound_lock_irqsave
 * and could skip the tail refcounting (in _mapcount).
 *
 * 1. Hugetlbfs page:
 *
 *    PageHeadHuge will remain true until the compound page
 *    is released and enters the buddy allocator, and it could
 *    not be split by __split_huge_page_refcount().
 *
 *    So if we see PageHeadHuge set, and we have the tail page pin,
 *    then we could safely put head page.
 *
 * 2. Slab THP page:
 *
 *    PG_slab is cleared before the slab frees the head page, and
 *    tail pin cannot be the last reference left on the head page,
 *    because the slab code is free to reuse the compound page
 *    after a kfree/kmem_cache_free without having to check if
 *    there's any tail pin left. In turn all tail pins must always be
 *    released while the head is still pinned by the slab code
 *    and so we know PG_slab will be still set too.
 *
 *    So if we see PageSlab set, and we have the tail page pin,
 *    then we could safely put head page.
 */
static __always_inline
void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
{
	/*
	 * If @page is a THP tail, we must read the tail page
	 * flags after the head page flags. The
	 * __split_huge_page_refcount side enforces write memory barriers
	 * between clearing PageTail and before the head page
	 * can be freed and reallocated.
	 */
	smp_rmb();
	if (likely(PageTail(page))) {
		/*
		 * __split_huge_page_refcount cannot race
		 * here, see the comment above this function.
		 */
		VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
		VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
		if (put_page_testzero(page_head)) {
			/*
			 * If this is the tail of a slab THP page,
			 * the tail pin must not be the last reference
			 * held on the page, because PG_slab cannot
			 * be cleared before all tail pins (which skip
			 * the _mapcount tail refcounting) have been
			 * released.
			 *
			 * If this is the tail of a hugetlbfs page,
			 * the tail pin may be the last reference on
			 * the page instead, because PageHeadHuge will
			 * not go away until the compound page enters
			 * the buddy allocator.
			 */
			VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
			__put_compound_page(page_head);
		}
	} else
		/*
		 * __split_huge_page_refcount run before us,
		 * @page was a THP tail. The split @page_head
		 * has been freed and reallocated as slab or
		 * hugetlbfs page of smaller order (only
		 * possible if reallocated as slab on x86).
		 */
		if (put_page_testzero(page))
			__put_single_page(page);
}

static __always_inline
void put_refcounted_compound_page(struct page *page_head, struct page *page)
{
	if (likely(page != page_head && get_page_unless_zero(page_head))) {
		unsigned long flags;

		/*
		 * @page_head wasn't a dangling pointer but it may not
		 * be a head page anymore by the time we obtain the
		 * lock. That is ok as long as it can't be freed from
		 * under us.
		 */
		flags = compound_lock_irqsave(page_head);
		if (unlikely(!PageTail(page))) {
			/* __split_huge_page_refcount run before us */
			compound_unlock_irqrestore(page_head, flags);
			if (put_page_testzero(page_head)) {
				/*
				 * The @page_head may have been freed
				 * and reallocated as a compound page
				 * of smaller order and then freed
				 * again. All we know is that it
				 * cannot have become: a THP page, a
				 * compound page of higher order, a
				 * tail page. That is because we
				 * still hold the refcount of the
				 * split THP tail and page_head was
				 * the THP head before the split.
				 */
				if (PageHead(page_head))
					__put_compound_page(page_head);
				else
					__put_single_page(page_head);
			}
out_put_single:
			if (put_page_testzero(page))
				__put_single_page(page);
			return;
		}
		VM_BUG_ON_PAGE(page_head != page->first_page, page);
		/*
		 * We can release the refcount taken by
		 * get_page_unless_zero() now that
		 * __split_huge_page_refcount() is blocked on the
		 * compound_lock.
		 */
		if (put_page_testzero(page_head))
			VM_BUG_ON_PAGE(1, page_head);
		/* __split_huge_page_refcount will wait now */
		VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
		atomic_dec(&page->_mapcount);
		VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
		VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
		compound_unlock_irqrestore(page_head, flags);

		if (put_page_testzero(page_head)) {
			if (PageHead(page_head))
				__put_compound_page(page_head);
			else
				__put_single_page(page_head);
		}
	} else {
		/* @page_head is a dangling pointer */
		VM_BUG_ON_PAGE(PageTail(page), page);
		goto out_put_single;
	}
}

static void put_compound_page(struct page *page)
{
	struct page *page_head;

	/*
	 * We see the PageCompound set and PageTail not set, so @page may be:
	 * 1. hugetlbfs head page, or
	 * 2. THP head page.
	 */
	if (likely(!PageTail(page))) {
		if (put_page_testzero(page)) {
			/*
			 * By the time all refcounts have been released
			 * split_huge_page cannot run anymore from under us.
			 */
			if (PageHead(page))
				__put_compound_page(page);
			else
				__put_single_page(page);
		}
		return;
	}

	/*
	 * We see the PageCompound set and PageTail set, so @page may be:
	 * 1. a tail hugetlbfs page, or
	 * 2. a tail THP page, or
	 * 3. a split THP page.
	 *
	 * Case 3 is possible, as we may race with
	 * __split_huge_page_refcount tearing down a THP page.
	 */
	page_head = compound_head_by_tail(page);
	if (!__compound_tail_refcounted(page_head))
		put_unrefcounted_compound_page(page_head, page);
	else
		put_refcounted_compound_page(page_head, page);
}

void put_page(struct page *page)
{
	if (unlikely(PageCompound(page)))
		put_compound_page(page);
	else if (put_page_testzero(page))
		__put_single_page(page);
}
EXPORT_SYMBOL(put_page);
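/*
 * Illustrative sketch (editor's addition, not part of this file): put_page()
 * pairs with any reference-taking primitive such as get_page() or
 * get_user_pages(). A hypothetical caller that temporarily needs to keep a
 * page alive would do:
 *
 *	get_page(page);		take an extra reference
 *	...			use the page while it may otherwise go away
 *	put_page(page);		drop it; the page is only freed here if
 *				this was the last reference
 *
 * For compound pages the call lands in put_compound_page() above, which
 * handles THP tails and hugetlbfs pages specially.
 */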

/*
 * This function is exported but must not be called by anything other
 * than get_page(). It implements the slow path of get_page().
 */
bool __get_page_tail(struct page *page)
{
	/*
	 * This takes care of get_page() if run on a tail page
	 * returned by one of the get_user_pages/follow_page variants.
	 * get_user_pages/follow_page itself doesn't need the compound
	 * lock because it runs __get_page_tail_foll() under the
	 * proper PT lock that already serializes against
	 * split_huge_page().
	 */
	unsigned long flags;
	bool got;
	struct page *page_head = compound_head(page);

	/* See the put_compound_page() comment. */
	if (!__compound_tail_refcounted(page_head)) {
		smp_rmb();
		if (likely(PageTail(page))) {
			/*
			 * This is a hugetlbfs page or a slab
			 * page. __split_huge_page_refcount
			 * cannot race here.
			 */
			VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
			__get_page_tail_foll(page, true);
			return true;
		} else {
			/*
			 * __split_huge_page_refcount run
			 * before us, "page" was a THP
			 * tail. The split page_head has been
			 * freed and reallocated as slab or
			 * hugetlbfs page of smaller order
			 * (only possible if reallocated as
			 * slab on x86).
			 */
			return false;
		}
	}

	got = false;
	if (likely(page != page_head && get_page_unless_zero(page_head))) {
		/*
		 * page_head wasn't a dangling pointer but it
		 * may not be a head page anymore by the time
		 * we obtain the lock. That is ok as long as it
		 * can't be freed from under us.
		 */
		flags = compound_lock_irqsave(page_head);
		/* here __split_huge_page_refcount won't run anymore */
		if (likely(PageTail(page))) {
			__get_page_tail_foll(page, false);
			got = true;
		}
		compound_unlock_irqrestore(page_head, flags);
		if (unlikely(!got))
			put_page(page_head);
	}
	return got;
}
EXPORT_SYMBOL(__get_page_tail);

/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page->lru. Currently
 * used by read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct page *victim;

		victim = list_entry(pages->prev, struct page, lru);
		list_del(&victim->lru);
		page_cache_release(victim);
	}
}
EXPORT_SYMBOL(put_pages_list);
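/*
 * Illustrative sketch (editor's addition, not part of this file): callers
 * such as readahead collect pages on a private list threaded through
 * page->lru and hand any leftovers back in one go. A hypothetical user:
 *
 *	LIST_HEAD(page_pool);
 *	int i;
 *
 *	for (i = 0; i < nr; i++) {
 *		struct page *page = page_cache_alloc_cold(mapping);
 *
 *		if (!page)
 *			break;
 *		list_add(&page->lru, &page_pool);
 *	}
 *	... try to insert the pages into the page cache ...
 *	put_pages_list(&page_pool);	drops one reference per remaining
 *					page and empties the list
 *
 * "nr" and "mapping" are stand-ins for the caller's batch size and
 * address_space.
 */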

/*
 * get_kernel_pages() - pin kernel pages in memory
 * @kiov:	An array of struct kvec structures
 * @nr_segs:	number of segments to pin
 * @write:	pinning for read/write, currently ignored
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_segs long.
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_segs is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
 * with a put_page() call when it is finished with.
 */
int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
		struct page **pages)
{
	int seg;

	for (seg = 0; seg < nr_segs; seg++) {
		if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
			return seg;

		pages[seg] = kmap_to_page(kiov[seg].iov_base);
		page_cache_get(pages[seg]);
	}

	return seg;
}
EXPORT_SYMBOL_GPL(get_kernel_pages);

/*
 * get_kernel_page() - pin a kernel page in memory
 * @start:	starting kernel address
 * @write:	pinning for read/write, currently ignored
 * @pages:	array that receives pointer to the page pinned.
 *		Must be at least 1 entry long.
 *
 * Returns 1 if page is pinned. If the page was not pinned, returns
 * -errno. The page returned must be released with a put_page() call
 * when it is finished with.
 */
int get_kernel_page(unsigned long start, int write, struct page **pages)
{
	const struct kvec kiov = {
		.iov_base = (void *)start,
		.iov_len = PAGE_SIZE
	};

	return get_kernel_pages(&kiov, 1, write, pages);
}
EXPORT_SYMBOL_GPL(get_kernel_page);
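/*
 * Illustrative sketch (editor's addition, not part of this file): a
 * hypothetical caller pinning two page-aligned kernel buffers so they can be
 * passed around as struct page pointers:
 *
 *	struct kvec kiov[2] = {
 *		{ .iov_base = buf0, .iov_len = PAGE_SIZE },
 *		{ .iov_base = buf1, .iov_len = PAGE_SIZE },
 *	};
 *	struct page *pages[2];
 *	int i, nr;
 *
 *	nr = get_kernel_pages(kiov, 2, 0, pages);
 *	... use pages[0..nr-1] ...
 *	for (i = 0; i < nr; i++)
 *		put_page(pages[i]);
 *
 * buf0/buf1 are assumed to be PAGE_SIZE, page-aligned kernel buffers.
 */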

static void pagevec_lru_move_fn(struct pagevec *pvec,
	void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
	void *arg)
{
	int i;
	struct zone *zone = NULL;
	struct lruvec *lruvec;
	unsigned long flags = 0;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irqrestore(&zone->lru_lock, flags);
			zone = pagezone;
			spin_lock_irqsave(&zone->lru_lock, flags);
		}

		lruvec = mem_cgroup_page_lruvec(page, zone);
		(*move_fn)(page, lruvec, arg);
	}
	if (zone)
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	int *pgmoved = arg;

	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		enum lru_list lru = page_lru_base_type(page);
		list_move_tail(&page->lru, &lruvec->lists[lru]);
		(*pgmoved)++;
	}
}

/*
 * pagevec_move_tail() must be called with IRQ disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
	int pgmoved = 0;

	pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
	__count_vm_events(PGROTATED, pgmoved);
}

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim. If it still appears to be reclaimable, move it to the tail of the
 * inactive list.
 */
void rotate_reclaimable_page(struct page *page)
{
	if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
	    !PageUnevictable(page) && PageLRU(page)) {
		struct pagevec *pvec;
		unsigned long flags;

		page_cache_get(page);
		local_irq_save(flags);
		pvec = this_cpu_ptr(&lru_rotate_pvecs);
		if (!pagevec_add(pvec, page))
			pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}
}
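/*
 * Illustrative sketch (editor's addition, not part of this file):
 * pagevec_lru_move_fn() above factors out the zone->lru_lock batching, so a
 * new LRU manipulation only has to supply a per-page callback. A
 * hypothetical "move to the front of its list" operation would look like:
 *
 *	static void move_to_head_fn(struct page *page, struct lruvec *lruvec,
 *				    void *arg)
 *	{
 *		if (PageLRU(page) && !PageUnevictable(page))
 *			list_move(&page->lru, &lruvec->lists[page_lru(page)]);
 *	}
 *
 *	pagevec_lru_move_fn(pvec, move_to_head_fn, NULL);
 *
 * which is exactly the shape of pagevec_move_tail_fn() above and
 * __activate_page() below.
 */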

static void update_page_reclaim_stat(struct lruvec *lruvec,
				     int file, int rotated)
{
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;

	reclaim_stat->recent_scanned[file]++;
	if (rotated)
		reclaim_stat->recent_rotated[file]++;
}

static void __activate_page(struct page *page, struct lruvec *lruvec,
			    void *arg)
{
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		int file = page_is_file_cache(page);
		int lru = page_lru_base_type(page);

		del_page_from_lru_list(page, lruvec, lru);
		SetPageActive(page);
		lru += LRU_ACTIVE;
		add_page_to_lru_list(page, lruvec, lru);
		trace_mm_lru_activate(page);

		__count_vm_event(PGACTIVATE);
		update_page_reclaim_stat(lruvec, file, 1);
	}
}

#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);

static void activate_page_drain(int cpu)
{
	struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);

	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, __activate_page, NULL);
}

static bool need_activate_page_drain(int cpu)
{
	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
}

void activate_page(struct page *page)
{
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);

		page_cache_get(page);
		if (!pagevec_add(pvec, page))
			pagevec_lru_move_fn(pvec, __activate_page, NULL);
		put_cpu_var(activate_page_pvecs);
	}
}

#else
static inline void activate_page_drain(int cpu)
{
}

static bool need_activate_page_drain(int cpu)
{
	return false;
}

void activate_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	__activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
	spin_unlock_irq(&zone->lru_lock);
}
#endif

static void __lru_cache_activate_page(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
	int i;

	/*
	 * Search backwards on the optimistic assumption that the page being
	 * activated has just been added to this pagevec. Note that only
	 * the local pagevec is examined as a !PageLRU page could be in the
	 * process of being released, reclaimed, migrated or on a remote
	 * pagevec that is currently being drained. Furthermore, marking
	 * a remote pagevec's page PageActive potentially hits a race where
	 * a page is marked PageActive just after it is added to the inactive
	 * list causing accounting errors and BUG_ON checks to trigger.
	 */
	for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
		struct page *pagevec_page = pvec->pages[i];

		if (pagevec_page == page) {
			SetPageActive(page);
			break;
		}
	}

	put_cpu_var(lru_add_pvec);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 *
 * When a newly allocated page is not yet visible, so safe for non-atomic ops,
 * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
 */
void mark_page_accessed(struct page *page)
{
	if (!PageActive(page) && !PageUnevictable(page) &&
			PageReferenced(page)) {

		/*
		 * If the page is on the LRU, queue it for activation via
		 * activate_page_pvecs. Otherwise, assume the page is on a
		 * pagevec, mark it active and it'll be moved to the active
		 * LRU on the next drain.
		 */
		if (PageLRU(page))
			activate_page(page);
		else
			__lru_cache_activate_page(page);
		ClearPageReferenced(page);
		if (page_is_file_cache(page))
			workingset_activation(page);
	} else if (!PageReferenced(page)) {
		SetPageReferenced(page);
	}
}
EXPORT_SYMBOL(mark_page_accessed);

static void __lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);

	page_cache_get(page);
	if (!pagevec_space(pvec))
		__pagevec_lru_add(pvec);
	pagevec_add(pvec, page);
	put_cpu_var(lru_add_pvec);
}

/**
 * lru_cache_add_anon - add a page to the page lists
 * @page: the page to add
 */
void lru_cache_add_anon(struct page *page)
{
	if (PageActive(page))
		ClearPageActive(page);
	__lru_cache_add(page);
}

void lru_cache_add_file(struct page *page)
{
	if (PageActive(page))
		ClearPageActive(page);
	__lru_cache_add(page);
}
EXPORT_SYMBOL(lru_cache_add_file);

/**
 * lru_cache_add - add a page to a page list
 * @page: the page to be added to the LRU.
 *
 * Queue the page for addition to the LRU via pagevec. The decision on whether
 * to add the page to the [in]active [file|anon] list is deferred until the
 * pagevec is drained. This gives a chance for the caller of lru_cache_add()
 * to have the page added to the active list using mark_page_accessed().
 */
void lru_cache_add(struct page *page)
{
	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
	VM_BUG_ON_PAGE(PageLRU(page), page);
	__lru_cache_add(page);
}
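/*
 * Illustrative sketch (editor's addition, not part of this file): a typical
 * page-cache insertion path adds the new page via the pagevec and lets a
 * later mark_page_accessed() promote it before the pagevec is drained:
 *
 *	struct page *page = page_cache_alloc_cold(mapping);
 *
 *	if (page && !add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
 *		... read data into the page ...
 *		mark_page_accessed(page);
 *	}
 *
 * add_to_page_cache_lru() ends up in lru_cache_add()/__lru_cache_add() above,
 * and mark_page_accessed() may set PageActive while the page is still sitting
 * in the per-cpu pagevec (see __lru_cache_activate_page()). "mapping" and
 * "index" are assumed to come from the caller.
 */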

/**
 * add_page_to_unevictable_list - add a page to the unevictable list
 * @page:  the page to be added to the unevictable list
 *
 * Add page directly to its zone's unevictable list. To avoid races with
 * tasks that might be making the page evictable, through e.g. munlock,
 * munmap or exit, while it's not on the lru, we want to add the page
 * while it's locked or otherwise "invisible" to other tasks. This is
 * difficult to do when using the pagevec cache, so bypass that.
 */
void add_page_to_unevictable_list(struct page *page)
{
	struct zone *zone = page_zone(page);
	struct lruvec *lruvec;

	spin_lock_irq(&zone->lru_lock);
	lruvec = mem_cgroup_page_lruvec(page, zone);
	ClearPageActive(page);
	SetPageUnevictable(page);
	SetPageLRU(page);
	add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
	spin_unlock_irq(&zone->lru_lock);
}

/**
 * lru_cache_add_active_or_unevictable
 * @page:  the page to be added to LRU
 * @vma:   vma in which page is mapped for determining reclaimability
 *
 * Place @page on the active or unevictable LRU list, depending on its
 * evictability. Note that if the page is not evictable, it goes
 * directly back onto its zone's unevictable list; it does NOT use a
 * per cpu pagevec.
 */
void lru_cache_add_active_or_unevictable(struct page *page,
					 struct vm_area_struct *vma)
{
	VM_BUG_ON_PAGE(PageLRU(page), page);

	if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
		SetPageActive(page);
		lru_cache_add(page);
		return;
	}

	if (!TestSetPageMlocked(page)) {
		/*
		 * We use the irq-unsafe __mod_zone_page_state because this
		 * counter is not modified from interrupt context, and the pte
		 * lock is held (spinlock), which implies preemption disabled.
		 */
		__mod_zone_page_state(page_zone(page), NR_MLOCK,
				    hpage_nr_pages(page));
		count_vm_event(UNEVICTABLE_PGMLOCKED);
	}
	add_page_to_unevictable_list(page);
}
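/*
 * Illustrative sketch (editor's addition, not part of this file): the
 * anonymous fault path in mm/memory.c pairs this with rmap setup roughly as
 * follows (simplified, error handling omitted):
 *
 *	page = alloc_zeroed_user_highpage_movable(vma, address);
 *	...
 *	page_add_new_anon_rmap(page, vma, address);
 *	mem_cgroup_commit_charge(page, memcg, false);
 *	lru_cache_add_active_or_unevictable(page, vma);
 *
 * so a page mapped into a VM_LOCKED vma never takes the pagevec detour and
 * lands straight on the unevictable list.
 */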

/*
 * If the page cannot be invalidated, it is moved to the
 * inactive list to speed up its reclaim. It is moved to the
 * head of the list, rather than the tail, to give the flusher
 * threads some time to write it out, as this is much more
 * effective than the single-page writeout from reclaim.
 *
 * If the page isn't mapped and is dirty/under writeback, it can be
 * reclaimed ASAP with the help of PG_reclaim.
 *
 * 1. active, mapped page -> none
 * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
 * 3. inactive, mapped page -> none
 * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
 * 5. inactive, clean -> inactive, tail
 * 6. Others -> none
 *
 * In case 4 the page is moved to the inactive list's head because the VM
 * expects the flusher threads to write it out, which is much more effective
 * than the single-page writeout from reclaim.
 */
static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
			      void *arg)
{
	int lru, file;
	bool active;

	if (!PageLRU(page))
		return;

	if (PageUnevictable(page))
		return;

	/* Some processes are using the page */
	if (page_mapped(page))
		return;

	active = PageActive(page);
	file = page_is_file_cache(page);
	lru = page_lru_base_type(page);

	del_page_from_lru_list(page, lruvec, lru + active);
	ClearPageActive(page);
	ClearPageReferenced(page);
	add_page_to_lru_list(page, lruvec, lru);

	if (PageWriteback(page) || PageDirty(page)) {
		/*
		 * PG_reclaim could be raced with end_page_writeback
		 * It can make readahead confusing. But race window
		 * is _really_ small and it's non-critical problem.
		 */
		SetPageReclaim(page);
	} else {
		/*
		 * The page's writeback ended while it was on the pagevec,
		 * so move the page to the tail of the inactive list.
		 */
		list_move_tail(&page->lru, &lruvec->lists[lru]);
		__count_vm_event(PGROTATED);
	}

	if (active)
		__count_vm_event(PGDEACTIVATE);
	update_page_reclaim_stat(lruvec, file, 0);
}

/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
void lru_add_drain_cpu(int cpu)
{
	struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);

	if (pagevec_count(pvec))
		__pagevec_lru_add(pvec);

	pvec = &per_cpu(lru_rotate_pvecs, cpu);
	if (pagevec_count(pvec)) {
		unsigned long flags;

		/* No harm done if a racing interrupt already did this */
		local_irq_save(flags);
		pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}

	pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);

	activate_page_drain(cpu);
}

/**
 * deactivate_file_page - forcefully deactivate a file page
 * @page: page to deactivate
 *
 * This function hints the VM that @page is a good reclaim candidate,
 * for example if its invalidation fails due to the page being dirty
 * or under writeback.
 */
void deactivate_file_page(struct page *page)
{
	/*
	 * In a workload with many unevictable pages such as mprotect,
	 * deactivating unevictable pages to accelerate reclaim is pointless.
	 */
	if (PageUnevictable(page))
		return;

	if (likely(get_page_unless_zero(page))) {
		struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);

		if (!pagevec_add(pvec, page))
			pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
		put_cpu_var(lru_deactivate_file_pvecs);
	}
}

void lru_add_drain(void)
{
	lru_add_drain_cpu(get_cpu());
	put_cpu();
}

static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
	lru_add_drain();
}

static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);

void lru_add_drain_all(void)
{
	static DEFINE_MUTEX(lock);
	static struct cpumask has_work;
	int cpu;

	mutex_lock(&lock);
	get_online_cpus();
	cpumask_clear(&has_work);

	for_each_online_cpu(cpu) {
		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);

		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
		    pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
		    need_activate_page_drain(cpu)) {
			INIT_WORK(work, lru_add_drain_per_cpu);
			schedule_work_on(cpu, work);
			cpumask_set_cpu(cpu, &has_work);
		}
	}

	for_each_cpu(cpu, &has_work)
		flush_work(&per_cpu(lru_add_drain_work, cpu));

	put_online_cpus();
	mutex_unlock(&lock);
}
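/*
 * Illustrative sketch (editor's addition, not part of this file):
 * invalidate_mapping_pages() in mm/truncate.c is the main user of
 * deactivate_file_page(); when a page cannot be dropped it is demoted
 * instead so reclaim finds it sooner:
 *
 *	if (!invalidate_inode_page(page))
 *		deactivate_file_page(page);
 *
 * lru_add_drain_all() above is the heavyweight counterpart used by callers
 * (mlock/munlock, compaction, and similar) that need every CPU's pending
 * pagevecs flushed before they scan or isolate LRU pages:
 *
 *	lru_add_drain_all();		flush per-cpu pagevecs everywhere
 *	... walk or isolate LRU pages ...
 */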

/**
 * release_pages - batched page_cache_release()
 * @pages: array of pages to release
 * @nr: number of pages
 * @cold: whether the pages are cache cold
 *
 * Decrement the reference count on all the pages in @pages. If it
 * falls to zero, remove the page from the LRU and free it.
 */
void release_pages(struct page **pages, int nr, bool cold)
{
	int i;
	LIST_HEAD(pages_to_free);
	struct zone *zone = NULL;
	struct lruvec *lruvec;
	unsigned long uninitialized_var(flags);
	unsigned int uninitialized_var(lock_batch);

	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		if (unlikely(PageCompound(page))) {
			if (zone) {
				spin_unlock_irqrestore(&zone->lru_lock, flags);
				zone = NULL;
			}
			put_compound_page(page);
			continue;
		}

		/*
		 * Make sure the IRQ-safe lock-holding time does not get
		 * excessive with a continuous string of pages from the
		 * same zone. The lock is held only if zone != NULL.
		 */
		if (zone && ++lock_batch == SWAP_CLUSTER_MAX) {
			spin_unlock_irqrestore(&zone->lru_lock, flags);
			zone = NULL;
		}

		if (!put_page_testzero(page))
			continue;

		if (PageLRU(page)) {
			struct zone *pagezone = page_zone(page);

			if (pagezone != zone) {
				if (zone)
					spin_unlock_irqrestore(&zone->lru_lock,
								flags);
				lock_batch = 0;
				zone = pagezone;
				spin_lock_irqsave(&zone->lru_lock, flags);
			}

			lruvec = mem_cgroup_page_lruvec(page, zone);
			VM_BUG_ON_PAGE(!PageLRU(page), page);
			__ClearPageLRU(page);
			del_page_from_lru_list(page, lruvec, page_off_lru(page));
		}

		/* Clear Active bit in case of parallel mark_page_accessed */
		__ClearPageActive(page);

		list_add(&page->lru, &pages_to_free);
	}
	if (zone)
		spin_unlock_irqrestore(&zone->lru_lock, flags);

	mem_cgroup_uncharge_list(&pages_to_free);
	free_hot_cold_page_list(&pages_to_free, cold);
}
EXPORT_SYMBOL(release_pages);

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues. That would prevent them from really being freed right now. That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here. __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
	lru_add_drain();
	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
	pagevec_reinit(pvec);
}
EXPORT_SYMBOL(__pagevec_release);
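/*
 * Illustrative sketch (editor's addition, not part of this file): a caller
 * that pinned a batch of pages with get_user_pages_fast() can drop all of
 * the references in one pass instead of looping over put_page():
 *
 *	struct page *pages[NR];
 *	int pinned;
 *
 *	pinned = get_user_pages_fast(addr, NR, 0, pages);
 *	... DMA to or from the pages ...
 *	if (pinned > 0)
 *		release_pages(pages, pinned, false);
 *
 * "NR" and "addr" are stand-ins for the caller's batch size and user
 * address.
 */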

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* used by __split_huge_page_refcount() */
void lru_add_page_tail(struct page *page, struct page *page_tail,
		       struct lruvec *lruvec, struct list_head *list)
{
	const int file = 0;

	VM_BUG_ON_PAGE(!PageHead(page), page);
	VM_BUG_ON_PAGE(PageCompound(page_tail), page);
	VM_BUG_ON_PAGE(PageLRU(page_tail), page);
	VM_BUG_ON(NR_CPUS != 1 &&
		  !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));

	if (!list)
		SetPageLRU(page_tail);

	if (likely(PageLRU(page)))
		list_add_tail(&page_tail->lru, &page->lru);
	else if (list) {
		/* page reclaim is reclaiming a huge page */
		get_page(page_tail);
		list_add_tail(&page_tail->lru, list);
	} else {
		struct list_head *list_head;
		/*
		 * Head page has not yet been counted, as an hpage,
		 * so we must account for each subpage individually.
		 *
		 * Use the standard add function to put page_tail on the list,
		 * but then correct its position so they all end up in order.
		 */
		add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
		list_head = page_tail->lru.prev;
		list_move_tail(&page_tail->lru, list_head);
	}

	if (!PageUnevictable(page))
		update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	int file = page_is_file_cache(page);
	int active = PageActive(page);
	enum lru_list lru = page_lru(page);

	VM_BUG_ON_PAGE(PageLRU(page), page);

	SetPageLRU(page);
	add_page_to_lru_list(page, lruvec, lru);
	update_page_reclaim_stat(lruvec, file, active);
	trace_mm_lru_insertion(page, lru);
}

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them. Reinitialises the caller's pagevec.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
	pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
}
EXPORT_SYMBOL(__pagevec_lru_add);

/**
 * pagevec_lookup_entries - gang pagecache lookup
 * @pvec: Where the resulting entries are placed
 * @mapping: The address_space to search
 * @start: The starting entry index
 * @nr_pages: The maximum number of entries
 * @indices: The cache indices corresponding to the entries in @pvec
 *
 * pagevec_lookup_entries() will search for and return a group of up
 * to @nr_pages pages and shadow entries in the mapping. All
 * entries are placed in @pvec. pagevec_lookup_entries() takes a
 * reference against actual pages in @pvec.
 *
 * The search returns a group of mapping-contiguous entries with
 * ascending indexes. There may be holes in the indices due to
 * not-present entries.
 *
 * pagevec_lookup_entries() returns the number of entries which were
 * found.
 */
unsigned pagevec_lookup_entries(struct pagevec *pvec,
				struct address_space *mapping,
				pgoff_t start, unsigned nr_pages,
				pgoff_t *indices)
{
	pvec->nr = find_get_entries(mapping, start, nr_pages,
				    pvec->pages, indices);
	return pagevec_count(pvec);
}

/**
 * pagevec_remove_exceptionals - pagevec exceptionals pruning
 * @pvec: The pagevec to prune
 *
 * pagevec_lookup_entries() fills both pages and exceptional radix
 * tree entries into the pagevec. This function prunes all
 * exceptionals from @pvec without leaving holes, so that it can be
 * passed on to page-only pagevec operations.
 */
void pagevec_remove_exceptionals(struct pagevec *pvec)
{
	int i, j;

	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		if (!radix_tree_exceptional_entry(page))
			pvec->pages[j++] = page;
	}
	pvec->nr = j;
}
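/*
 * Illustrative sketch (editor's addition, not part of this file): truncation
 * and invalidation in mm/truncate.c iterate the page cache with these two
 * helpers, roughly:
 *
 *	pgoff_t indices[PAGEVEC_SIZE];
 *	struct pagevec pvec;
 *	pgoff_t index = start;
 *
 *	pagevec_init(&pvec, 0);
 *	while (pagevec_lookup_entries(&pvec, mapping, index,
 *				      PAGEVEC_SIZE, indices)) {
 *		... for each slot i, indices[i] gives the offset and
 *		    radix_tree_exceptional_entry(pvec.pages[i]) tells
 *		    shadow entries apart from real pages; remember the
 *		    last index seen ...
 *		pagevec_remove_exceptionals(&pvec);
 *		... page-only operations on what is left ...
 *		pagevec_release(&pvec);
 *		index = last_index + 1;
 *	}
 *
 * "start" and "last_index" are stand-ins for bookkeeping the real callers do
 * while walking each batch.
 */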

/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec: Where the resulting pages are placed
 * @mapping: The address_space to search
 * @start: The starting page index
 * @nr_pages: The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes. There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t start, unsigned nr_pages)
{
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup);

unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag,
					nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_tag);

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
	unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
#ifdef CONFIG_SWAP
	int i;

	for (i = 0; i < MAX_SWAPFILES; i++)
		spin_lock_init(&swapper_spaces[i].tree_lock);
#endif

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more
	 */
}
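/*
 * Illustrative note (editor's addition, not part of this file): the
 * page_cluster value set above sizes the swap readahead window, so with the
 * default of 3 on a machine with 16MB of RAM or more, swapin readahead reads
 * around a faulting entry in groups of up to:
 *
 *	1 << page_cluster	== 8 pages
 *
 * The vm.page-cluster sysctl documented in Documentation/sysctl/vm.txt
 * adjusts the same variable at runtime.
 */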