/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);
		struct lruvec *lruvec;
		unsigned long flags;

		spin_lock_irqsave(&zone->lru_lock, flags);
		lruvec = mem_cgroup_page_lruvec(page, zone);
		VM_BUG_ON(!PageLRU(page));
		__ClearPageLRU(page);
		del_page_from_lru_list(page, lruvec, page_off_lru(page));
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}
}

static void __put_single_page(struct page *page)
{
	__page_cache_release(page);
	free_hot_cold_page(page, 0);
}

static void __put_compound_page(struct page *page)
{
	compound_page_dtor *dtor;

	__page_cache_release(page);
	dtor = get_compound_page_dtor(page);
	(*dtor)(page);
}

static void put_compound_page(struct page *page)
{
	/*
	 * hugetlbfs pages cannot be split from under us.  If this is a
	 * hugetlbfs page, check the refcount on the head page and release
	 * the page if the refcount becomes zero.
	 */
	if (PageHuge(page)) {
		page = compound_head(page);
		if (put_page_testzero(page))
			__put_compound_page(page);

		return;
	}

	if (unlikely(PageTail(page))) {
		/* __split_huge_page_refcount can run under us */
		struct page *page_head = compound_trans_head(page);

		if (likely(page != page_head &&
			   get_page_unless_zero(page_head))) {
			unsigned long flags;

			/*
			 * THP cannot break up slab pages, so avoid taking
			 * compound_lock().  Slab performs non-atomic bit ops
			 * on page->flags for better performance.  In
			 * particular, slab_unlock() in slub used to be a hot
			 * path.  It is still hot on arches that do not
			 * support this_cpu_cmpxchg_double().
			 */
			if (PageSlab(page_head)) {
				if (PageTail(page)) {
					if (put_page_testzero(page_head))
						VM_BUG_ON(1);

					atomic_dec(&page->_mapcount);
					goto skip_lock_tail;
				} else
					goto skip_lock;
			}
			/*
			 * page_head wasn't a dangling pointer but it
			 * may not be a head page anymore by the time
			 * we obtain the lock.  That is ok as long as it
			 * can't be freed from under us.
			 */
			flags = compound_lock_irqsave(page_head);
			if (unlikely(!PageTail(page))) {
				/* __split_huge_page_refcount run before us */
				compound_unlock_irqrestore(page_head, flags);
skip_lock:
				if (put_page_testzero(page_head))
					__put_single_page(page_head);
out_put_single:
				if (put_page_testzero(page))
					__put_single_page(page);
				return;
			}
			VM_BUG_ON(page_head != page->first_page);
			/*
			 * We can release the refcount taken by
			 * get_page_unless_zero() now that
			 * __split_huge_page_refcount() is blocked on
			 * the compound_lock.
			 */
			if (put_page_testzero(page_head))
				VM_BUG_ON(1);
			/* __split_huge_page_refcount will wait now */
			VM_BUG_ON(page_mapcount(page) <= 0);
			atomic_dec(&page->_mapcount);
			VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
			VM_BUG_ON(atomic_read(&page->_count) != 0);
			compound_unlock_irqrestore(page_head, flags);

skip_lock_tail:
			if (put_page_testzero(page_head)) {
				if (PageHead(page_head))
					__put_compound_page(page_head);
				else
					__put_single_page(page_head);
			}
		} else {
			/* page_head is a dangling pointer */
			VM_BUG_ON(PageTail(page));
			goto out_put_single;
		}
	} else if (put_page_testzero(page)) {
		if (PageHead(page))
			__put_compound_page(page);
		else
			__put_single_page(page);
	}
}

void put_page(struct page *page)
{
	if (unlikely(PageCompound(page)))
		put_compound_page(page);
	else if (put_page_testzero(page))
		__put_single_page(page);
}
EXPORT_SYMBOL(put_page);

/*
 * This function is exported but must not be called by anything other
 * than get_page().  It implements the slow path of get_page().
 */
bool __get_page_tail(struct page *page)
{
	/*
	 * This takes care of get_page() if run on a tail page
	 * returned by one of the get_user_pages/follow_page variants.
	 * get_user_pages/follow_page itself doesn't need the compound
	 * lock because it runs __get_page_tail_foll() under the
	 * proper PT lock that already serializes against
	 * split_huge_page().
	 */
	bool got = false;
	struct page *page_head;

	/*
	 * If this is a hugetlbfs page it cannot be split under us.  Simply
	 * increment the refcount of the head page.
	 */
	if (PageHuge(page)) {
		page_head = compound_head(page);
		atomic_inc(&page_head->_count);
		got = true;
	} else {
		unsigned long flags;

		page_head = compound_trans_head(page);
		if (likely(page != page_head &&
			   get_page_unless_zero(page_head))) {

			/* See the comment in put_compound_page(). */
			if (PageSlab(page_head)) {
				if (likely(PageTail(page))) {
					__get_page_tail_foll(page, false);
					return true;
				} else {
					put_page(page_head);
					return false;
				}
			}

			/*
			 * page_head wasn't a dangling pointer but it
			 * may not be a head page anymore by the time
			 * we obtain the lock.  That is ok as long as it
			 * can't be freed from under us.
			 */
			flags = compound_lock_irqsave(page_head);
			/* here __split_huge_page_refcount won't run anymore */
			if (likely(PageTail(page))) {
				__get_page_tail_foll(page, false);
				got = true;
			}
			compound_unlock_irqrestore(page_head, flags);
			if (unlikely(!got))
				put_page(page_head);
		}
	}
	return got;
}
EXPORT_SYMBOL(__get_page_tail);

/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page->lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct page *victim;

		victim = list_entry(pages->prev, struct page, lru);
		list_del(&victim->lru);
		page_cache_release(victim);
	}
}
EXPORT_SYMBOL(put_pages_list);

/*
 * get_kernel_pages() - pin kernel pages in memory
 * @kiov:	An array of struct kvec structures
 * @nr_segs:	number of segments to pin
 * @write:	pinning for read/write, currently ignored
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_segs long.
 *
 * Returns number of pages pinned.  This may be fewer than the number
 * requested.  If nr_segs is 0 or negative, returns 0.  If no pages were
 * pinned, returns -errno.  Each page returned must be released with a
 * put_page() call when it is finished with.
 */
int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
		struct page **pages)
{
	int seg;

	for (seg = 0; seg < nr_segs; seg++) {
		if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
			return seg;

		pages[seg] = kmap_to_page(kiov[seg].iov_base);
		page_cache_get(pages[seg]);
	}

	return seg;
}
EXPORT_SYMBOL_GPL(get_kernel_pages);

/*
 * get_kernel_page() - pin a kernel page in memory
 * @start:	starting kernel address
 * @write:	pinning for read/write, currently ignored
 * @pages:	array that receives a pointer to the page pinned.
 *		Must be at least one element long.
 *
 * Returns 1 if the page is pinned.  If the page was not pinned, returns
 * -errno.  The page returned must be released with a put_page() call
 * when it is finished with.
 */
int get_kernel_page(unsigned long start, int write, struct page **pages)
{
	const struct kvec kiov = {
		.iov_base = (void *)start,
		.iov_len = PAGE_SIZE
	};

	return get_kernel_pages(&kiov, 1, write, pages);
}
EXPORT_SYMBOL_GPL(get_kernel_page);
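
/*
 * Illustrative sketch (hypothetical helper, not part of the original file):
 * how a caller might pin the page backing a page-aligned kernel buffer with
 * get_kernel_page() above and drop the reference again with put_page().
 * The helper name and the 0 "write" argument are assumptions made for this
 * example only.
 */
static int __maybe_unused example_pin_kernel_buffer(void *buf)
{
	struct page *page;
	int ret;

	/* buf is assumed to be page aligned and to span exactly one page */
	ret = get_kernel_page((unsigned long)buf, 0, &page);
	if (ret < 1)
		return ret < 0 ? ret : -EFAULT;

	/* ... use page, e.g. hand it to an I/O path ... */

	put_page(page);		/* release the reference taken above */
	return 0;
}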

static void pagevec_lru_move_fn(struct pagevec *pvec,
	void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
	void *arg)
{
	int i;
	struct zone *zone = NULL;
	struct lruvec *lruvec;
	unsigned long flags = 0;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irqrestore(&zone->lru_lock, flags);
			zone = pagezone;
			spin_lock_irqsave(&zone->lru_lock, flags);
		}

		lruvec = mem_cgroup_page_lruvec(page, zone);
		(*move_fn)(page, lruvec, arg);
	}
	if (zone)
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	int *pgmoved = arg;

	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		enum lru_list lru = page_lru_base_type(page);
		list_move_tail(&page->lru, &lruvec->lists[lru]);
		(*pgmoved)++;
	}
}

/*
 * pagevec_move_tail() must be called with IRQ disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
	int pgmoved = 0;

	pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
	__count_vm_events(PGROTATED, pgmoved);
}

/*
 * Writeback is about to end against a page which has been marked for
 * immediate reclaim.  If it still appears to be reclaimable, move it to the
 * tail of the inactive list.
 */
void rotate_reclaimable_page(struct page *page)
{
	if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
	    !PageUnevictable(page) && PageLRU(page)) {
		struct pagevec *pvec;
		unsigned long flags;

		page_cache_get(page);
		local_irq_save(flags);
		pvec = &__get_cpu_var(lru_rotate_pvecs);
		if (!pagevec_add(pvec, page))
			pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}
}

static void update_page_reclaim_stat(struct lruvec *lruvec,
				     int file, int rotated)
{
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;

	reclaim_stat->recent_scanned[file]++;
	if (rotated)
		reclaim_stat->recent_rotated[file]++;
}

static void __activate_page(struct page *page, struct lruvec *lruvec,
			    void *arg)
{
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		int file = page_is_file_cache(page);
		int lru = page_lru_base_type(page);

		del_page_from_lru_list(page, lruvec, lru);
		SetPageActive(page);
		lru += LRU_ACTIVE;
		add_page_to_lru_list(page, lruvec, lru);
		trace_mm_lru_activate(page, page_to_pfn(page));

		__count_vm_event(PGACTIVATE);
		update_page_reclaim_stat(lruvec, file, 1);
	}
}

#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);

static void activate_page_drain(int cpu)
{
	struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);

	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, __activate_page, NULL);
}

static bool need_activate_page_drain(int cpu)
{
	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
}

void activate_page(struct page *page)
{
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);

		page_cache_get(page);
		if (!pagevec_add(pvec, page))
			pagevec_lru_move_fn(pvec, __activate_page, NULL);
		put_cpu_var(activate_page_pvecs);
	}
}

#else
static inline void activate_page_drain(int cpu)
{
}

static bool need_activate_page_drain(int cpu)
{
	return false;
}

void activate_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	__activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
	spin_unlock_irq(&zone->lru_lock);
}
#endif

static void __lru_cache_activate_page(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
	int i;

	/*
	 * Search backwards on the optimistic assumption that the page being
	 * activated has just been added to this pagevec.  Note that only
	 * the local pagevec is examined as a !PageLRU page could be in the
	 * process of being released, reclaimed, migrated or on a remote
	 * pagevec that is currently being drained.  Furthermore, marking
	 * a remote pagevec's page PageActive potentially hits a race where
	 * a page is marked PageActive just after it is added to the inactive
	 * list causing accounting errors and BUG_ON checks to trigger.
	 */
	for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
		struct page *pagevec_page = pvec->pages[i];

		if (pagevec_page == page) {
			SetPageActive(page);
			break;
		}
	}

	put_cpu_var(lru_add_pvec);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 */
void mark_page_accessed(struct page *page)
{
	if (!PageActive(page) && !PageUnevictable(page) &&
			PageReferenced(page)) {

		/*
		 * If the page is on the LRU, queue it for activation via
		 * activate_page_pvecs.  Otherwise, assume the page is on a
		 * pagevec, mark it active and it'll be moved to the active
		 * LRU on the next drain.
		 */
		if (PageLRU(page))
			activate_page(page);
		else
			__lru_cache_activate_page(page);
		ClearPageReferenced(page);
	} else if (!PageReferenced(page)) {
		SetPageReferenced(page);
	}
}
EXPORT_SYMBOL(mark_page_accessed);
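
/*
 * Illustrative sketch (hypothetical helper, not part of the original file):
 * how a read-side path typically pairs a page cache lookup with
 * mark_page_accessed(), so that repeated lookups walk the page through the
 * referenced/active transitions documented above.  The helper name is an
 * assumption made for this example only.
 */
static void __maybe_unused example_touch_cached_page(struct address_space *mapping,
						     pgoff_t index)
{
	struct page *page = find_get_page(mapping, index);

	if (page) {
		mark_page_accessed(page);	/* record the access */
		page_cache_release(page);	/* drop find_get_page()'s ref */
	}
}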

/*
 * Queue the page for addition to the LRU via pagevec.  The decision on
 * whether to add the page to the [in]active [file|anon] list is deferred
 * until the pagevec is drained.  This gives a chance for the caller of
 * __lru_cache_add() to have the page added to the active list using
 * mark_page_accessed().
 */
void __lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);

	page_cache_get(page);
	if (!pagevec_space(pvec))
		__pagevec_lru_add(pvec);
	pagevec_add(pvec, page);
	put_cpu_var(lru_add_pvec);
}
EXPORT_SYMBOL(__lru_cache_add);

/**
 * lru_cache_add - add a page to a page list
 * @page: the page to be added to the LRU.
 */
void lru_cache_add(struct page *page)
{
	VM_BUG_ON(PageActive(page) && PageUnevictable(page));
	VM_BUG_ON(PageLRU(page));
	__lru_cache_add(page);
}

/**
 * add_page_to_unevictable_list - add a page to the unevictable list
 * @page:  the page to be added to the unevictable list
 *
 * Add page directly to its zone's unevictable list.  To avoid races with
 * tasks that might be making the page evictable, through e.g. munlock,
 * munmap or exit, while it's not on the lru, we want to add the page
 * while it's locked or otherwise "invisible" to other tasks.  This is
 * difficult to do when using the pagevec cache, so bypass that.
 */
void add_page_to_unevictable_list(struct page *page)
{
	struct zone *zone = page_zone(page);
	struct lruvec *lruvec;

	spin_lock_irq(&zone->lru_lock);
	lruvec = mem_cgroup_page_lruvec(page, zone);
	ClearPageActive(page);
	SetPageUnevictable(page);
	SetPageLRU(page);
	add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
	spin_unlock_irq(&zone->lru_lock);
}

/*
 * If the page cannot be invalidated, it is moved to the inactive list to
 * speed up its reclaim.  It is moved to the head of the list, rather than
 * the tail, to give the flusher threads some time to write it out, as this
 * is much more effective than the single-page writeout from reclaim.
 *
 * If the page isn't mapped but is dirty or under writeback, PG_reclaim is
 * set so that the page can be reclaimed as soon as possible.
 *
 * 1. active, mapped page			-> none
 * 2. active, dirty/writeback page		-> inactive, head, PG_reclaim
 * 3. inactive, mapped page			-> none
 * 4. inactive, dirty/writeback page		-> inactive, head, PG_reclaim
 * 5. inactive, clean				-> inactive, tail
 * 6. Others					-> none
 *
 * In case 4, the page is moved to the head of the inactive list because the
 * VM expects it to be written out by the flusher threads, which is much more
 * effective than the single-page writeout from reclaim.
 */
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
			      void *arg)
{
	int lru, file;
	bool active;

	if (!PageLRU(page))
		return;

	if (PageUnevictable(page))
		return;

	/* Some processes are using the page */
	if (page_mapped(page))
		return;

	active = PageActive(page);
	file = page_is_file_cache(page);
	lru = page_lru_base_type(page);

	del_page_from_lru_list(page, lruvec, lru + active);
	ClearPageActive(page);
	ClearPageReferenced(page);
	add_page_to_lru_list(page, lruvec, lru);

	if (PageWriteback(page) || PageDirty(page)) {
		/*
		 * PG_reclaim could race with end_page_writeback(), which can
		 * confuse readahead.  But the race window is _really_ small
		 * and it's a non-critical problem.
		 */
		SetPageReclaim(page);
	} else {
		/*
		 * The page's writeback ended while it sat in the pagevec,
		 * so move the page to the tail of the inactive list.
		 */
		list_move_tail(&page->lru, &lruvec->lists[lru]);
		__count_vm_event(PGROTATED);
	}

	if (active)
		__count_vm_event(PGDEACTIVATE);
	update_page_reclaim_stat(lruvec, file, 0);
}

/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
void lru_add_drain_cpu(int cpu)
{
	struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);

	if (pagevec_count(pvec))
		__pagevec_lru_add(pvec);

	pvec = &per_cpu(lru_rotate_pvecs, cpu);
	if (pagevec_count(pvec)) {
		unsigned long flags;

		/* No harm done if a racing interrupt already did this */
		local_irq_save(flags);
		pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}

	pvec = &per_cpu(lru_deactivate_pvecs, cpu);
	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);

	activate_page_drain(cpu);
}

/**
 * deactivate_page - forcefully deactivate a page
 * @page: page to deactivate
 *
 * This function hints the VM that @page is a good reclaim candidate,
 * for example if its invalidation fails due to the page being dirty
 * or under writeback.
 */
void deactivate_page(struct page *page)
{
	/*
	 * In a workload with many unevictable pages (such as a heavily
	 * mlocked one), deactivating pages to accelerate reclaim is
	 * pointless, so skip unevictable pages.
	 */
	if (PageUnevictable(page))
		return;

	if (likely(get_page_unless_zero(page))) {
		struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);

		if (!pagevec_add(pvec, page))
			pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
		put_cpu_var(lru_deactivate_pvecs);
	}
}
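
/*
 * Illustrative sketch (hypothetical helper, not part of the original file):
 * the kind of caller deactivate_page() is meant for.  A cache-shrinking
 * loop that fails to drop a page (e.g. because it is dirty or under
 * writeback) can still hint the VM that the page is a good reclaim
 * candidate.  The helper name and the "invalidated" flag are assumptions
 * made for this example only.
 */
static void __maybe_unused example_try_invalidate(struct page *page)
{
	int invalidated = 0;

	if (trylock_page(page)) {
		/*
		 * A real caller would try to drop the page from the page
		 * cache here (see invalidate_mapping_pages() in
		 * mm/truncate.c) and set "invalidated" on success.
		 */
		unlock_page(page);
	}

	if (!invalidated)
		deactivate_page(page);	/* hint: good reclaim candidate */
}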

void lru_add_drain(void)
{
	lru_add_drain_cpu(get_cpu());
	put_cpu();
}

static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
	lru_add_drain();
}

static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);

void lru_add_drain_all(void)
{
	static DEFINE_MUTEX(lock);
	static struct cpumask has_work;
	int cpu;

	mutex_lock(&lock);
	get_online_cpus();
	cpumask_clear(&has_work);

	for_each_online_cpu(cpu) {
		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);

		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
		    need_activate_page_drain(cpu)) {
			INIT_WORK(work, lru_add_drain_per_cpu);
			schedule_work_on(cpu, work);
			cpumask_set_cpu(cpu, &has_work);
		}
	}

	for_each_cpu(cpu, &has_work)
		flush_work(&per_cpu(lru_add_drain_work, cpu));

	put_online_cpus();
	mutex_unlock(&lock);
}

/*
 * Batched page_cache_release().  Decrement the reference count on all the
 * passed pages.  If it fell to zero then remove the page from the LRU and
 * free it.
 *
 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 * for the remainder of the operation.
 *
 * The locking in this function is against shrink_inactive_list(): we recheck
 * the page count inside the lock to see whether shrink_inactive_list()
 * grabbed the page via the LRU.  If it did, give up: shrink_inactive_list()
 * will free it.
 */
void release_pages(struct page **pages, int nr, int cold)
{
	int i;
	LIST_HEAD(pages_to_free);
	struct zone *zone = NULL;
	struct lruvec *lruvec;
	unsigned long uninitialized_var(flags);

	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		if (unlikely(PageCompound(page))) {
			if (zone) {
				spin_unlock_irqrestore(&zone->lru_lock, flags);
				zone = NULL;
			}
			put_compound_page(page);
			continue;
		}

		if (!put_page_testzero(page))
			continue;

		if (PageLRU(page)) {
			struct zone *pagezone = page_zone(page);

			if (pagezone != zone) {
				if (zone)
					spin_unlock_irqrestore(&zone->lru_lock,
									flags);
				zone = pagezone;
				spin_lock_irqsave(&zone->lru_lock, flags);
			}

			lruvec = mem_cgroup_page_lruvec(page, zone);
			VM_BUG_ON(!PageLRU(page));
			__ClearPageLRU(page);
			del_page_from_lru_list(page, lruvec, page_off_lru(page));
		}

		/* Clear Active bit in case of parallel mark_page_accessed */
		ClearPageActive(page);

		list_add(&page->lru, &pages_to_free);
	}
	if (zone)
		spin_unlock_irqrestore(&zone->lru_lock, flags);

	free_hot_cold_page_list(&pages_to_free, cold);
}
EXPORT_SYMBOL(release_pages);
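
/*
 * Illustrative sketch (hypothetical helper, not part of the original file):
 * release_pages() is normally reached through a pagevec, which lets the
 * zone->lru_lock be taken once per batch of up to PAGEVEC_SIZE pages
 * instead of once per page.  The helper name is an assumption made for
 * this example only.
 */
static void __maybe_unused example_put_pages_batched(struct page **pages,
						     int nr)
{
	struct pagevec pvec;
	int i;

	pagevec_init(&pvec, 0);		/* 0 == these are not "cold" pages */
	for (i = 0; i < nr; i++) {
		/* pagevec_add() returns the space left; 0 means "now full" */
		if (!pagevec_add(&pvec, pages[i]))
			pagevec_release(&pvec);
	}
	pagevec_release(&pvec);		/* flush the remainder, if any */
}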

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
	lru_add_drain();
	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
	pagevec_reinit(pvec);
}
EXPORT_SYMBOL(__pagevec_release);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* used by __split_huge_page_refcount() */
void lru_add_page_tail(struct page *page, struct page *page_tail,
		       struct lruvec *lruvec, struct list_head *list)
{
	const int file = 0;

	VM_BUG_ON(!PageHead(page));
	VM_BUG_ON(PageCompound(page_tail));
	VM_BUG_ON(PageLRU(page_tail));
	VM_BUG_ON(NR_CPUS != 1 &&
		  !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));

	if (!list)
		SetPageLRU(page_tail);

	if (likely(PageLRU(page)))
		list_add_tail(&page_tail->lru, &page->lru);
	else if (list) {
		/* page reclaim is reclaiming a huge page */
		get_page(page_tail);
		list_add_tail(&page_tail->lru, list);
	} else {
		struct list_head *list_head;
		/*
		 * Head page has not yet been counted, as an hpage,
		 * so we must account for each subpage individually.
		 *
		 * Use the standard add function to put page_tail on the list,
		 * but then correct its position so they all end up in order.
		 */
		add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
		list_head = page_tail->lru.prev;
		list_move_tail(&page_tail->lru, list_head);
	}

	if (!PageUnevictable(page))
		update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	int file = page_is_file_cache(page);
	int active = PageActive(page);
	enum lru_list lru = page_lru(page);

	VM_BUG_ON(PageLRU(page));

	SetPageLRU(page);
	add_page_to_lru_list(page, lruvec, lru);
	update_page_reclaim_stat(lruvec, file, active);
	trace_mm_lru_insertion(page, page_to_pfn(page), lru,
			       trace_pagemap_flags(page));
}

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
	pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
}
EXPORT_SYMBOL(__pagevec_lru_add);

/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec:	Where the resulting pages are placed
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages
 * pages in the mapping.  The pages are placed in @pvec.  pagevec_lookup()
 * takes a reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t start, unsigned nr_pages)
{
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup);

unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag,
					nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_tag);
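
/*
 * Illustrative sketch (hypothetical helper, not part of the original file):
 * the usual way pagevec_lookup() is consumed - walk a mapping in batches of
 * up to PAGEVEC_SIZE pages, remembering the next index from the last page
 * seen, and drop the gang of references with pagevec_release() after each
 * batch.  The helper name is an assumption made for this example only.
 */
static unsigned long __maybe_unused example_count_cached_pages(
		struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t index = 0;
	unsigned long nr_found = 0;
	int i;

	pagevec_init(&pvec, 0);
	while (pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			index = page->index + 1;	/* resume after this page */
			nr_found++;
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	return nr_found;
}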

/*
 * Perform any setup for the swap system.
 */
void __init swap_setup(void)
{
	unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
#ifdef CONFIG_SWAP
	int i;

	bdi_init(swapper_spaces[0].backing_dev_info);
	for (i = 0; i < MAX_SWAPFILES; i++) {
		spin_lock_init(&swapper_spaces[i].tree_lock);
		INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
	}
#endif

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more.
	 */
}
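
/*
 * Illustrative sketch (hypothetical helper, not part of the original file):
 * page_cluster is a shift, so the values chosen above mean swap readahead
 * works on clusters of 1 << 2 = 4 pages on small-memory machines and
 * 1 << 3 = 8 pages otherwise (cf. swapin_readahead()).
 */
static unsigned long __maybe_unused example_swap_cluster_pages(void)
{
	return 1UL << page_cluster;
}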