/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/memremap.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
#endif

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);
		struct lruvec *lruvec;
		unsigned long flags;

		spin_lock_irqsave(zone_lru_lock(zone), flags);
		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
		VM_BUG_ON_PAGE(!PageLRU(page), page);
		__ClearPageLRU(page);
		del_page_from_lru_list(page, lruvec, page_off_lru(page));
		spin_unlock_irqrestore(zone_lru_lock(zone), flags);
	}
	__ClearPageWaiters(page);
	mem_cgroup_uncharge(page);
}

static void __put_single_page(struct page *page)
{
	__page_cache_release(page);
	free_hot_cold_page(page, false);
}

static void __put_compound_page(struct page *page)
{
	compound_page_dtor *dtor;

	/*
	 * __page_cache_release() is supposed to be called for thp, not for
	 * hugetlb. This is because a hugetlb page never has PageLRU set
	 * (it is never added to any LRU list) and no memcg routines should
	 * be called for hugetlb (it has a separate hugetlb_cgroup.)
	 */
	if (!PageHuge(page))
		__page_cache_release(page);
	dtor = get_compound_page_dtor(page);
	(*dtor)(page);
}

void __put_page(struct page *page)
{
	if (unlikely(PageCompound(page)))
		__put_compound_page(page);
	else
		__put_single_page(page);
}
EXPORT_SYMBOL(__put_page);
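/*
 * Usage sketch (hypothetical caller, for illustration only): callers do not
 * normally invoke __put_page() directly.  They call put_page(), which only
 * falls through to __put_page() once the last reference is dropped:
 *
 *	struct page *page = alloc_page(GFP_KERNEL);
 *	...
 *	put_page(page);		// last ref: a single page goes back to the
 *				// allocator via __put_single_page(); a
 *				// compound page runs its compound destructor
 *				// via __put_compound_page()
 */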
/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page.lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct page *victim;

		victim = list_entry(pages->prev, struct page, lru);
		list_del(&victim->lru);
		put_page(victim);
	}
}
EXPORT_SYMBOL(put_pages_list);

/*
 * get_kernel_pages() - pin kernel pages in memory
 * @kiov:	An array of struct kvec structures
 * @nr_segs:	number of segments to pin
 * @write:	pinning for read/write, currently ignored
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_segs long.
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_segs is 0 or negative, returns 0.  If no pages were
 * pinned, returns -errno.  Each page returned must be released with a
 * put_page() call when the caller is finished with it.
 */
int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
		struct page **pages)
{
	int seg;

	for (seg = 0; seg < nr_segs; seg++) {
		if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
			return seg;

		pages[seg] = kmap_to_page(kiov[seg].iov_base);
		get_page(pages[seg]);
	}

	return seg;
}
EXPORT_SYMBOL_GPL(get_kernel_pages);

/*
 * get_kernel_page() - pin a kernel page in memory
 * @start:	starting kernel address
 * @write:	pinning for read/write, currently ignored
 * @pages:	array that receives a pointer to the page pinned.
 *		Must have space for at least one struct page pointer.
 *
 * Returns 1 if the page is pinned. If the page was not pinned, returns
 * -errno. The page returned must be released with a put_page() call
 * when the caller is finished with it.
 */
int get_kernel_page(unsigned long start, int write, struct page **pages)
{
	const struct kvec kiov = {
		.iov_base = (void *)start,
		.iov_len = PAGE_SIZE
	};

	return get_kernel_pages(&kiov, 1, write, pages);
}
EXPORT_SYMBOL_GPL(get_kernel_page);

static void pagevec_lru_move_fn(struct pagevec *pvec,
	void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
	void *arg)
{
	int i;
	struct pglist_data *pgdat = NULL;
	struct lruvec *lruvec;
	unsigned long flags = 0;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct pglist_data *pagepgdat = page_pgdat(page);

		if (pagepgdat != pgdat) {
			if (pgdat)
				spin_unlock_irqrestore(&pgdat->lru_lock, flags);
			pgdat = pagepgdat;
			spin_lock_irqsave(&pgdat->lru_lock, flags);
		}

		lruvec = mem_cgroup_page_lruvec(page, pgdat);
		(*move_fn)(page, lruvec, arg);
	}
	if (pgdat)
		spin_unlock_irqrestore(&pgdat->lru_lock, flags);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	int *pgmoved = arg;

	if (PageLRU(page) && !PageUnevictable(page)) {
		del_page_from_lru_list(page, lruvec, page_lru(page));
		ClearPageActive(page);
		add_page_to_lru_list_tail(page, lruvec, page_lru(page));
		(*pgmoved)++;
	}
}
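/*
 * Illustrative sketch of the pagevec_lru_move_fn() callback pattern used
 * throughout this file (example_move_fn is hypothetical, not a real callback):
 * pagevec_lru_move_fn() takes the per-node lru_lock once per run of pages
 * from the same node and hands each page, plus its lruvec, to the callback.
 *
 *	static void example_move_fn(struct page *page, struct lruvec *lruvec,
 *				    void *arg)
 *	{
 *		int *moved = arg;	// caller-provided counter
 *
 *		if (PageLRU(page) && !PageUnevictable(page)) {
 *			// reposition the page within lruvec here
 *			(*moved)++;
 *		}
 *	}
 *
 *	// drained with: pagevec_lru_move_fn(pvec, example_move_fn, &moved);
 */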
/*
 * pagevec_move_tail() must be called with IRQ disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
	int pgmoved = 0;

	pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
	__count_vm_events(PGROTATED, pgmoved);
}

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 * inactive list.
 */
void rotate_reclaimable_page(struct page *page)
{
	if (!PageLocked(page) && !PageDirty(page) &&
	    !PageUnevictable(page) && PageLRU(page)) {
		struct pagevec *pvec;
		unsigned long flags;

		get_page(page);
		local_irq_save(flags);
		pvec = this_cpu_ptr(&lru_rotate_pvecs);
		if (!pagevec_add(pvec, page) || PageCompound(page))
			pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}
}

static void update_page_reclaim_stat(struct lruvec *lruvec,
				     int file, int rotated)
{
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;

	reclaim_stat->recent_scanned[file]++;
	if (rotated)
		reclaim_stat->recent_rotated[file]++;
}

static void __activate_page(struct page *page, struct lruvec *lruvec,
			    void *arg)
{
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		int file = page_is_file_cache(page);
		int lru = page_lru_base_type(page);

		del_page_from_lru_list(page, lruvec, lru);
		SetPageActive(page);
		lru += LRU_ACTIVE;
		add_page_to_lru_list(page, lruvec, lru);
		trace_mm_lru_activate(page);

		__count_vm_event(PGACTIVATE);
		update_page_reclaim_stat(lruvec, file, 1);
	}
}

#ifdef CONFIG_SMP
static void activate_page_drain(int cpu)
{
	struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);

	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, __activate_page, NULL);
}

static bool need_activate_page_drain(int cpu)
{
	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
}

void activate_page(struct page *page)
{
	page = compound_head(page);
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);

		get_page(page);
		if (!pagevec_add(pvec, page) || PageCompound(page))
			pagevec_lru_move_fn(pvec, __activate_page, NULL);
		put_cpu_var(activate_page_pvecs);
	}
}

#else
static inline void activate_page_drain(int cpu)
{
}

static bool need_activate_page_drain(int cpu)
{
	return false;
}

void activate_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	page = compound_head(page);
	spin_lock_irq(zone_lru_lock(zone));
	__activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL);
	spin_unlock_irq(zone_lru_lock(zone));
}
#endif
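/*
 * Note on the two activate_page() variants above: on SMP, activation is
 * batched through the per-cpu activate_page_pvecs and only takes the
 * per-node lru_lock when a pagevec fills up (or a compound page is added);
 * on UP the batching buys nothing, so the page is moved under the zone's
 * lru_lock directly.
 */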
static void __lru_cache_activate_page(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
	int i;

	/*
	 * Search backwards on the optimistic assumption that the page being
	 * activated has just been added to this pagevec. Note that only
	 * the local pagevec is examined as a !PageLRU page could be in the
	 * process of being released, reclaimed, migrated or on a remote
	 * pagevec that is currently being drained. Furthermore, marking
	 * a remote pagevec's page PageActive potentially hits a race where
	 * a page is marked PageActive just after it is added to the inactive
	 * list, causing accounting errors and BUG_ON checks to trigger.
	 */
	for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
		struct page *pagevec_page = pvec->pages[i];

		if (pagevec_page == page) {
			SetPageActive(page);
			break;
		}
	}

	put_cpu_var(lru_add_pvec);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 *
 * When a newly allocated page is not yet visible, so safe for non-atomic ops,
 * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
 */
void mark_page_accessed(struct page *page)
{
	page = compound_head(page);
	if (!PageActive(page) && !PageUnevictable(page) &&
			PageReferenced(page)) {

		/*
		 * If the page is on the LRU, queue it for activation via
		 * activate_page_pvecs. Otherwise, assume the page is on a
		 * pagevec, mark it active and it'll be moved to the active
		 * LRU on the next drain.
		 */
		if (PageLRU(page))
			activate_page(page);
		else
			__lru_cache_activate_page(page);
		ClearPageReferenced(page);
		if (page_is_file_cache(page))
			workingset_activation(page);
	} else if (!PageReferenced(page)) {
		SetPageReferenced(page);
	}
	if (page_is_idle(page))
		clear_page_idle(page);
}
EXPORT_SYMBOL(mark_page_accessed);

static void __lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);

	get_page(page);
	if (!pagevec_add(pvec, page) || PageCompound(page))
		__pagevec_lru_add(pvec);
	put_cpu_var(lru_add_pvec);
}

/**
 * lru_cache_add_anon - add a page to the page lists
 * @page: the page to add
 */
void lru_cache_add_anon(struct page *page)
{
	if (PageActive(page))
		ClearPageActive(page);
	__lru_cache_add(page);
}

void lru_cache_add_file(struct page *page)
{
	if (PageActive(page))
		ClearPageActive(page);
	__lru_cache_add(page);
}
EXPORT_SYMBOL(lru_cache_add_file);

/**
 * lru_cache_add - add a page to a page list
 * @page: the page to be added to the LRU.
 *
 * Queue the page for addition to the LRU via pagevec. The decision on whether
 * to add the page to the [in]active [file|anon] list is deferred until the
 * pagevec is drained. This gives the caller of lru_cache_add() a chance to
 * have the page added to the active list using mark_page_accessed().
 */
void lru_cache_add(struct page *page)
{
	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
	VM_BUG_ON_PAGE(PageLRU(page), page);
	__lru_cache_add(page);
}
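/*
 * Usage sketch (hypothetical caller), illustrating the deferral described
 * above: a page queued with lru_cache_add() sits in this cpu's lru_add_pvec,
 * so mark_page_accessed() issued before the pagevec drains can still get it
 * placed straight onto the active list:
 *
 *	lru_cache_add(page);		// queued on this cpu's lru_add_pvec
 *	mark_page_accessed(page);	// sets PG_referenced; a second access
 *					// activates it via the local pagevec
 *	lru_add_drain();		// page now lands on the chosen LRU
 */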
/**
 * add_page_to_unevictable_list - add a page to the unevictable list
 * @page:  the page to be added to the unevictable list
 *
 * Add page directly to its zone's unevictable list.  To avoid races with
 * tasks that might be making the page evictable, through eg. munlock,
 * munmap or exit, while it's not on the lru, we want to add the page
 * while it's locked or otherwise "invisible" to other tasks.  This is
 * difficult to do when using the pagevec cache, so bypass that.
 */
void add_page_to_unevictable_list(struct page *page)
{
	struct pglist_data *pgdat = page_pgdat(page);
	struct lruvec *lruvec;

	spin_lock_irq(&pgdat->lru_lock);
	lruvec = mem_cgroup_page_lruvec(page, pgdat);
	ClearPageActive(page);
	SetPageUnevictable(page);
	SetPageLRU(page);
	add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
	spin_unlock_irq(&pgdat->lru_lock);
}

/**
 * lru_cache_add_active_or_unevictable
 * @page:  the page to be added to LRU
 * @vma:   vma in which page is mapped for determining reclaimability
 *
 * Place @page on the active or unevictable LRU list, depending on its
 * evictability.  Note that if the page is not evictable, it goes
 * directly back onto its zone's unevictable list; it does NOT use a
 * per-cpu pagevec.
 */
void lru_cache_add_active_or_unevictable(struct page *page,
					 struct vm_area_struct *vma)
{
	VM_BUG_ON_PAGE(PageLRU(page), page);

	if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
		SetPageActive(page);
		lru_cache_add(page);
		return;
	}

	if (!TestSetPageMlocked(page)) {
		/*
		 * We use the irq-unsafe __mod_zone_page_state because this
		 * counter is not modified from interrupt context, and the pte
		 * lock is held (a spinlock), which implies preemption is
		 * disabled.
		 */
		__mod_zone_page_state(page_zone(page), NR_MLOCK,
				    hpage_nr_pages(page));
		count_vm_event(UNEVICTABLE_PGMLOCKED);
	}
	add_page_to_unevictable_list(page);
}
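/*
 * Note on the branch above: only a page faulted into a plain VM_LOCKED vma
 * is mlocked here and sent straight to the unevictable list.  Anything else,
 * including VM_LOCKED|VM_SPECIAL mappings (VM_SPECIAL covers VM_IO,
 * VM_DONTEXPAND, VM_PFNMAP and VM_MIXEDMAP, going by its definition in
 * mm.h), is queued for the active LRU via the per-cpu pagevec.
 */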
/*
 * If the page cannot be invalidated, it is moved to the inactive list to
 * speed up its reclaim.  It is moved to the head of the list, rather than
 * the tail, to give the flusher threads some time to write it out, as this
 * is much more effective than the single-page writeout from reclaim.
 *
 * If the page isn't mapped and is dirty or under writeback, it can be
 * reclaimed ASAP by tagging it with PG_reclaim.
 *
 * 1. active, mapped page -> none
 * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
 * 3. inactive, mapped page -> none
 * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
 * 5. inactive, clean -> inactive, tail
 * 6. Others -> none
 *
 * In case 4 the page is moved to the head of the inactive list because the
 * VM expects it to be written out by the flusher threads, which is much more
 * effective than the single-page writeout from reclaim.
 */
static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
			      void *arg)
{
	int lru, file;
	bool active;

	if (!PageLRU(page))
		return;

	if (PageUnevictable(page))
		return;

	/* Some processes are using the page */
	if (page_mapped(page))
		return;

	active = PageActive(page);
	file = page_is_file_cache(page);
	lru = page_lru_base_type(page);

	del_page_from_lru_list(page, lruvec, lru + active);
	ClearPageActive(page);
	ClearPageReferenced(page);
	add_page_to_lru_list(page, lruvec, lru);

	if (PageWriteback(page) || PageDirty(page)) {
		/*
		 * Setting PG_reclaim can race with end_page_writeback(),
		 * which can confuse readahead.  But the race window is
		 * really small and the problem is non-critical.
		 */
		SetPageReclaim(page);
	} else {
		/*
		 * The page's writeback ended while it sat on the pagevec;
		 * move the page to the tail of the inactive list.
		 */
		list_move_tail(&page->lru, &lruvec->lists[lru]);
		__count_vm_event(PGROTATED);
	}

	if (active)
		__count_vm_event(PGDEACTIVATE);
	update_page_reclaim_stat(lruvec, file, 0);
}


static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
			    void *arg)
{
	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
		int file = page_is_file_cache(page);
		int lru = page_lru_base_type(page);

		del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
		ClearPageActive(page);
		ClearPageReferenced(page);
		add_page_to_lru_list(page, lruvec, lru);

		__count_vm_event(PGDEACTIVATE);
		update_page_reclaim_stat(lruvec, file, 0);
	}
}

/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
void lru_add_drain_cpu(int cpu)
{
	struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);

	if (pagevec_count(pvec))
		__pagevec_lru_add(pvec);

	pvec = &per_cpu(lru_rotate_pvecs, cpu);
	if (pagevec_count(pvec)) {
		unsigned long flags;

		/* No harm done if a racing interrupt already did this */
		local_irq_save(flags);
		pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}

	pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);

	pvec = &per_cpu(lru_deactivate_pvecs, cpu);
	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);

	activate_page_drain(cpu);
}

/**
 * deactivate_file_page - forcefully deactivate a file page
 * @page: page to deactivate
 *
 * This function hints the VM that @page is a good reclaim candidate,
 * for example if its invalidation fails due to the page being dirty
 * or under writeback.
 */
void deactivate_file_page(struct page *page)
{
	/*
	 * In a workload with many unevictable pages (such as one using
	 * mprotect), deactivating unevictable pages to accelerate reclaim
	 * is pointless.
	 */
	if (PageUnevictable(page))
		return;

	if (likely(get_page_unless_zero(page))) {
		struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);

		if (!pagevec_add(pvec, page) || PageCompound(page))
			pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
		put_cpu_var(lru_deactivate_file_pvecs);
	}
}
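/*
 * Usage sketch (illustrative, abridged): invalidation code typically calls
 * deactivate_file_page() when invalidate_inode_page() fails, so a dirty or
 * busy page at least becomes an early reclaim candidate:
 *
 *	if (!invalidate_inode_page(page))
 *		deactivate_file_page(page);
 */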
/**
 * deactivate_page - deactivate a page
 * @page: page to deactivate
 *
 * deactivate_page() moves @page to the inactive list if @page was on the
 * active list and was not an unevictable page. This is done to accelerate
 * the reclaim of @page.
 */
void deactivate_page(struct page *page)
{
	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
		struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);

		get_page(page);
		if (!pagevec_add(pvec, page) || PageCompound(page))
			pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
		put_cpu_var(lru_deactivate_pvecs);
	}
}

void lru_add_drain(void)
{
	lru_add_drain_cpu(get_cpu());
	put_cpu();
}

static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
	lru_add_drain();
}

static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);

/*
 * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
 * workqueue, aiding in getting memory freed.
 */
static struct workqueue_struct *lru_add_drain_wq;

static int __init lru_init(void)
{
	lru_add_drain_wq = alloc_workqueue("lru-add-drain", WQ_MEM_RECLAIM, 0);

	if (WARN(!lru_add_drain_wq,
		"Failed to create workqueue lru_add_drain_wq"))
		return -ENOMEM;

	return 0;
}
early_initcall(lru_init);

void lru_add_drain_all(void)
{
	static DEFINE_MUTEX(lock);
	static struct cpumask has_work;
	int cpu;

	mutex_lock(&lock);
	get_online_cpus();
	cpumask_clear(&has_work);

	for_each_online_cpu(cpu) {
		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);

		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
		    pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
		    need_activate_page_drain(cpu)) {
			INIT_WORK(work, lru_add_drain_per_cpu);
			queue_work_on(cpu, lru_add_drain_wq, work);
			cpumask_set_cpu(cpu, &has_work);
		}
	}

	for_each_cpu(cpu, &has_work)
		flush_work(&per_cpu(lru_add_drain_work, cpu));

	put_online_cpus();
	mutex_unlock(&lock);
}
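/*
 * Note on lru_add_drain_all(), as implemented above: it only queues drain
 * work on CPUs whose pagevecs actually hold something, then waits for those
 * work items.  A caller (typically paths like page migration or mlock that
 * need pages visible on the LRU lists) can therefore assume that pages
 * queued before the call have reached their LRU lists once it returns.
 */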
/**
 * release_pages - batched put_page()
 * @pages: array of pages to release
 * @nr: number of pages
 * @cold: whether the pages are cache cold
 *
 * Decrement the reference count on all the pages in @pages.  If it
 * fell to zero, remove the page from the LRU and free it.
 */
void release_pages(struct page **pages, int nr, bool cold)
{
	int i;
	LIST_HEAD(pages_to_free);
	struct pglist_data *locked_pgdat = NULL;
	struct lruvec *lruvec;
	unsigned long uninitialized_var(flags);
	unsigned int uninitialized_var(lock_batch);

	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		/*
		 * Make sure the IRQ-safe lock-holding time does not get
		 * excessive with a continuous string of pages from the
		 * same pgdat. The lock is held only if pgdat != NULL.
		 */
		if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
			spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
			locked_pgdat = NULL;
		}

		if (is_huge_zero_page(page))
			continue;

		page = compound_head(page);
		if (!put_page_testzero(page))
			continue;

		if (PageCompound(page)) {
			if (locked_pgdat) {
				spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
				locked_pgdat = NULL;
			}
			__put_compound_page(page);
			continue;
		}

		if (PageLRU(page)) {
			struct pglist_data *pgdat = page_pgdat(page);

			if (pgdat != locked_pgdat) {
				if (locked_pgdat)
					spin_unlock_irqrestore(&locked_pgdat->lru_lock,
									flags);
				lock_batch = 0;
				locked_pgdat = pgdat;
				spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
			}

			lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
			VM_BUG_ON_PAGE(!PageLRU(page), page);
			__ClearPageLRU(page);
			del_page_from_lru_list(page, lruvec, page_off_lru(page));
		}

		/* Clear Active bit in case of parallel mark_page_accessed */
		__ClearPageActive(page);
		__ClearPageWaiters(page);

		list_add(&page->lru, &pages_to_free);
	}
	if (locked_pgdat)
		spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);

	mem_cgroup_uncharge_list(&pages_to_free);
	free_hot_cold_page_list(&pages_to_free, cold);
}
EXPORT_SYMBOL(release_pages);

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
	lru_add_drain();
	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
	pagevec_reinit(pvec);
}
EXPORT_SYMBOL(__pagevec_release);
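/*
 * Note on the locking in release_pages() above: the per-node lru_lock is
 * taken lazily (only when an LRU page is actually seen), re-taken whenever
 * the page stream crosses into a different node, and dropped every
 * SWAP_CLUSTER_MAX pages so that a long run of same-node pages cannot keep
 * interrupts disabled for too long.
 */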
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* used by __split_huge_page_refcount() */
void lru_add_page_tail(struct page *page, struct page *page_tail,
		       struct lruvec *lruvec, struct list_head *list)
{
	const int file = 0;

	VM_BUG_ON_PAGE(!PageHead(page), page);
	VM_BUG_ON_PAGE(PageCompound(page_tail), page);
	VM_BUG_ON_PAGE(PageLRU(page_tail), page);
	VM_BUG_ON(NR_CPUS != 1 &&
		  !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock));

	if (!list)
		SetPageLRU(page_tail);

	if (likely(PageLRU(page)))
		list_add_tail(&page_tail->lru, &page->lru);
	else if (list) {
		/* page reclaim is reclaiming a huge page */
		get_page(page_tail);
		list_add_tail(&page_tail->lru, list);
	} else {
		struct list_head *list_head;
		/*
		 * The head page has not yet been counted as an hpage,
		 * so we must account for each subpage individually.
		 *
		 * Use the standard add function to put page_tail on the list,
		 * but then correct its position so they all end up in order.
		 */
		add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
		list_head = page_tail->lru.prev;
		list_move_tail(&page_tail->lru, list_head);
	}

	if (!PageUnevictable(page))
		update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	int file = page_is_file_cache(page);
	int active = PageActive(page);
	enum lru_list lru = page_lru(page);

	VM_BUG_ON_PAGE(PageLRU(page), page);

	SetPageLRU(page);
	add_page_to_lru_list(page, lruvec, lru);
	update_page_reclaim_stat(lruvec, file, active);
	trace_mm_lru_insertion(page, lru);
}

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
	pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
}
EXPORT_SYMBOL(__pagevec_lru_add);

/**
 * pagevec_lookup_entries - gang pagecache lookup
 * @pvec:	Where the resulting entries are placed
 * @mapping:	The address_space to search
 * @start:	The starting entry index
 * @nr_entries:	The maximum number of entries
 * @indices:	The cache indices corresponding to the entries in @pvec
 *
 * pagevec_lookup_entries() will search for and return a group of up
 * to @nr_entries pages and shadow entries in the mapping.  All
 * entries are placed in @pvec.  pagevec_lookup_entries() takes a
 * reference against actual pages in @pvec.
 *
 * The search returns a group of mapping-contiguous entries with
 * ascending indexes.  There may be holes in the indices due to
 * not-present entries.
 *
 * pagevec_lookup_entries() returns the number of entries which were
 * found.
 */
unsigned pagevec_lookup_entries(struct pagevec *pvec,
				struct address_space *mapping,
				pgoff_t start, unsigned nr_entries,
				pgoff_t *indices)
{
	pvec->nr = find_get_entries(mapping, start, nr_entries,
				    pvec->pages, indices);
	return pagevec_count(pvec);
}

/**
 * pagevec_remove_exceptionals - pagevec exceptionals pruning
 * @pvec:	The pagevec to prune
 *
 * pagevec_lookup_entries() fills both pages and exceptional radix
 * tree entries into the pagevec.  This function prunes all
 * exceptionals from @pvec without leaving holes, so that it can be
 * passed on to page-only pagevec operations.
 */
void pagevec_remove_exceptionals(struct pagevec *pvec)
{
	int i, j;

	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		if (!radix_tree_exceptional_entry(page))
			pvec->pages[j++] = page;
	}
	pvec->nr = j;
}
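/*
 * Usage sketch (illustrative and abridged; loop shape borrowed from typical
 * truncation/invalidation code): callers look up a batch of entries, deal
 * with any shadow entries via @indices, prune them, and then operate on the
 * remaining real pages:
 *
 *	pgoff_t indices[PAGEVEC_SIZE];
 *	struct pagevec pvec;
 *
 *	pagevec_init(&pvec, 0);
 *	while (pagevec_lookup_entries(&pvec, mapping, index,
 *				      PAGEVEC_SIZE, indices)) {
 *		// handle shadow entries using indices[i], advance index ...
 *		pagevec_remove_exceptionals(&pvec);
 *		// ... operate on the remaining pages in pvec ...
 *		pagevec_release(&pvec);
 *		cond_resched();
 *	}
 */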
/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec:	Where the resulting pages are placed
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t start, unsigned nr_pages)
{
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup);

unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag,
				      nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_tag);

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
	unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more.
	 */
}
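/*
 * Worked example (illustrative): page_cluster is an exponent, exposed as the
 * vm.page-cluster sysctl, and swap readahead operates on 2^page_cluster
 * pages at a time.  With PAGE_SHIFT == 12 and 1 GiB of RAM, megs is 1024,
 * so page_cluster stays at 3 and swap-in readahead works in 8-page bursts;
 * a machine with under 16 MiB of RAM would use 2^2 = 4 pages instead.
 */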