/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/memremap.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
#endif

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);
		struct lruvec *lruvec;
		unsigned long flags;

		spin_lock_irqsave(zone_lru_lock(zone), flags);
		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
		VM_BUG_ON_PAGE(!PageLRU(page), page);
		__ClearPageLRU(page);
		del_page_from_lru_list(page, lruvec, page_off_lru(page));
		spin_unlock_irqrestore(zone_lru_lock(zone), flags);
	}
	__ClearPageWaiters(page);
	mem_cgroup_uncharge(page);
}

static void __put_single_page(struct page *page)
{
	__page_cache_release(page);
	free_hot_cold_page(page, false);
}

static void __put_compound_page(struct page *page)
{
	compound_page_dtor *dtor;

	/*
	 * __page_cache_release() is supposed to be called for thp, not for
	 * hugetlb. This is because a hugetlb page never has PageLRU set
	 * (it is never placed on any LRU list) and no memcg routines should
	 * be called for hugetlb (it has a separate hugetlb_cgroup).
	 */
	if (!PageHuge(page))
		__page_cache_release(page);
	dtor = get_compound_page_dtor(page);
	(*dtor)(page);
}

void __put_page(struct page *page)
{
	if (unlikely(PageCompound(page)))
		__put_compound_page(page);
	else
		__put_single_page(page);
}
EXPORT_SYMBOL(__put_page);

/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page->lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct page *victim;

		victim = list_entry(pages->prev, struct page, lru);
		list_del(&victim->lru);
		put_page(victim);
	}
}
EXPORT_SYMBOL(put_pages_list);
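/*
 * Illustrative sketch (not part of the original file): how a caller with
 * pages collected on a local list hands their references back with
 * put_pages_list().  "my_pages" and the allocation loop are hypothetical.
 *
 *	LIST_HEAD(my_pages);
 *	struct page *page;
 *	int i;
 *
 *	for (i = 0; i < 4; i++) {
 *		page = alloc_page(GFP_KERNEL);
 *		if (page)
 *			list_add(&page->lru, &my_pages);
 *	}
 *	... use the pages ...
 *	put_pages_list(&my_pages);	(drops one reference per page)
 */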
/*
 * get_kernel_pages() - pin kernel pages in memory
 * @kiov:	An array of struct kvec structures
 * @nr_segs:	number of segments to pin
 * @write:	pinning for read/write, currently ignored
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_segs long.
 *
 * Returns the number of pages pinned.  This may be fewer than the number
 * requested: the walk stops early at the first segment whose length is not
 * exactly PAGE_SIZE.  If @nr_segs is 0 or negative, returns 0.  Each page
 * returned must be released with a put_page() call when it is finished with.
 */
int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
		struct page **pages)
{
	int seg;

	for (seg = 0; seg < nr_segs; seg++) {
		if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
			return seg;

		pages[seg] = kmap_to_page(kiov[seg].iov_base);
		get_page(pages[seg]);
	}

	return seg;
}
EXPORT_SYMBOL_GPL(get_kernel_pages);

/*
 * get_kernel_page() - pin a kernel page in memory
 * @start:	starting kernel address
 * @write:	pinning for read/write, currently ignored
 * @pages:	array that receives a pointer to the pinned page.
 *		Must have room for at least one entry.
 *
 * Returns 1 if the page was pinned, 0 otherwise.  The pinned page must be
 * released with a put_page() call when it is finished with.
 */
int get_kernel_page(unsigned long start, int write, struct page **pages)
{
	const struct kvec kiov = {
		.iov_base = (void *)start,
		.iov_len = PAGE_SIZE
	};

	return get_kernel_pages(&kiov, 1, write, pages);
}
EXPORT_SYMBOL_GPL(get_kernel_page);
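/*
 * Illustrative sketch (not part of the original file): pinning the page
 * backing a page-aligned kernel buffer and releasing it again.  "buf" is
 * a hypothetical lowmem, page-aligned buffer.
 *
 *	struct page *page;
 *
 *	if (get_kernel_page((unsigned long)buf, 0, &page) == 1) {
 *		... hand the page to code that wants a struct page ...
 *		put_page(page);
 *	}
 */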
static void pagevec_lru_move_fn(struct pagevec *pvec,
	void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
	void *arg)
{
	int i;
	struct pglist_data *pgdat = NULL;
	struct lruvec *lruvec;
	unsigned long flags = 0;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct pglist_data *pagepgdat = page_pgdat(page);

		if (pagepgdat != pgdat) {
			if (pgdat)
				spin_unlock_irqrestore(&pgdat->lru_lock, flags);
			pgdat = pagepgdat;
			spin_lock_irqsave(&pgdat->lru_lock, flags);
		}

		lruvec = mem_cgroup_page_lruvec(page, pgdat);
		(*move_fn)(page, lruvec, arg);
	}
	if (pgdat)
		spin_unlock_irqrestore(&pgdat->lru_lock, flags);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	int *pgmoved = arg;

	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		enum lru_list lru = page_lru_base_type(page);
		list_move_tail(&page->lru, &lruvec->lists[lru]);
		(*pgmoved)++;
	}
}

/*
 * pagevec_move_tail() must be called with IRQ disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
	int pgmoved = 0;

	pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
	__count_vm_events(PGROTATED, pgmoved);
}

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 * inactive list.
 */
void rotate_reclaimable_page(struct page *page)
{
	if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
	    !PageUnevictable(page) && PageLRU(page)) {
		struct pagevec *pvec;
		unsigned long flags;

		get_page(page);
		local_irq_save(flags);
		pvec = this_cpu_ptr(&lru_rotate_pvecs);
		if (!pagevec_add(pvec, page) || PageCompound(page))
			pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}
}

static void update_page_reclaim_stat(struct lruvec *lruvec,
				     int file, int rotated)
{
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;

	reclaim_stat->recent_scanned[file]++;
	if (rotated)
		reclaim_stat->recent_rotated[file]++;
}

static void __activate_page(struct page *page, struct lruvec *lruvec,
			    void *arg)
{
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		int file = page_is_file_cache(page);
		int lru = page_lru_base_type(page);

		del_page_from_lru_list(page, lruvec, lru);
		SetPageActive(page);
		lru += LRU_ACTIVE;
		add_page_to_lru_list(page, lruvec, lru);
		trace_mm_lru_activate(page);

		__count_vm_event(PGACTIVATE);
		update_page_reclaim_stat(lruvec, file, 1);
	}
}

#ifdef CONFIG_SMP
static void activate_page_drain(int cpu)
{
	struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);

	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, __activate_page, NULL);
}

static bool need_activate_page_drain(int cpu)
{
	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
}

void activate_page(struct page *page)
{
	page = compound_head(page);
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);

		get_page(page);
		if (!pagevec_add(pvec, page) || PageCompound(page))
			pagevec_lru_move_fn(pvec, __activate_page, NULL);
		put_cpu_var(activate_page_pvecs);
	}
}

#else
static inline void activate_page_drain(int cpu)
{
}

static bool need_activate_page_drain(int cpu)
{
	return false;
}

void activate_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	page = compound_head(page);
	spin_lock_irq(zone_lru_lock(zone));
	__activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL);
	spin_unlock_irq(zone_lru_lock(zone));
}
#endif
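/*
 * Illustrative note (not part of the original file): every per-cpu
 * pagevec user above and below follows the same batching idiom.
 * pagevec_add() returns the space left, so 0 means the pagevec just
 * became full; compound pages are flushed immediately because a single
 * THP can cover as much memory as a whole pagevec of base pages.
 *
 *	pvec = &get_cpu_var(some_pvec);		("some_pvec" is hypothetical)
 *	get_page(page);
 *	if (!pagevec_add(pvec, page) || PageCompound(page))
 *		... drain the pagevec under the lru_lock ...
 *	put_cpu_var(some_pvec);
 */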
static void __lru_cache_activate_page(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
	int i;

	/*
	 * Search backwards on the optimistic assumption that the page being
	 * activated has just been added to this pagevec. Note that only
	 * the local pagevec is examined as a !PageLRU page could be in the
	 * process of being released, reclaimed, migrated or on a remote
	 * pagevec that is currently being drained. Furthermore, marking
	 * a remote pagevec's page PageActive potentially hits a race where
	 * a page is marked PageActive just after it is added to the inactive
	 * list, causing accounting errors and BUG_ON checks to trigger.
	 */
	for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
		struct page *pagevec_page = pvec->pages[i];

		if (pagevec_page == page) {
			SetPageActive(page);
			break;
		}
	}

	put_cpu_var(lru_add_pvec);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 *
 * When a newly allocated page is not yet visible, so safe for non-atomic ops,
 * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
 */
void mark_page_accessed(struct page *page)
{
	page = compound_head(page);
	if (!PageActive(page) && !PageUnevictable(page) &&
			PageReferenced(page)) {

		/*
		 * If the page is on the LRU, queue it for activation via
		 * activate_page_pvecs. Otherwise, assume the page is on a
		 * pagevec, mark it active and it'll be moved to the active
		 * LRU on the next drain.
		 */
		if (PageLRU(page))
			activate_page(page);
		else
			__lru_cache_activate_page(page);
		ClearPageReferenced(page);
		if (page_is_file_cache(page))
			workingset_activation(page);
	} else if (!PageReferenced(page)) {
		SetPageReferenced(page);
	}
	if (page_is_idle(page))
		clear_page_idle(page);
}
EXPORT_SYMBOL(mark_page_accessed);

static void __lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);

	get_page(page);
	if (!pagevec_add(pvec, page) || PageCompound(page))
		__pagevec_lru_add(pvec);
	put_cpu_var(lru_add_pvec);
}

/**
 * lru_cache_add_anon - add an anonymous page to the page lists
 * @page: the page to add
 */
void lru_cache_add_anon(struct page *page)
{
	if (PageActive(page))
		ClearPageActive(page);
	__lru_cache_add(page);
}

void lru_cache_add_file(struct page *page)
{
	if (PageActive(page))
		ClearPageActive(page);
	__lru_cache_add(page);
}
EXPORT_SYMBOL(lru_cache_add_file);

/**
 * lru_cache_add - add a page to a page list
 * @page: the page to be added to the LRU.
 *
 * Queue the page for addition to the LRU via pagevec. The decision on whether
 * to add the page to the [in]active [file|anon] list is deferred until the
 * pagevec is drained. This gives a chance for the caller of lru_cache_add()
 * to have the page added to the active list using mark_page_accessed().
 */
void lru_cache_add(struct page *page)
{
	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
	VM_BUG_ON_PAGE(PageLRU(page), page);
	__lru_cache_add(page);
}
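/*
 * Illustrative sketch (not part of the original file): the state table
 * in the comment above mark_page_accessed() means a page needs two
 * "accesses" before it is activated.  For a page freshly queued by
 * lru_cache_add():
 *
 *	mark_page_accessed(page);  inactive,unreferenced -> inactive,referenced
 *	mark_page_accessed(page);  inactive,referenced   -> active,unreferenced
 *
 * The second call finds PageReferenced() set and either queues the page
 * on activate_page_pvecs (if it is already on the LRU) or sets
 * PageActive directly while it still sits in the local lru_add_pvec.
 */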
/**
 * add_page_to_unevictable_list - add a page to the unevictable list
 * @page:  the page to be added to the unevictable list
 *
 * Add page directly to its zone's unevictable list.  To avoid races with
 * tasks that might be making the page evictable, through e.g. munlock,
 * munmap or exit, while it's not on the lru, we want to add the page
 * while it's locked or otherwise "invisible" to other tasks.  This is
 * difficult to do when using the pagevec cache, so bypass that.
 */
void add_page_to_unevictable_list(struct page *page)
{
	struct pglist_data *pgdat = page_pgdat(page);
	struct lruvec *lruvec;

	spin_lock_irq(&pgdat->lru_lock);
	lruvec = mem_cgroup_page_lruvec(page, pgdat);
	ClearPageActive(page);
	SetPageUnevictable(page);
	SetPageLRU(page);
	add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
	spin_unlock_irq(&pgdat->lru_lock);
}

/**
 * lru_cache_add_active_or_unevictable - add a page to the active or unevictable LRU
 * @page:  the page to be added to LRU
 * @vma:   vma in which page is mapped for determining reclaimability
 *
 * Place @page on the active or unevictable LRU list, depending on its
 * evictability.  Note that if the page is not evictable, it goes
 * directly back onto its zone's unevictable list; it does NOT use a
 * per-cpu pagevec.
 */
void lru_cache_add_active_or_unevictable(struct page *page,
					 struct vm_area_struct *vma)
{
	VM_BUG_ON_PAGE(PageLRU(page), page);

	if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
		SetPageActive(page);
		lru_cache_add(page);
		return;
	}

	if (!TestSetPageMlocked(page)) {
		/*
		 * We use the irq-unsafe __mod_zone_page_state() because this
		 * counter is not modified from interrupt context, and the pte
		 * lock is held (a spinlock), which implies preemption is
		 * disabled.
		 */
		__mod_zone_page_state(page_zone(page), NR_MLOCK,
				      hpage_nr_pages(page));
		count_vm_event(UNEVICTABLE_PGMLOCKED);
	}
	add_page_to_unevictable_list(page);
}
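/*
 * Illustrative summary (not part of the original file) of the vm_flags
 * test in lru_cache_add_active_or_unevictable() above:
 *
 *	vma->vm_flags contains		page ends up on
 *	------------------------	---------------------------------
 *	neither flag			active LRU, via the pagevec
 *	VM_LOCKED only			unevictable list, PageMlocked set
 *	VM_LOCKED | VM_SPECIAL		active LRU; mlock accounting is
 *					skipped for special mappings
 */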
/*
 * If the page cannot be invalidated, it is moved to the
 * inactive list to speed up its reclaim.  It is moved to the
 * head of the list, rather than the tail, to give the flusher
 * threads some time to write it out, as this is much more
 * effective than the single-page writeout from reclaim.
 *
 * If the page isn't page_mapped() and is dirty or under writeback,
 * it can be reclaimed ASAP by using PG_reclaim.
 *
 * 1. active, mapped page -> none
 * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
 * 3. inactive, mapped page -> none
 * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
 * 5. inactive, clean -> inactive, tail
 * 6. Others -> none
 *
 * In case 4 the page is moved to the head of the inactive list because
 * the VM expects the flusher threads to write it out, which is much more
 * effective than the single-page writeout from reclaim.
 */
static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
			      void *arg)
{
	int lru, file;
	bool active;

	if (!PageLRU(page))
		return;

	if (PageUnevictable(page))
		return;

	/* Some processes are using the page */
	if (page_mapped(page))
		return;

	active = PageActive(page);
	file = page_is_file_cache(page);
	lru = page_lru_base_type(page);

	del_page_from_lru_list(page, lruvec, lru + active);
	ClearPageActive(page);
	ClearPageReferenced(page);
	add_page_to_lru_list(page, lruvec, lru);

	if (PageWriteback(page) || PageDirty(page)) {
		/*
		 * Setting PG_reclaim can race with end_page_writeback(),
		 * which can confuse readahead.  But the race window is
		 * really small and it is a non-critical problem.
		 */
		SetPageReclaim(page);
	} else {
		/*
		 * The page's writeback has already finished while it sat
		 * in the pagevec, so it is clean: move it to the tail of
		 * the inactive list.
		 */
		list_move_tail(&page->lru, &lruvec->lists[lru]);
		__count_vm_event(PGROTATED);
	}

	if (active)
		__count_vm_event(PGDEACTIVATE);
	update_page_reclaim_stat(lruvec, file, 0);
}


static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
			    void *arg)
{
	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
		int file = page_is_file_cache(page);
		int lru = page_lru_base_type(page);

		del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
		ClearPageActive(page);
		ClearPageReferenced(page);
		add_page_to_lru_list(page, lruvec, lru);

		__count_vm_event(PGDEACTIVATE);
		update_page_reclaim_stat(lruvec, file, 0);
	}
}

/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
void lru_add_drain_cpu(int cpu)
{
	struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);

	if (pagevec_count(pvec))
		__pagevec_lru_add(pvec);

	pvec = &per_cpu(lru_rotate_pvecs, cpu);
	if (pagevec_count(pvec)) {
		unsigned long flags;

		/* No harm done if a racing interrupt already did this */
		local_irq_save(flags);
		pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}

	pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);

	pvec = &per_cpu(lru_deactivate_pvecs, cpu);
	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);

	activate_page_drain(cpu);
}

/**
 * deactivate_file_page - forcefully deactivate a file page
 * @page: page to deactivate
 *
 * This function hints the VM that @page is a good reclaim candidate,
 * for example if its invalidation fails due to the page being dirty
 * or under writeback.
 */
void deactivate_file_page(struct page *page)
{
	/*
	 * In a workload with many unevictable pages (e.g. due to mprotect),
	 * deactivating unevictable pages to accelerate reclaim is pointless.
	 */
	if (PageUnevictable(page))
		return;

	if (likely(get_page_unless_zero(page))) {
		struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);

		if (!pagevec_add(pvec, page) || PageCompound(page))
			pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
		put_cpu_var(lru_deactivate_file_pvecs);
	}
}
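/*
 * Illustrative sketch (not part of the original file): the typical
 * caller of deactivate_file_page() is the invalidation path.  When an
 * invalidation attempt fails because the page is dirty or under
 * writeback, the page is hinted to reclaim roughly like this:
 *
 *	if (!invalidate_inode_page(page))
 *		deactivate_file_page(page);
 *
 * invalidate_inode_page() is named here only for illustration; the
 * exact call site may differ between kernel versions.
 */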
/**
 * deactivate_page - deactivate a page
 * @page: page to deactivate
 *
 * deactivate_page() moves @page to the inactive list if @page was on the active
 * list and was not an unevictable page.  This is done to accelerate the reclaim
 * of @page.
 */
void deactivate_page(struct page *page)
{
	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
		struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);

		get_page(page);
		if (!pagevec_add(pvec, page) || PageCompound(page))
			pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
		put_cpu_var(lru_deactivate_pvecs);
	}
}

void lru_add_drain(void)
{
	lru_add_drain_cpu(get_cpu());
	put_cpu();
}

static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
	lru_add_drain();
}

static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);

/*
 * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
 * workqueue, aiding in getting memory freed.
 */
static struct workqueue_struct *lru_add_drain_wq;

static int __init lru_init(void)
{
	lru_add_drain_wq = alloc_workqueue("lru-add-drain", WQ_MEM_RECLAIM, 0);

	if (WARN(!lru_add_drain_wq,
		 "Failed to create workqueue lru_add_drain_wq"))
		return -ENOMEM;

	return 0;
}
early_initcall(lru_init);

void lru_add_drain_all(void)
{
	static DEFINE_MUTEX(lock);
	static struct cpumask has_work;
	int cpu;

	mutex_lock(&lock);
	get_online_cpus();
	cpumask_clear(&has_work);

	for_each_online_cpu(cpu) {
		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);

		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
		    pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
		    need_activate_page_drain(cpu)) {
			INIT_WORK(work, lru_add_drain_per_cpu);
			queue_work_on(cpu, lru_add_drain_wq, work);
			cpumask_set_cpu(cpu, &has_work);
		}
	}

	for_each_cpu(cpu, &has_work)
		flush_work(&per_cpu(lru_add_drain_work, cpu));

	put_online_cpus();
	mutex_unlock(&lock);
}
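/*
 * Illustrative note (not part of the original file): code that needs a
 * page to be visible on the real LRU lists, such as page migration or
 * mlock, drains the per-cpu pagevecs first so pages are not hiding in
 * them:
 *
 *	lru_add_drain_all();
 *	...
 *	if (!isolate_lru_page(page))
 *		... page is now off the LRU and can be migrated ...
 *
 * isolate_lru_page() is shown only as an example of a caller that
 * requires PageLRU to be set.
 */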
/**
 * release_pages - batched put_page()
 * @pages: array of pages to release
 * @nr: number of pages
 * @cold: whether the pages are cache cold
 *
 * Decrement the reference count on all the pages in @pages.  If a page's
 * count drops to zero, remove it from the LRU and free it.
 */
void release_pages(struct page **pages, int nr, bool cold)
{
	int i;
	LIST_HEAD(pages_to_free);
	struct pglist_data *locked_pgdat = NULL;
	struct lruvec *lruvec;
	unsigned long uninitialized_var(flags);
	unsigned int uninitialized_var(lock_batch);

	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		/*
		 * Make sure the IRQ-safe lock-holding time does not get
		 * excessive with a continuous string of pages from the
		 * same pgdat.  The lock is held only if pgdat != NULL.
		 */
		if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
			spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
			locked_pgdat = NULL;
		}

		if (is_huge_zero_page(page))
			continue;

		page = compound_head(page);
		if (!put_page_testzero(page))
			continue;

		if (PageCompound(page)) {
			if (locked_pgdat) {
				spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
				locked_pgdat = NULL;
			}
			__put_compound_page(page);
			continue;
		}

		if (PageLRU(page)) {
			struct pglist_data *pgdat = page_pgdat(page);

			if (pgdat != locked_pgdat) {
				if (locked_pgdat)
					spin_unlock_irqrestore(&locked_pgdat->lru_lock,
									flags);
				lock_batch = 0;
				locked_pgdat = pgdat;
				spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
			}

			lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
			VM_BUG_ON_PAGE(!PageLRU(page), page);
			__ClearPageLRU(page);
			del_page_from_lru_list(page, lruvec, page_off_lru(page));
		}

		/* Clear Active bit in case of parallel mark_page_accessed */
		__ClearPageActive(page);
		__ClearPageWaiters(page);

		list_add(&page->lru, &pages_to_free);
	}
	if (locked_pgdat)
		spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);

	mem_cgroup_uncharge_list(&pages_to_free);
	free_hot_cold_page_list(&pages_to_free, cold);
}
EXPORT_SYMBOL(release_pages);

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * calls release_pages() directly, to avoid mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
	lru_add_drain();
	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
	pagevec_reinit(pvec);
}
EXPORT_SYMBOL(__pagevec_release);
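/*
 * Illustrative sketch (not part of the original file): a caller that
 * pinned an array of pages, e.g. with get_user_pages(), can drop all of
 * its references in one batch instead of looping over put_page():
 *
 *	release_pages(pages, npages, false);
 *
 * "pages" and "npages" are hypothetical; "false" means the pages are
 * not assumed to be cache cold.
 */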
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* used by __split_huge_page_refcount() */
void lru_add_page_tail(struct page *page, struct page *page_tail,
		       struct lruvec *lruvec, struct list_head *list)
{
	const int file = 0;

	VM_BUG_ON_PAGE(!PageHead(page), page);
	VM_BUG_ON_PAGE(PageCompound(page_tail), page);
	VM_BUG_ON_PAGE(PageLRU(page_tail), page);
	VM_BUG_ON(NR_CPUS != 1 &&
		  !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock));

	if (!list)
		SetPageLRU(page_tail);

	if (likely(PageLRU(page)))
		list_add_tail(&page_tail->lru, &page->lru);
	else if (list) {
		/* page reclaim is reclaiming a huge page */
		get_page(page_tail);
		list_add_tail(&page_tail->lru, list);
	} else {
		struct list_head *list_head;
		/*
		 * Head page has not yet been counted, as an hpage,
		 * so we must account for each subpage individually.
		 *
		 * Use the standard add function to put page_tail on the list,
		 * but then correct its position so they all end up in order.
		 */
		add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
		list_head = page_tail->lru.prev;
		list_move_tail(&page_tail->lru, list_head);
	}

	if (!PageUnevictable(page))
		update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	int file = page_is_file_cache(page);
	int active = PageActive(page);
	enum lru_list lru = page_lru(page);

	VM_BUG_ON_PAGE(PageLRU(page), page);

	SetPageLRU(page);
	add_page_to_lru_list(page, lruvec, lru);
	update_page_reclaim_stat(lruvec, file, active);
	trace_mm_lru_insertion(page, lru);
}

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
	pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
}
EXPORT_SYMBOL(__pagevec_lru_add);

/**
 * pagevec_lookup_entries - gang pagecache lookup
 * @pvec: Where the resulting entries are placed
 * @mapping: The address_space to search
 * @start: The starting entry index
 * @nr_entries: The maximum number of entries
 * @indices: The cache indices corresponding to the entries in @pvec
 *
 * pagevec_lookup_entries() will search for and return a group of up
 * to @nr_entries pages and shadow entries in the mapping.  All
 * entries are placed in @pvec.  pagevec_lookup_entries() takes a
 * reference against actual pages in @pvec.
 *
 * The search returns a group of mapping-contiguous entries with
 * ascending indexes.  There may be holes in the indices due to
 * not-present entries.
 *
 * pagevec_lookup_entries() returns the number of entries which were
 * found.
 */
unsigned pagevec_lookup_entries(struct pagevec *pvec,
				struct address_space *mapping,
				pgoff_t start, unsigned nr_entries,
				pgoff_t *indices)
{
	pvec->nr = find_get_entries(mapping, start, nr_entries,
				    pvec->pages, indices);
	return pagevec_count(pvec);
}

/**
 * pagevec_remove_exceptionals - pagevec exceptionals pruning
 * @pvec: The pagevec to prune
 *
 * pagevec_lookup_entries() fills both pages and exceptional radix
 * tree entries into the pagevec.  This function prunes all
 * exceptionals from @pvec without leaving holes, so that it can be
 * passed on to page-only pagevec operations.
 */
void pagevec_remove_exceptionals(struct pagevec *pvec)
{
	int i, j;

	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		if (!radix_tree_exceptional_entry(page))
			pvec->pages[j++] = page;
	}
	pvec->nr = j;
}
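/*
 * Illustrative sketch (not part of the original file): how
 * truncate/invalidate style callers combine the two helpers above.
 * "index" and the entry handling are hypothetical.
 *
 *	pgoff_t indices[PAGEVEC_SIZE];
 *	struct pagevec pvec;
 *	pgoff_t index = 0;
 *
 *	pagevec_init(&pvec, 0);
 *	while (pagevec_lookup_entries(&pvec, mapping, index,
 *				      PAGEVEC_SIZE, indices)) {
 *		... consult indices[] for shadow/swap entries ...
 *		pagevec_remove_exceptionals(&pvec);
 *		... now pvec holds only real, referenced pages ...
 *		... advance index past the last entry seen ...
 *		pagevec_release(&pvec);
 *	}
 */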
/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec: Where the resulting pages are placed
 * @mapping: The address_space to search
 * @start: The starting page index
 * @nr_pages: The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t start, unsigned nr_pages)
{
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup);

unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag,
				      nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_tag);

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
	unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
#ifdef CONFIG_SWAP
	int i;

	for (i = 0; i < MAX_SWAPFILES; i++)
		spin_lock_init(&swapper_spaces[i].tree_lock);
#endif

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more.
	 */
}
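/*
 * Illustrative note (not part of the original file): page_cluster is
 * exposed as the vm.page-cluster sysctl (see Documentation/sysctl/vm.txt)
 * and controls swap readahead as a power of two:
 *
 *	pages read around a swap fault = 1 << page_cluster
 *
 * so the defaults above mean 4 pages on machines with less than 16 MB
 * of RAM and 8 pages otherwise; a value of 0 effectively disables swap
 * readahead clustering.
 */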