1 /* 2 * linux/mm/swap.c 3 * 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 5 */ 6 7 /* 8 * This file contains the default values for the operation of the 9 * Linux VM subsystem. Fine-tuning documentation can be found in 10 * Documentation/sysctl/vm.txt. 11 * Started 18.12.91 12 * Swap aging added 23.2.95, Stephen Tweedie. 13 * Buffermem limits added 12.3.98, Rik van Riel. 14 */ 15 16 #include <linux/mm.h> 17 #include <linux/sched.h> 18 #include <linux/kernel_stat.h> 19 #include <linux/swap.h> 20 #include <linux/mman.h> 21 #include <linux/pagemap.h> 22 #include <linux/pagevec.h> 23 #include <linux/init.h> 24 #include <linux/export.h> 25 #include <linux/mm_inline.h> 26 #include <linux/percpu_counter.h> 27 #include <linux/memremap.h> 28 #include <linux/percpu.h> 29 #include <linux/cpu.h> 30 #include <linux/notifier.h> 31 #include <linux/backing-dev.h> 32 #include <linux/memcontrol.h> 33 #include <linux/gfp.h> 34 #include <linux/uio.h> 35 #include <linux/hugetlb.h> 36 #include <linux/page_idle.h> 37 38 #include "internal.h" 39 40 #define CREATE_TRACE_POINTS 41 #include <trace/events/pagemap.h> 42 43 /* How many pages do we try to swap or page in/out together? */ 44 int page_cluster; 45 46 static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); 47 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 48 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); 49 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); 50 #ifdef CONFIG_SMP 51 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); 52 #endif 53 54 /* 55 * This path almost never happens for VM activity - pages are normally 56 * freed via pagevecs. But it gets used by networking. 57 */ 58 static void __page_cache_release(struct page *page) 59 { 60 if (PageLRU(page)) { 61 struct zone *zone = page_zone(page); 62 struct lruvec *lruvec; 63 unsigned long flags; 64 65 spin_lock_irqsave(zone_lru_lock(zone), flags); 66 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); 67 VM_BUG_ON_PAGE(!PageLRU(page), page); 68 __ClearPageLRU(page); 69 del_page_from_lru_list(page, lruvec, page_off_lru(page)); 70 spin_unlock_irqrestore(zone_lru_lock(zone), flags); 71 } 72 __ClearPageWaiters(page); 73 mem_cgroup_uncharge(page); 74 } 75 76 static void __put_single_page(struct page *page) 77 { 78 __page_cache_release(page); 79 free_hot_cold_page(page, false); 80 } 81 82 static void __put_compound_page(struct page *page) 83 { 84 compound_page_dtor *dtor; 85 86 /* 87 * __page_cache_release() is supposed to be called for thp, not for 88 * hugetlb. This is because hugetlb page does never have PageLRU set 89 * (it's never listed to any LRU lists) and no memcg routines should 90 * be called for hugetlb (it has a separate hugetlb_cgroup.) 91 */ 92 if (!PageHuge(page)) 93 __page_cache_release(page); 94 dtor = get_compound_page_dtor(page); 95 (*dtor)(page); 96 } 97 98 void __put_page(struct page *page) 99 { 100 if (unlikely(PageCompound(page))) 101 __put_compound_page(page); 102 else 103 __put_single_page(page); 104 } 105 EXPORT_SYMBOL(__put_page); 106 107 /** 108 * put_pages_list() - release a list of pages 109 * @pages: list of pages threaded on page->lru 110 * 111 * Release a list of pages which are strung together on page.lru. Currently 112 * used by read_cache_pages() and related error recovery code. 113 */ 114 void put_pages_list(struct list_head *pages) 115 { 116 while (!list_empty(pages)) { 117 struct page *victim; 118 119 victim = list_entry(pages->prev, struct page, lru); 120 list_del(&victim->lru); 121 put_page(victim); 122 } 123 } 124 EXPORT_SYMBOL(put_pages_list); 125 126 /* 127 * get_kernel_pages() - pin kernel pages in memory 128 * @kiov: An array of struct kvec structures 129 * @nr_segs: number of segments to pin 130 * @write: pinning for read/write, currently ignored 131 * @pages: array that receives pointers to the pages pinned. 132 * Should be at least nr_segs long. 133 * 134 * Returns number of pages pinned. This may be fewer than the number 135 * requested. If nr_pages is 0 or negative, returns 0. If no pages 136 * were pinned, returns -errno. Each page returned must be released 137 * with a put_page() call when it is finished with. 138 */ 139 int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, 140 struct page **pages) 141 { 142 int seg; 143 144 for (seg = 0; seg < nr_segs; seg++) { 145 if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE)) 146 return seg; 147 148 pages[seg] = kmap_to_page(kiov[seg].iov_base); 149 get_page(pages[seg]); 150 } 151 152 return seg; 153 } 154 EXPORT_SYMBOL_GPL(get_kernel_pages); 155 156 /* 157 * get_kernel_page() - pin a kernel page in memory 158 * @start: starting kernel address 159 * @write: pinning for read/write, currently ignored 160 * @pages: array that receives pointer to the page pinned. 161 * Must be at least nr_segs long. 162 * 163 * Returns 1 if page is pinned. If the page was not pinned, returns 164 * -errno. The page returned must be released with a put_page() call 165 * when it is finished with. 166 */ 167 int get_kernel_page(unsigned long start, int write, struct page **pages) 168 { 169 const struct kvec kiov = { 170 .iov_base = (void *)start, 171 .iov_len = PAGE_SIZE 172 }; 173 174 return get_kernel_pages(&kiov, 1, write, pages); 175 } 176 EXPORT_SYMBOL_GPL(get_kernel_page); 177 178 static void pagevec_lru_move_fn(struct pagevec *pvec, 179 void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), 180 void *arg) 181 { 182 int i; 183 struct pglist_data *pgdat = NULL; 184 struct lruvec *lruvec; 185 unsigned long flags = 0; 186 187 for (i = 0; i < pagevec_count(pvec); i++) { 188 struct page *page = pvec->pages[i]; 189 struct pglist_data *pagepgdat = page_pgdat(page); 190 191 if (pagepgdat != pgdat) { 192 if (pgdat) 193 spin_unlock_irqrestore(&pgdat->lru_lock, flags); 194 pgdat = pagepgdat; 195 spin_lock_irqsave(&pgdat->lru_lock, flags); 196 } 197 198 lruvec = mem_cgroup_page_lruvec(page, pgdat); 199 (*move_fn)(page, lruvec, arg); 200 } 201 if (pgdat) 202 spin_unlock_irqrestore(&pgdat->lru_lock, flags); 203 release_pages(pvec->pages, pvec->nr, pvec->cold); 204 pagevec_reinit(pvec); 205 } 206 207 static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec, 208 void *arg) 209 { 210 int *pgmoved = arg; 211 212 if (PageLRU(page) && !PageUnevictable(page)) { 213 del_page_from_lru_list(page, lruvec, page_lru(page)); 214 ClearPageActive(page); 215 add_page_to_lru_list_tail(page, lruvec, page_lru(page)); 216 (*pgmoved)++; 217 } 218 } 219 220 /* 221 * pagevec_move_tail() must be called with IRQ disabled. 222 * Otherwise this may cause nasty races. 223 */ 224 static void pagevec_move_tail(struct pagevec *pvec) 225 { 226 int pgmoved = 0; 227 228 pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved); 229 __count_vm_events(PGROTATED, pgmoved); 230 } 231 232 /* 233 * Writeback is about to end against a page which has been marked for immediate 234 * reclaim. If it still appears to be reclaimable, move it to the tail of the 235 * inactive list. 236 */ 237 void rotate_reclaimable_page(struct page *page) 238 { 239 if (!PageLocked(page) && !PageDirty(page) && 240 !PageUnevictable(page) && PageLRU(page)) { 241 struct pagevec *pvec; 242 unsigned long flags; 243 244 get_page(page); 245 local_irq_save(flags); 246 pvec = this_cpu_ptr(&lru_rotate_pvecs); 247 if (!pagevec_add(pvec, page) || PageCompound(page)) 248 pagevec_move_tail(pvec); 249 local_irq_restore(flags); 250 } 251 } 252 253 static void update_page_reclaim_stat(struct lruvec *lruvec, 254 int file, int rotated) 255 { 256 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 257 258 reclaim_stat->recent_scanned[file]++; 259 if (rotated) 260 reclaim_stat->recent_rotated[file]++; 261 } 262 263 static void __activate_page(struct page *page, struct lruvec *lruvec, 264 void *arg) 265 { 266 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 267 int file = page_is_file_cache(page); 268 int lru = page_lru_base_type(page); 269 270 del_page_from_lru_list(page, lruvec, lru); 271 SetPageActive(page); 272 lru += LRU_ACTIVE; 273 add_page_to_lru_list(page, lruvec, lru); 274 trace_mm_lru_activate(page); 275 276 __count_vm_event(PGACTIVATE); 277 update_page_reclaim_stat(lruvec, file, 1); 278 } 279 } 280 281 #ifdef CONFIG_SMP 282 static void activate_page_drain(int cpu) 283 { 284 struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu); 285 286 if (pagevec_count(pvec)) 287 pagevec_lru_move_fn(pvec, __activate_page, NULL); 288 } 289 290 static bool need_activate_page_drain(int cpu) 291 { 292 return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; 293 } 294 295 void activate_page(struct page *page) 296 { 297 page = compound_head(page); 298 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 299 struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); 300 301 get_page(page); 302 if (!pagevec_add(pvec, page) || PageCompound(page)) 303 pagevec_lru_move_fn(pvec, __activate_page, NULL); 304 put_cpu_var(activate_page_pvecs); 305 } 306 } 307 308 #else 309 static inline void activate_page_drain(int cpu) 310 { 311 } 312 313 static bool need_activate_page_drain(int cpu) 314 { 315 return false; 316 } 317 318 void activate_page(struct page *page) 319 { 320 struct zone *zone = page_zone(page); 321 322 page = compound_head(page); 323 spin_lock_irq(zone_lru_lock(zone)); 324 __activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL); 325 spin_unlock_irq(zone_lru_lock(zone)); 326 } 327 #endif 328 329 static void __lru_cache_activate_page(struct page *page) 330 { 331 struct pagevec *pvec = &get_cpu_var(lru_add_pvec); 332 int i; 333 334 /* 335 * Search backwards on the optimistic assumption that the page being 336 * activated has just been added to this pagevec. Note that only 337 * the local pagevec is examined as a !PageLRU page could be in the 338 * process of being released, reclaimed, migrated or on a remote 339 * pagevec that is currently being drained. Furthermore, marking 340 * a remote pagevec's page PageActive potentially hits a race where 341 * a page is marked PageActive just after it is added to the inactive 342 * list causing accounting errors and BUG_ON checks to trigger. 343 */ 344 for (i = pagevec_count(pvec) - 1; i >= 0; i--) { 345 struct page *pagevec_page = pvec->pages[i]; 346 347 if (pagevec_page == page) { 348 SetPageActive(page); 349 break; 350 } 351 } 352 353 put_cpu_var(lru_add_pvec); 354 } 355 356 /* 357 * Mark a page as having seen activity. 358 * 359 * inactive,unreferenced -> inactive,referenced 360 * inactive,referenced -> active,unreferenced 361 * active,unreferenced -> active,referenced 362 * 363 * When a newly allocated page is not yet visible, so safe for non-atomic ops, 364 * __SetPageReferenced(page) may be substituted for mark_page_accessed(page). 365 */ 366 void mark_page_accessed(struct page *page) 367 { 368 page = compound_head(page); 369 if (!PageActive(page) && !PageUnevictable(page) && 370 PageReferenced(page)) { 371 372 /* 373 * If the page is on the LRU, queue it for activation via 374 * activate_page_pvecs. Otherwise, assume the page is on a 375 * pagevec, mark it active and it'll be moved to the active 376 * LRU on the next drain. 377 */ 378 if (PageLRU(page)) 379 activate_page(page); 380 else 381 __lru_cache_activate_page(page); 382 ClearPageReferenced(page); 383 if (page_is_file_cache(page)) 384 workingset_activation(page); 385 } else if (!PageReferenced(page)) { 386 SetPageReferenced(page); 387 } 388 if (page_is_idle(page)) 389 clear_page_idle(page); 390 } 391 EXPORT_SYMBOL(mark_page_accessed); 392 393 static void __lru_cache_add(struct page *page) 394 { 395 struct pagevec *pvec = &get_cpu_var(lru_add_pvec); 396 397 get_page(page); 398 if (!pagevec_add(pvec, page) || PageCompound(page)) 399 __pagevec_lru_add(pvec); 400 put_cpu_var(lru_add_pvec); 401 } 402 403 /** 404 * lru_cache_add: add a page to the page lists 405 * @page: the page to add 406 */ 407 void lru_cache_add_anon(struct page *page) 408 { 409 if (PageActive(page)) 410 ClearPageActive(page); 411 __lru_cache_add(page); 412 } 413 414 void lru_cache_add_file(struct page *page) 415 { 416 if (PageActive(page)) 417 ClearPageActive(page); 418 __lru_cache_add(page); 419 } 420 EXPORT_SYMBOL(lru_cache_add_file); 421 422 /** 423 * lru_cache_add - add a page to a page list 424 * @page: the page to be added to the LRU. 425 * 426 * Queue the page for addition to the LRU via pagevec. The decision on whether 427 * to add the page to the [in]active [file|anon] list is deferred until the 428 * pagevec is drained. This gives a chance for the caller of lru_cache_add() 429 * have the page added to the active list using mark_page_accessed(). 430 */ 431 void lru_cache_add(struct page *page) 432 { 433 VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); 434 VM_BUG_ON_PAGE(PageLRU(page), page); 435 __lru_cache_add(page); 436 } 437 438 /** 439 * add_page_to_unevictable_list - add a page to the unevictable list 440 * @page: the page to be added to the unevictable list 441 * 442 * Add page directly to its zone's unevictable list. To avoid races with 443 * tasks that might be making the page evictable, through eg. munlock, 444 * munmap or exit, while it's not on the lru, we want to add the page 445 * while it's locked or otherwise "invisible" to other tasks. This is 446 * difficult to do when using the pagevec cache, so bypass that. 447 */ 448 void add_page_to_unevictable_list(struct page *page) 449 { 450 struct pglist_data *pgdat = page_pgdat(page); 451 struct lruvec *lruvec; 452 453 spin_lock_irq(&pgdat->lru_lock); 454 lruvec = mem_cgroup_page_lruvec(page, pgdat); 455 ClearPageActive(page); 456 SetPageUnevictable(page); 457 SetPageLRU(page); 458 add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE); 459 spin_unlock_irq(&pgdat->lru_lock); 460 } 461 462 /** 463 * lru_cache_add_active_or_unevictable 464 * @page: the page to be added to LRU 465 * @vma: vma in which page is mapped for determining reclaimability 466 * 467 * Place @page on the active or unevictable LRU list, depending on its 468 * evictability. Note that if the page is not evictable, it goes 469 * directly back onto it's zone's unevictable list, it does NOT use a 470 * per cpu pagevec. 471 */ 472 void lru_cache_add_active_or_unevictable(struct page *page, 473 struct vm_area_struct *vma) 474 { 475 VM_BUG_ON_PAGE(PageLRU(page), page); 476 477 if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) { 478 SetPageActive(page); 479 lru_cache_add(page); 480 return; 481 } 482 483 if (!TestSetPageMlocked(page)) { 484 /* 485 * We use the irq-unsafe __mod_zone_page_stat because this 486 * counter is not modified from interrupt context, and the pte 487 * lock is held(spinlock), which implies preemption disabled. 488 */ 489 __mod_zone_page_state(page_zone(page), NR_MLOCK, 490 hpage_nr_pages(page)); 491 count_vm_event(UNEVICTABLE_PGMLOCKED); 492 } 493 add_page_to_unevictable_list(page); 494 } 495 496 /* 497 * If the page can not be invalidated, it is moved to the 498 * inactive list to speed up its reclaim. It is moved to the 499 * head of the list, rather than the tail, to give the flusher 500 * threads some time to write it out, as this is much more 501 * effective than the single-page writeout from reclaim. 502 * 503 * If the page isn't page_mapped and dirty/writeback, the page 504 * could reclaim asap using PG_reclaim. 505 * 506 * 1. active, mapped page -> none 507 * 2. active, dirty/writeback page -> inactive, head, PG_reclaim 508 * 3. inactive, mapped page -> none 509 * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim 510 * 5. inactive, clean -> inactive, tail 511 * 6. Others -> none 512 * 513 * In 4, why it moves inactive's head, the VM expects the page would 514 * be write it out by flusher threads as this is much more effective 515 * than the single-page writeout from reclaim. 516 */ 517 static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, 518 void *arg) 519 { 520 int lru, file; 521 bool active; 522 523 if (!PageLRU(page)) 524 return; 525 526 if (PageUnevictable(page)) 527 return; 528 529 /* Some processes are using the page */ 530 if (page_mapped(page)) 531 return; 532 533 active = PageActive(page); 534 file = page_is_file_cache(page); 535 lru = page_lru_base_type(page); 536 537 del_page_from_lru_list(page, lruvec, lru + active); 538 ClearPageActive(page); 539 ClearPageReferenced(page); 540 add_page_to_lru_list(page, lruvec, lru); 541 542 if (PageWriteback(page) || PageDirty(page)) { 543 /* 544 * PG_reclaim could be raced with end_page_writeback 545 * It can make readahead confusing. But race window 546 * is _really_ small and it's non-critical problem. 547 */ 548 SetPageReclaim(page); 549 } else { 550 /* 551 * The page's writeback ends up during pagevec 552 * We moves tha page into tail of inactive. 553 */ 554 list_move_tail(&page->lru, &lruvec->lists[lru]); 555 __count_vm_event(PGROTATED); 556 } 557 558 if (active) 559 __count_vm_event(PGDEACTIVATE); 560 update_page_reclaim_stat(lruvec, file, 0); 561 } 562 563 564 static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, 565 void *arg) 566 { 567 if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { 568 int file = page_is_file_cache(page); 569 int lru = page_lru_base_type(page); 570 571 del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); 572 ClearPageActive(page); 573 ClearPageReferenced(page); 574 add_page_to_lru_list(page, lruvec, lru); 575 576 __count_vm_event(PGDEACTIVATE); 577 update_page_reclaim_stat(lruvec, file, 0); 578 } 579 } 580 581 /* 582 * Drain pages out of the cpu's pagevecs. 583 * Either "cpu" is the current CPU, and preemption has already been 584 * disabled; or "cpu" is being hot-unplugged, and is already dead. 585 */ 586 void lru_add_drain_cpu(int cpu) 587 { 588 struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu); 589 590 if (pagevec_count(pvec)) 591 __pagevec_lru_add(pvec); 592 593 pvec = &per_cpu(lru_rotate_pvecs, cpu); 594 if (pagevec_count(pvec)) { 595 unsigned long flags; 596 597 /* No harm done if a racing interrupt already did this */ 598 local_irq_save(flags); 599 pagevec_move_tail(pvec); 600 local_irq_restore(flags); 601 } 602 603 pvec = &per_cpu(lru_deactivate_file_pvecs, cpu); 604 if (pagevec_count(pvec)) 605 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); 606 607 pvec = &per_cpu(lru_deactivate_pvecs, cpu); 608 if (pagevec_count(pvec)) 609 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); 610 611 activate_page_drain(cpu); 612 } 613 614 /** 615 * deactivate_file_page - forcefully deactivate a file page 616 * @page: page to deactivate 617 * 618 * This function hints the VM that @page is a good reclaim candidate, 619 * for example if its invalidation fails due to the page being dirty 620 * or under writeback. 621 */ 622 void deactivate_file_page(struct page *page) 623 { 624 /* 625 * In a workload with many unevictable page such as mprotect, 626 * unevictable page deactivation for accelerating reclaim is pointless. 627 */ 628 if (PageUnevictable(page)) 629 return; 630 631 if (likely(get_page_unless_zero(page))) { 632 struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs); 633 634 if (!pagevec_add(pvec, page) || PageCompound(page)) 635 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); 636 put_cpu_var(lru_deactivate_file_pvecs); 637 } 638 } 639 640 /** 641 * deactivate_page - deactivate a page 642 * @page: page to deactivate 643 * 644 * deactivate_page() moves @page to the inactive list if @page was on the active 645 * list and was not an unevictable page. This is done to accelerate the reclaim 646 * of @page. 647 */ 648 void deactivate_page(struct page *page) 649 { 650 if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { 651 struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); 652 653 get_page(page); 654 if (!pagevec_add(pvec, page) || PageCompound(page)) 655 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); 656 put_cpu_var(lru_deactivate_pvecs); 657 } 658 } 659 660 void lru_add_drain(void) 661 { 662 lru_add_drain_cpu(get_cpu()); 663 put_cpu(); 664 } 665 666 static void lru_add_drain_per_cpu(struct work_struct *dummy) 667 { 668 lru_add_drain(); 669 } 670 671 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); 672 673 void lru_add_drain_all(void) 674 { 675 static DEFINE_MUTEX(lock); 676 static struct cpumask has_work; 677 int cpu; 678 679 /* 680 * Make sure nobody triggers this path before mm_percpu_wq is fully 681 * initialized. 682 */ 683 if (WARN_ON(!mm_percpu_wq)) 684 return; 685 686 mutex_lock(&lock); 687 get_online_cpus(); 688 cpumask_clear(&has_work); 689 690 for_each_online_cpu(cpu) { 691 struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); 692 693 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || 694 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || 695 pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || 696 pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || 697 need_activate_page_drain(cpu)) { 698 INIT_WORK(work, lru_add_drain_per_cpu); 699 queue_work_on(cpu, mm_percpu_wq, work); 700 cpumask_set_cpu(cpu, &has_work); 701 } 702 } 703 704 for_each_cpu(cpu, &has_work) 705 flush_work(&per_cpu(lru_add_drain_work, cpu)); 706 707 put_online_cpus(); 708 mutex_unlock(&lock); 709 } 710 711 /** 712 * release_pages - batched put_page() 713 * @pages: array of pages to release 714 * @nr: number of pages 715 * @cold: whether the pages are cache cold 716 * 717 * Decrement the reference count on all the pages in @pages. If it 718 * fell to zero, remove the page from the LRU and free it. 719 */ 720 void release_pages(struct page **pages, int nr, bool cold) 721 { 722 int i; 723 LIST_HEAD(pages_to_free); 724 struct pglist_data *locked_pgdat = NULL; 725 struct lruvec *lruvec; 726 unsigned long uninitialized_var(flags); 727 unsigned int uninitialized_var(lock_batch); 728 729 for (i = 0; i < nr; i++) { 730 struct page *page = pages[i]; 731 732 /* 733 * Make sure the IRQ-safe lock-holding time does not get 734 * excessive with a continuous string of pages from the 735 * same pgdat. The lock is held only if pgdat != NULL. 736 */ 737 if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) { 738 spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); 739 locked_pgdat = NULL; 740 } 741 742 if (is_huge_zero_page(page)) 743 continue; 744 745 page = compound_head(page); 746 if (!put_page_testzero(page)) 747 continue; 748 749 if (PageCompound(page)) { 750 if (locked_pgdat) { 751 spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); 752 locked_pgdat = NULL; 753 } 754 __put_compound_page(page); 755 continue; 756 } 757 758 if (PageLRU(page)) { 759 struct pglist_data *pgdat = page_pgdat(page); 760 761 if (pgdat != locked_pgdat) { 762 if (locked_pgdat) 763 spin_unlock_irqrestore(&locked_pgdat->lru_lock, 764 flags); 765 lock_batch = 0; 766 locked_pgdat = pgdat; 767 spin_lock_irqsave(&locked_pgdat->lru_lock, flags); 768 } 769 770 lruvec = mem_cgroup_page_lruvec(page, locked_pgdat); 771 VM_BUG_ON_PAGE(!PageLRU(page), page); 772 __ClearPageLRU(page); 773 del_page_from_lru_list(page, lruvec, page_off_lru(page)); 774 } 775 776 /* Clear Active bit in case of parallel mark_page_accessed */ 777 __ClearPageActive(page); 778 __ClearPageWaiters(page); 779 780 list_add(&page->lru, &pages_to_free); 781 } 782 if (locked_pgdat) 783 spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); 784 785 mem_cgroup_uncharge_list(&pages_to_free); 786 free_hot_cold_page_list(&pages_to_free, cold); 787 } 788 EXPORT_SYMBOL(release_pages); 789 790 /* 791 * The pages which we're about to release may be in the deferred lru-addition 792 * queues. That would prevent them from really being freed right now. That's 793 * OK from a correctness point of view but is inefficient - those pages may be 794 * cache-warm and we want to give them back to the page allocator ASAP. 795 * 796 * So __pagevec_release() will drain those queues here. __pagevec_lru_add() 797 * and __pagevec_lru_add_active() call release_pages() directly to avoid 798 * mutual recursion. 799 */ 800 void __pagevec_release(struct pagevec *pvec) 801 { 802 lru_add_drain(); 803 release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); 804 pagevec_reinit(pvec); 805 } 806 EXPORT_SYMBOL(__pagevec_release); 807 808 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 809 /* used by __split_huge_page_refcount() */ 810 void lru_add_page_tail(struct page *page, struct page *page_tail, 811 struct lruvec *lruvec, struct list_head *list) 812 { 813 const int file = 0; 814 815 VM_BUG_ON_PAGE(!PageHead(page), page); 816 VM_BUG_ON_PAGE(PageCompound(page_tail), page); 817 VM_BUG_ON_PAGE(PageLRU(page_tail), page); 818 VM_BUG_ON(NR_CPUS != 1 && 819 !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock)); 820 821 if (!list) 822 SetPageLRU(page_tail); 823 824 if (likely(PageLRU(page))) 825 list_add_tail(&page_tail->lru, &page->lru); 826 else if (list) { 827 /* page reclaim is reclaiming a huge page */ 828 get_page(page_tail); 829 list_add_tail(&page_tail->lru, list); 830 } else { 831 struct list_head *list_head; 832 /* 833 * Head page has not yet been counted, as an hpage, 834 * so we must account for each subpage individually. 835 * 836 * Use the standard add function to put page_tail on the list, 837 * but then correct its position so they all end up in order. 838 */ 839 add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail)); 840 list_head = page_tail->lru.prev; 841 list_move_tail(&page_tail->lru, list_head); 842 } 843 844 if (!PageUnevictable(page)) 845 update_page_reclaim_stat(lruvec, file, PageActive(page_tail)); 846 } 847 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 848 849 static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, 850 void *arg) 851 { 852 int file = page_is_file_cache(page); 853 int active = PageActive(page); 854 enum lru_list lru = page_lru(page); 855 856 VM_BUG_ON_PAGE(PageLRU(page), page); 857 858 SetPageLRU(page); 859 add_page_to_lru_list(page, lruvec, lru); 860 update_page_reclaim_stat(lruvec, file, active); 861 trace_mm_lru_insertion(page, lru); 862 } 863 864 /* 865 * Add the passed pages to the LRU, then drop the caller's refcount 866 * on them. Reinitialises the caller's pagevec. 867 */ 868 void __pagevec_lru_add(struct pagevec *pvec) 869 { 870 pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL); 871 } 872 EXPORT_SYMBOL(__pagevec_lru_add); 873 874 /** 875 * pagevec_lookup_entries - gang pagecache lookup 876 * @pvec: Where the resulting entries are placed 877 * @mapping: The address_space to search 878 * @start: The starting entry index 879 * @nr_entries: The maximum number of entries 880 * @indices: The cache indices corresponding to the entries in @pvec 881 * 882 * pagevec_lookup_entries() will search for and return a group of up 883 * to @nr_entries pages and shadow entries in the mapping. All 884 * entries are placed in @pvec. pagevec_lookup_entries() takes a 885 * reference against actual pages in @pvec. 886 * 887 * The search returns a group of mapping-contiguous entries with 888 * ascending indexes. There may be holes in the indices due to 889 * not-present entries. 890 * 891 * pagevec_lookup_entries() returns the number of entries which were 892 * found. 893 */ 894 unsigned pagevec_lookup_entries(struct pagevec *pvec, 895 struct address_space *mapping, 896 pgoff_t start, unsigned nr_pages, 897 pgoff_t *indices) 898 { 899 pvec->nr = find_get_entries(mapping, start, nr_pages, 900 pvec->pages, indices); 901 return pagevec_count(pvec); 902 } 903 904 /** 905 * pagevec_remove_exceptionals - pagevec exceptionals pruning 906 * @pvec: The pagevec to prune 907 * 908 * pagevec_lookup_entries() fills both pages and exceptional radix 909 * tree entries into the pagevec. This function prunes all 910 * exceptionals from @pvec without leaving holes, so that it can be 911 * passed on to page-only pagevec operations. 912 */ 913 void pagevec_remove_exceptionals(struct pagevec *pvec) 914 { 915 int i, j; 916 917 for (i = 0, j = 0; i < pagevec_count(pvec); i++) { 918 struct page *page = pvec->pages[i]; 919 if (!radix_tree_exceptional_entry(page)) 920 pvec->pages[j++] = page; 921 } 922 pvec->nr = j; 923 } 924 925 /** 926 * pagevec_lookup - gang pagecache lookup 927 * @pvec: Where the resulting pages are placed 928 * @mapping: The address_space to search 929 * @start: The starting page index 930 * @nr_pages: The maximum number of pages 931 * 932 * pagevec_lookup() will search for and return a group of up to @nr_pages pages 933 * in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a 934 * reference against the pages in @pvec. 935 * 936 * The search returns a group of mapping-contiguous pages with ascending 937 * indexes. There may be holes in the indices due to not-present pages. 938 * 939 * pagevec_lookup() returns the number of pages which were found. 940 */ 941 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, 942 pgoff_t start, unsigned nr_pages) 943 { 944 pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); 945 return pagevec_count(pvec); 946 } 947 EXPORT_SYMBOL(pagevec_lookup); 948 949 unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, 950 pgoff_t *index, int tag, unsigned nr_pages) 951 { 952 pvec->nr = find_get_pages_tag(mapping, index, tag, 953 nr_pages, pvec->pages); 954 return pagevec_count(pvec); 955 } 956 EXPORT_SYMBOL(pagevec_lookup_tag); 957 958 /* 959 * Perform any setup for the swap system 960 */ 961 void __init swap_setup(void) 962 { 963 unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); 964 965 /* Use a smaller cluster for small-memory machines */ 966 if (megs < 16) 967 page_cluster = 2; 968 else 969 page_cluster = 3; 970 /* 971 * Right now other parts of the system means that we 972 * _really_ don't want to cluster much more 973 */ 974 } 975