/*
 * linux/mm/swap.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void __page_cache_release(struct page *page)
{
        if (PageLRU(page)) {
                struct zone *zone = page_zone(page);
                struct lruvec *lruvec;
                unsigned long flags;

                spin_lock_irqsave(&zone->lru_lock, flags);
                lruvec = mem_cgroup_page_lruvec(page, zone);
                VM_BUG_ON(!PageLRU(page));
                __ClearPageLRU(page);
                del_page_from_lru_list(page, lruvec, page_off_lru(page));
                spin_unlock_irqrestore(&zone->lru_lock, flags);
        }
}

static void __put_single_page(struct page *page)
{
        __page_cache_release(page);
        free_hot_cold_page(page, 0);
}

static void __put_compound_page(struct page *page)
{
        compound_page_dtor *dtor;

        __page_cache_release(page);
        dtor = get_compound_page_dtor(page);
        (*dtor)(page);
}

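/*
 * Drop a reference to a compound page.  THP tail pages need special care:
 * __split_huge_page_refcount() can redistribute the compound page's
 * references at any time, so tail/head refcounts are adjusted under the
 * compound lock, except for slab and hugetlbfs pages, which THP can
 * never split.
 */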
static void put_compound_page(struct page *page)
{
        if (unlikely(PageTail(page))) {
                /* __split_huge_page_refcount can run under us */
                struct page *page_head = compound_trans_head(page);

                if (likely(page != page_head &&
                           get_page_unless_zero(page_head))) {
                        unsigned long flags;

                        /*
                         * THP can not break up slab pages so avoid taking
                         * compound_lock().  Slab performs non-atomic bit ops
                         * on page->flags for better performance.  In particular
                         * slab_unlock() in slub used to be a hot path.  It is
                         * still hot on arches that do not support
                         * this_cpu_cmpxchg_double().
                         */
                        if (PageSlab(page_head) || PageHeadHuge(page_head)) {
                                if (likely(PageTail(page))) {
                                        /*
                                         * __split_huge_page_refcount
                                         * cannot race here.
                                         */
                                        VM_BUG_ON(!PageHead(page_head));
                                        atomic_dec(&page->_mapcount);
                                        if (put_page_testzero(page_head))
                                                VM_BUG_ON(1);
                                        if (put_page_testzero(page_head))
                                                __put_compound_page(page_head);
                                        return;
                                } else
                                        /*
                                         * __split_huge_page_refcount
                                         * run before us, "page" was a
                                         * THP tail. The split
                                         * page_head has been freed
                                         * and reallocated as slab or
                                         * hugetlbfs page of smaller
                                         * order (only possible if
                                         * reallocated as slab on
                                         * x86).
                                         */
                                        goto skip_lock;
                        }
                        /*
                         * page_head wasn't a dangling pointer but it
                         * may not be a head page anymore by the time
                         * we obtain the lock. That is ok as long as it
                         * can't be freed from under us.
                         */
                        flags = compound_lock_irqsave(page_head);
                        if (unlikely(!PageTail(page))) {
                                /* __split_huge_page_refcount run before us */
                                compound_unlock_irqrestore(page_head, flags);
skip_lock:
                                if (put_page_testzero(page_head)) {
                                        /*
                                         * The head page may have been
                                         * freed and reallocated as a
                                         * compound page of smaller
                                         * order and then freed again.
                                         * All we know is that it
                                         * cannot have become: a THP
                                         * page, a compound page of
                                         * higher order, a tail page.
                                         * That is because we still
                                         * hold the refcount of the
                                         * split THP tail and
                                         * page_head was the THP head
                                         * before the split.
                                         */
                                        if (PageHead(page_head))
                                                __put_compound_page(page_head);
                                        else
                                                __put_single_page(page_head);
                                }
out_put_single:
                                if (put_page_testzero(page))
                                        __put_single_page(page);
                                return;
                        }
                        VM_BUG_ON(page_head != page->first_page);
                        /*
                         * We can release the refcount taken by
                         * get_page_unless_zero() now that
                         * __split_huge_page_refcount() is blocked on
                         * the compound_lock.
                         */
                        if (put_page_testzero(page_head))
                                VM_BUG_ON(1);
                        /* __split_huge_page_refcount will wait now */
                        VM_BUG_ON(page_mapcount(page) <= 0);
                        atomic_dec(&page->_mapcount);
                        VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
                        VM_BUG_ON(atomic_read(&page->_count) != 0);
                        compound_unlock_irqrestore(page_head, flags);

                        if (put_page_testzero(page_head)) {
                                if (PageHead(page_head))
                                        __put_compound_page(page_head);
                                else
                                        __put_single_page(page_head);
                        }
                } else {
                        /* page_head is a dangling pointer */
                        VM_BUG_ON(PageTail(page));
                        goto out_put_single;
                }
        } else if (put_page_testzero(page)) {
                if (PageHead(page))
                        __put_compound_page(page);
                else
                        __put_single_page(page);
        }
}

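/*
 * Release a reference to a page.  When the last reference is dropped the
 * page is returned to the page allocator; compound pages take the
 * put_compound_page() path, which also handles THP tail pages.
 */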
void put_page(struct page *page)
{
        if (unlikely(PageCompound(page)))
                put_compound_page(page);
        else if (put_page_testzero(page))
                __put_single_page(page);
}
EXPORT_SYMBOL(put_page);

/*
 * This function is exported but must not be called by anything other
 * than get_page(). It implements the slow path of get_page().
 */
bool __get_page_tail(struct page *page)
{
        /*
         * This takes care of get_page() if run on a tail page
         * returned by one of the get_user_pages/follow_page variants.
         * get_user_pages/follow_page itself doesn't need the compound
         * lock because it runs __get_page_tail_foll() under the
         * proper PT lock that already serializes against
         * split_huge_page().
         */
        unsigned long flags;
        bool got = false;
        struct page *page_head = compound_trans_head(page);

        if (likely(page != page_head && get_page_unless_zero(page_head))) {
                /* See the comment in put_compound_page(). */
                if (PageSlab(page_head) || PageHeadHuge(page_head)) {
                        if (likely(PageTail(page))) {
                                /*
                                 * This is a hugetlbfs page or a slab
                                 * page. __split_huge_page_refcount
                                 * cannot race here.
                                 */
                                VM_BUG_ON(!PageHead(page_head));
                                __get_page_tail_foll(page, false);
                                return true;
                        } else {
                                /*
                                 * __split_huge_page_refcount run
                                 * before us, "page" was a THP
                                 * tail. The split page_head has been
                                 * freed and reallocated as slab or
                                 * hugetlbfs page of smaller order
                                 * (only possible if reallocated as
                                 * slab on x86).
                                 */
                                put_page(page_head);
                                return false;
                        }
                }

                /*
                 * page_head wasn't a dangling pointer but it
                 * may not be a head page anymore by the time
                 * we obtain the lock. That is ok as long as it
                 * can't be freed from under us.
                 */
                flags = compound_lock_irqsave(page_head);
                /* here __split_huge_page_refcount won't run anymore */
                if (likely(PageTail(page))) {
                        __get_page_tail_foll(page, false);
                        got = true;
                }
                compound_unlock_irqrestore(page_head, flags);
                if (unlikely(!got))
                        put_page(page_head);
        }
        return got;
}
EXPORT_SYMBOL(__get_page_tail);

/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page.lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
        while (!list_empty(pages)) {
                struct page *victim;

                victim = list_entry(pages->prev, struct page, lru);
                list_del(&victim->lru);
                page_cache_release(victim);
        }
}
EXPORT_SYMBOL(put_pages_list);

/*
 * get_kernel_pages() - pin kernel pages in memory
 * @kiov: An array of struct kvec structures
 * @nr_segs: number of segments to pin
 * @write: pinning for read/write, currently ignored
 * @pages: array that receives pointers to the pages pinned.
 *         Should be at least nr_segs long.
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_segs is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
 * with a put_page() call when it is finished with.
 */
int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
                struct page **pages)
{
        int seg;

        for (seg = 0; seg < nr_segs; seg++) {
                if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
                        return seg;

                pages[seg] = kmap_to_page(kiov[seg].iov_base);
                page_cache_get(pages[seg]);
        }

        return seg;
}
EXPORT_SYMBOL_GPL(get_kernel_pages);

/*
 * get_kernel_page() - pin a kernel page in memory
 * @start: starting kernel address
 * @write: pinning for read/write, currently ignored
 * @pages: array that receives pointer to the page pinned.
 *         Must be at least nr_segs long.
 *
 * Returns 1 if page is pinned. If the page was not pinned, returns
 * -errno. The page returned must be released with a put_page() call
 * when it is finished with.
 */
int get_kernel_page(unsigned long start, int write, struct page **pages)
{
        const struct kvec kiov = {
                .iov_base = (void *)start,
                .iov_len = PAGE_SIZE
        };

        return get_kernel_pages(&kiov, 1, write, pages);
}
EXPORT_SYMBOL_GPL(get_kernel_page);

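/*
 * Apply @move_fn to every page in @pvec under the owning zone's lru_lock
 * (dropping and re-taking the lock whenever the zone changes), then drop
 * the pagevec's page references and reinitialise it.
 */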
static void pagevec_lru_move_fn(struct pagevec *pvec,
        void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
        void *arg)
{
        int i;
        struct zone *zone = NULL;
        struct lruvec *lruvec;
        unsigned long flags = 0;

        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];
                struct zone *pagezone = page_zone(page);

                if (pagezone != zone) {
                        if (zone)
                                spin_unlock_irqrestore(&zone->lru_lock, flags);
                        zone = pagezone;
                        spin_lock_irqsave(&zone->lru_lock, flags);
                }

                lruvec = mem_cgroup_page_lruvec(page, zone);
                (*move_fn)(page, lruvec, arg);
        }
        if (zone)
                spin_unlock_irqrestore(&zone->lru_lock, flags);
        release_pages(pvec->pages, pvec->nr, pvec->cold);
        pagevec_reinit(pvec);
}

static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
                                 void *arg)
{
        int *pgmoved = arg;

        if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
                enum lru_list lru = page_lru_base_type(page);
                list_move_tail(&page->lru, &lruvec->lists[lru]);
                (*pgmoved)++;
        }
}

/*
 * pagevec_move_tail() must be called with IRQ disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
        int pgmoved = 0;

        pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
        __count_vm_events(PGROTATED, pgmoved);
}

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 * inactive list.
 */
void rotate_reclaimable_page(struct page *page)
{
        if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
            !PageUnevictable(page) && PageLRU(page)) {
                struct pagevec *pvec;
                unsigned long flags;

                page_cache_get(page);
                local_irq_save(flags);
                pvec = &__get_cpu_var(lru_rotate_pvecs);
                if (!pagevec_add(pvec, page))
                        pagevec_move_tail(pvec);
                local_irq_restore(flags);
        }
}

static void update_page_reclaim_stat(struct lruvec *lruvec,
                                     int file, int rotated)
{
        struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;

        reclaim_stat->recent_scanned[file]++;
        if (rotated)
                reclaim_stat->recent_rotated[file]++;
}

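/*
 * Pagevec callback: move a page from its inactive list to the matching
 * active list and account the activation.  Called with the zone's
 * lru_lock held.
 */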
static void __activate_page(struct page *page, struct lruvec *lruvec,
                            void *arg)
{
        if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
                int file = page_is_file_cache(page);
                int lru = page_lru_base_type(page);

                del_page_from_lru_list(page, lruvec, lru);
                SetPageActive(page);
                lru += LRU_ACTIVE;
                add_page_to_lru_list(page, lruvec, lru);
                trace_mm_lru_activate(page, page_to_pfn(page));

                __count_vm_event(PGACTIVATE);
                update_page_reclaim_stat(lruvec, file, 1);
        }
}

#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);

static void activate_page_drain(int cpu)
{
        struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);

        if (pagevec_count(pvec))
                pagevec_lru_move_fn(pvec, __activate_page, NULL);
}

static bool need_activate_page_drain(int cpu)
{
        return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
}

void activate_page(struct page *page)
{
        if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
                struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);

                page_cache_get(page);
                if (!pagevec_add(pvec, page))
                        pagevec_lru_move_fn(pvec, __activate_page, NULL);
                put_cpu_var(activate_page_pvecs);
        }
}

#else
static inline void activate_page_drain(int cpu)
{
}

static bool need_activate_page_drain(int cpu)
{
        return false;
}

void activate_page(struct page *page)
{
        struct zone *zone = page_zone(page);

        spin_lock_irq(&zone->lru_lock);
        __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
        spin_unlock_irq(&zone->lru_lock);
}
#endif

static void __lru_cache_activate_page(struct page *page)
{
        struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
        int i;

        /*
         * Search backwards on the optimistic assumption that the page being
         * activated has just been added to this pagevec. Note that only
         * the local pagevec is examined as a !PageLRU page could be in the
         * process of being released, reclaimed, migrated or on a remote
         * pagevec that is currently being drained. Furthermore, marking
         * a remote pagevec's page PageActive potentially hits a race where
         * a page is marked PageActive just after it is added to the inactive
         * list causing accounting errors and BUG_ON checks to trigger.
         */
        for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
                struct page *pagevec_page = pvec->pages[i];

                if (pagevec_page == page) {
                        SetPageActive(page);
                        break;
                }
        }

        put_cpu_var(lru_add_pvec);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced -> inactive,referenced
 * inactive,referenced   -> active,unreferenced
 * active,unreferenced   -> active,referenced
 */
void mark_page_accessed(struct page *page)
{
        if (!PageActive(page) && !PageUnevictable(page) &&
                        PageReferenced(page)) {

                /*
                 * If the page is on the LRU, queue it for activation via
                 * activate_page_pvecs. Otherwise, assume the page is on a
                 * pagevec, mark it active and it'll be moved to the active
                 * LRU on the next drain.
                 */
                if (PageLRU(page))
                        activate_page(page);
                else
                        __lru_cache_activate_page(page);
                ClearPageReferenced(page);
        } else if (!PageReferenced(page)) {
                SetPageReferenced(page);
        }
}
EXPORT_SYMBOL(mark_page_accessed);

/*
 * Queue the page for addition to the LRU via pagevec. The decision on whether
 * to add the page to the [in]active [file|anon] list is deferred until the
 * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
 * to have the page added to the active list using mark_page_accessed().
 */
void __lru_cache_add(struct page *page)
{
        struct pagevec *pvec = &get_cpu_var(lru_add_pvec);

        page_cache_get(page);
        if (!pagevec_space(pvec))
                __pagevec_lru_add(pvec);
        pagevec_add(pvec, page);
        put_cpu_var(lru_add_pvec);
}
EXPORT_SYMBOL(__lru_cache_add);

/**
 * lru_cache_add - add a page to a page list
 * @page: the page to be added to the LRU.
 */
void lru_cache_add(struct page *page)
{
        VM_BUG_ON(PageActive(page) && PageUnevictable(page));
        VM_BUG_ON(PageLRU(page));
        __lru_cache_add(page);
}

/**
 * add_page_to_unevictable_list - add a page to the unevictable list
 * @page: the page to be added to the unevictable list
 *
 * Add page directly to its zone's unevictable list.  To avoid races with
 * tasks that might be making the page evictable, through eg. munlock,
 * munmap or exit, while it's not on the lru, we want to add the page
 * while it's locked or otherwise "invisible" to other tasks.  This is
 * difficult to do when using the pagevec cache, so bypass that.
 */
void add_page_to_unevictable_list(struct page *page)
{
        struct zone *zone = page_zone(page);
        struct lruvec *lruvec;

        spin_lock_irq(&zone->lru_lock);
        lruvec = mem_cgroup_page_lruvec(page, zone);
        ClearPageActive(page);
        SetPageUnevictable(page);
        SetPageLRU(page);
        add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
        spin_unlock_irq(&zone->lru_lock);
}

/*
 * If the page can not be invalidated, it is moved to the
 * inactive list to speed up its reclaim.  It is moved to the
 * head of the list, rather than the tail, to give the flusher
 * threads some time to write it out, as this is much more
 * effective than the single-page writeout from reclaim.
 *
 * If the page isn't mapped but is dirty or under writeback,
 * it can be reclaimed asap using PG_reclaim.
 *
 * 1. active, mapped page -> none
 * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
 * 3. inactive, mapped page -> none
 * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
 * 5. inactive, clean -> inactive, tail
 * 6. Others -> none
 *
 * In case 4, the page is moved to the head of the inactive list because
 * the VM expects the flusher threads to write it out soon, as this is much
 * more effective than the single-page writeout from reclaim.
 */
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
                              void *arg)
{
        int lru, file;
        bool active;

        if (!PageLRU(page))
                return;

        if (PageUnevictable(page))
                return;

        /* Some processes are using the page */
        if (page_mapped(page))
                return;

        active = PageActive(page);
        file = page_is_file_cache(page);
        lru = page_lru_base_type(page);

        del_page_from_lru_list(page, lruvec, lru + active);
        ClearPageActive(page);
        ClearPageReferenced(page);
        add_page_to_lru_list(page, lruvec, lru);

        if (PageWriteback(page) || PageDirty(page)) {
                /*
                 * PG_reclaim can race with end_page_writeback().
                 * That can confuse readahead, but the race window
                 * is _really_ small and it's a non-critical problem.
                 */
                SetPageReclaim(page);
        } else {
                /*
                 * The page's writeback already completed while it sat
                 * in the pagevec, so move the page to the tail of the
                 * inactive list.
                 */
                list_move_tail(&page->lru, &lruvec->lists[lru]);
                __count_vm_event(PGROTATED);
        }

        if (active)
                __count_vm_event(PGDEACTIVATE);
        update_page_reclaim_stat(lruvec, file, 0);
}

/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
void lru_add_drain_cpu(int cpu)
{
        struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);

        if (pagevec_count(pvec))
                __pagevec_lru_add(pvec);

        pvec = &per_cpu(lru_rotate_pvecs, cpu);
        if (pagevec_count(pvec)) {
                unsigned long flags;

                /* No harm done if a racing interrupt already did this */
                local_irq_save(flags);
                pagevec_move_tail(pvec);
                local_irq_restore(flags);
        }

        pvec = &per_cpu(lru_deactivate_pvecs, cpu);
        if (pagevec_count(pvec))
                pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);

        activate_page_drain(cpu);
}

/**
 * deactivate_page - forcefully deactivate a page
 * @page: page to deactivate
 *
 * This function hints the VM that @page is a good reclaim candidate,
 * for example if its invalidation fails due to the page being dirty
 * or under writeback.
 */
void deactivate_page(struct page *page)
{
        /*
         * In a workload with many unevictable pages (such as mprotect),
         * deactivating unevictable pages to accelerate reclaim is pointless.
         */
        if (PageUnevictable(page))
                return;

        if (likely(get_page_unless_zero(page))) {
                struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);

                if (!pagevec_add(pvec, page))
                        pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
                put_cpu_var(lru_deactivate_pvecs);
        }
}

void lru_add_drain(void)
{
        lru_add_drain_cpu(get_cpu());
        put_cpu();
}

static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
        lru_add_drain();
}

static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);

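/*
 * Drain the per-cpu pagevecs system-wide: schedule lru_add_drain() on
 * every CPU that currently has pages queued and wait for all of that
 * work to complete.
 */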
void lru_add_drain_all(void)
{
        static DEFINE_MUTEX(lock);
        static struct cpumask has_work;
        int cpu;

        mutex_lock(&lock);
        get_online_cpus();
        cpumask_clear(&has_work);

        for_each_online_cpu(cpu) {
                struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);

                if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
                    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
                    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
                    need_activate_page_drain(cpu)) {
                        INIT_WORK(work, lru_add_drain_per_cpu);
                        schedule_work_on(cpu, work);
                        cpumask_set_cpu(cpu, &has_work);
                }
        }

        for_each_cpu(cpu, &has_work)
                flush_work(&per_cpu(lru_add_drain_work, cpu));

        put_online_cpus();
        mutex_unlock(&lock);
}

/*
 * Batched page_cache_release().  Decrement the reference count on all the
 * passed pages.  If it fell to zero then remove the page from the LRU and
 * free it.
 *
 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 * for the remainder of the operation.
 *
 * The locking in this function is against shrink_inactive_list(): we recheck
 * the page count inside the lock to see whether shrink_inactive_list()
 * grabbed the page via the LRU.  If it did, give up: shrink_inactive_list()
 * will free it.
 */
void release_pages(struct page **pages, int nr, int cold)
{
        int i;
        LIST_HEAD(pages_to_free);
        struct zone *zone = NULL;
        struct lruvec *lruvec;
        unsigned long uninitialized_var(flags);

        for (i = 0; i < nr; i++) {
                struct page *page = pages[i];

                if (unlikely(PageCompound(page))) {
                        if (zone) {
                                spin_unlock_irqrestore(&zone->lru_lock, flags);
                                zone = NULL;
                        }
                        put_compound_page(page);
                        continue;
                }

                if (!put_page_testzero(page))
                        continue;

                if (PageLRU(page)) {
                        struct zone *pagezone = page_zone(page);

                        if (pagezone != zone) {
                                if (zone)
                                        spin_unlock_irqrestore(&zone->lru_lock,
                                                               flags);
                                zone = pagezone;
                                spin_lock_irqsave(&zone->lru_lock, flags);
                        }

                        lruvec = mem_cgroup_page_lruvec(page, zone);
                        VM_BUG_ON(!PageLRU(page));
                        __ClearPageLRU(page);
                        del_page_from_lru_list(page, lruvec, page_off_lru(page));
                }

                /* Clear Active bit in case of parallel mark_page_accessed */
                ClearPageActive(page);

                list_add(&page->lru, &pages_to_free);
        }
        if (zone)
                spin_unlock_irqrestore(&zone->lru_lock, flags);

        free_hot_cold_page_list(&pages_to_free, cold);
}
EXPORT_SYMBOL(release_pages);

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
        lru_add_drain();
        release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
        pagevec_reinit(pvec);
}
EXPORT_SYMBOL(__pagevec_release);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* used by __split_huge_page_refcount() */
void lru_add_page_tail(struct page *page, struct page *page_tail,
                       struct lruvec *lruvec, struct list_head *list)
{
        const int file = 0;

        VM_BUG_ON(!PageHead(page));
        VM_BUG_ON(PageCompound(page_tail));
        VM_BUG_ON(PageLRU(page_tail));
        VM_BUG_ON(NR_CPUS != 1 &&
                  !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));

        if (!list)
                SetPageLRU(page_tail);

        if (likely(PageLRU(page)))
                list_add_tail(&page_tail->lru, &page->lru);
        else if (list) {
                /* page reclaim is reclaiming a huge page */
                get_page(page_tail);
                list_add_tail(&page_tail->lru, list);
        } else {
                struct list_head *list_head;
                /*
                 * Head page has not yet been counted, as an hpage,
                 * so we must account for each subpage individually.
                 *
                 * Use the standard add function to put page_tail on the list,
                 * but then correct its position so they all end up in order.
                 */
                add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
                list_head = page_tail->lru.prev;
                list_move_tail(&page_tail->lru, list_head);
        }

        if (!PageUnevictable(page))
                update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
                                 void *arg)
{
        int file = page_is_file_cache(page);
        int active = PageActive(page);
        enum lru_list lru = page_lru(page);

        VM_BUG_ON(PageLRU(page));

        SetPageLRU(page);
        add_page_to_lru_list(page, lruvec, lru);
        update_page_reclaim_stat(lruvec, file, active);
        trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
}

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
        pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
}
EXPORT_SYMBOL(__pagevec_lru_add);

/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec: Where the resulting pages are placed
 * @mapping: The address_space to search
 * @start: The starting page index
 * @nr_pages: The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
                pgoff_t start, unsigned nr_pages)
{
        pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
        return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup);

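/*
 * Like pagevec_lookup(), but only returns pages tagged with @tag in the
 * mapping's radix tree, starting at *@index; find_get_pages_tag() advances
 * @index past the pages returned so the traversal can be resumed.
 */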
unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
                pgoff_t *index, int tag, unsigned nr_pages)
{
        pvec->nr = find_get_pages_tag(mapping, index, tag,
                                      nr_pages, pvec->pages);
        return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_tag);

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
        unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
#ifdef CONFIG_SWAP
        int i;

        if (bdi_init(swapper_spaces[0].backing_dev_info))
                panic("Failed to init swap bdi");
        for (i = 0; i < MAX_SWAPFILES; i++) {
                spin_lock_init(&swapper_spaces[i].tree_lock);
                INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
        }
#endif

        /* Use a smaller cluster for small-memory machines */
        if (megs < 16)
                page_cluster = 2;
        else
                page_cluster = 3;
        /*
         * Right now other parts of the system mean that we
         * _really_ don't want to cluster much more.
         */
}