/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);
		struct lruvec *lruvec;
		unsigned long flags;

		spin_lock_irqsave(&zone->lru_lock, flags);
		lruvec = mem_cgroup_page_lruvec(page, zone);
		VM_BUG_ON(!PageLRU(page));
		__ClearPageLRU(page);
		del_page_from_lru_list(page, lruvec, page_off_lru(page));
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}
}

static void __put_single_page(struct page *page)
{
	__page_cache_release(page);
	free_hot_cold_page(page, 0);
}

static void __put_compound_page(struct page *page)
{
	compound_page_dtor *dtor;

	__page_cache_release(page);
	dtor = get_compound_page_dtor(page);
	(*dtor)(page);
}

static void put_compound_page(struct page *page)
{
	if (unlikely(PageTail(page))) {
		/* __split_huge_page_refcount can run under us */
		struct page *page_head = compound_trans_head(page);

		if (likely(page != page_head &&
			   get_page_unless_zero(page_head))) {
			unsigned long flags;

			/*
			 * THP can not break up slab pages so avoid taking
			 * compound_lock().  Slab performs non-atomic bit ops
			 * on page->flags for better performance.  In
			 * particular slab_unlock() in slub used to be a hot
			 * path.  It is still hot on arches that do not
			 * support this_cpu_cmpxchg_double().
			 */
			if (PageSlab(page_head)) {
				if (PageTail(page)) {
					if (put_page_testzero(page_head))
						VM_BUG_ON(1);

					atomic_dec(&page->_mapcount);
					goto skip_lock_tail;
				} else
					goto skip_lock;
			}
			/*
			 * page_head wasn't a dangling pointer but it
			 * may not be a head page anymore by the time
			 * we obtain the lock.  That is ok as long as it
			 * can't be freed from under us.
			 */
			flags = compound_lock_irqsave(page_head);
			if (unlikely(!PageTail(page))) {
				/* __split_huge_page_refcount run before us */
				compound_unlock_irqrestore(page_head, flags);
skip_lock:
				if (put_page_testzero(page_head))
					__put_single_page(page_head);
out_put_single:
				if (put_page_testzero(page))
					__put_single_page(page);
				return;
			}
			VM_BUG_ON(page_head != page->first_page);
			/*
			 * We can release the refcount taken by
			 * get_page_unless_zero() now that
			 * __split_huge_page_refcount() is blocked on
			 * the compound_lock.
			 */
			if (put_page_testzero(page_head))
				VM_BUG_ON(1);
			/* __split_huge_page_refcount will wait now */
			VM_BUG_ON(page_mapcount(page) <= 0);
			atomic_dec(&page->_mapcount);
			VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
			VM_BUG_ON(atomic_read(&page->_count) != 0);
			compound_unlock_irqrestore(page_head, flags);

skip_lock_tail:
			if (put_page_testzero(page_head)) {
				if (PageHead(page_head))
					__put_compound_page(page_head);
				else
					__put_single_page(page_head);
			}
		} else {
			/* page_head is a dangling pointer */
			VM_BUG_ON(PageTail(page));
			goto out_put_single;
		}
	} else if (put_page_testzero(page)) {
		if (PageHead(page))
			__put_compound_page(page);
		else
			__put_single_page(page);
	}
}

void put_page(struct page *page)
{
	if (unlikely(PageCompound(page)))
		put_compound_page(page);
	else if (put_page_testzero(page))
		__put_single_page(page);
}
EXPORT_SYMBOL(put_page);
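
/*
 * Illustrative sketch (added commentary, not part of the original file):
 * the usual get_page()/put_page() pairing around an asynchronous user of a
 * page.  example_async_submit() is a hypothetical consumer whose completion
 * path drops the reference it was handed.
 *
 *	get_page(page);			take an extra reference
 *	example_async_submit(page);	completion calls put_page(page)
 *	put_page(page);			drop our reference; the final put
 *					frees the page, with compound pages
 *					routed through put_compound_page()
 */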

/*
 * This function is exported but must not be called by anything other
 * than get_page(). It implements the slow path of get_page().
 */
bool __get_page_tail(struct page *page)
{
	/*
	 * This takes care of get_page() if run on a tail page
	 * returned by one of the get_user_pages/follow_page variants.
	 * get_user_pages/follow_page itself doesn't need the compound
	 * lock because it runs __get_page_tail_foll() under the
	 * proper PT lock that already serializes against
	 * split_huge_page().
	 */
	unsigned long flags;
	bool got = false;
	struct page *page_head = compound_trans_head(page);

	if (likely(page != page_head && get_page_unless_zero(page_head))) {

		/* Refer to the comment in put_compound_page(). */
		if (PageSlab(page_head)) {
			if (likely(PageTail(page))) {
				__get_page_tail_foll(page, false);
				return true;
			} else {
				put_page(page_head);
				return false;
			}
		}

		/*
		 * page_head wasn't a dangling pointer but it
		 * may not be a head page anymore by the time
		 * we obtain the lock.  That is ok as long as it
		 * can't be freed from under us.
		 */
		flags = compound_lock_irqsave(page_head);
		/* here __split_huge_page_refcount won't run anymore */
		if (likely(PageTail(page))) {
			__get_page_tail_foll(page, false);
			got = true;
		}
		compound_unlock_irqrestore(page_head, flags);
		if (unlikely(!got))
			put_page(page_head);
	}
	return got;
}
EXPORT_SYMBOL(__get_page_tail);

/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page.lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct page *victim;

		victim = list_entry(pages->prev, struct page, lru);
		list_del(&victim->lru);
		page_cache_release(victim);
	}
}
EXPORT_SYMBOL(put_pages_list);

/*
 * get_kernel_pages() - pin kernel pages in memory
 * @kiov:	An array of struct kvec structures
 * @nr_segs:	number of segments to pin
 * @write:	pinning for read/write, currently ignored
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_segs long.
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_segs is 0 or negative, returns 0.  If no pages were
 * pinned, returns -errno.  Each page returned must be released with a
 * put_page() call when it is finished with.
 */
int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
		struct page **pages)
{
	int seg;

	for (seg = 0; seg < nr_segs; seg++) {
		if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
			return seg;

		pages[seg] = kmap_to_page(kiov[seg].iov_base);
		page_cache_get(pages[seg]);
	}

	return seg;
}
EXPORT_SYMBOL_GPL(get_kernel_pages);

/*
 * get_kernel_page() - pin a kernel page in memory
 * @start:	starting kernel address
 * @write:	pinning for read/write, currently ignored
 * @pages:	array that receives pointer to the page pinned.
 *		Must be at least one entry long.
 *
 * Returns 1 if page is pinned. If the page was not pinned, returns
 * -errno. The page returned must be released with a put_page() call
 * when it is finished with.
 */
int get_kernel_page(unsigned long start, int write, struct page **pages)
{
	const struct kvec kiov = {
		.iov_base = (void *)start,
		.iov_len = PAGE_SIZE
	};

	return get_kernel_pages(&kiov, 1, write, pages);
}
EXPORT_SYMBOL_GPL(get_kernel_page);
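
/*
 * Illustrative sketch (added commentary, not part of the original file):
 * pinning the page behind a page-aligned, PAGE_SIZE kernel buffer and
 * releasing it afterwards.  "kbuf" is a hypothetical allocation.
 *
 *	struct page *page;
 *
 *	if (get_kernel_page((unsigned long)kbuf, 0, &page) == 1) {
 *		... use the page, e.g. attach it to a bio ...
 *		put_page(page);
 *	}
 */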

static void pagevec_lru_move_fn(struct pagevec *pvec,
	void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
	void *arg)
{
	int i;
	struct zone *zone = NULL;
	struct lruvec *lruvec;
	unsigned long flags = 0;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irqrestore(&zone->lru_lock, flags);
			zone = pagezone;
			spin_lock_irqsave(&zone->lru_lock, flags);
		}

		lruvec = mem_cgroup_page_lruvec(page, zone);
		(*move_fn)(page, lruvec, arg);
	}
	if (zone)
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	int *pgmoved = arg;

	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		enum lru_list lru = page_lru_base_type(page);
		list_move_tail(&page->lru, &lruvec->lists[lru]);
		(*pgmoved)++;
	}
}

/*
 * pagevec_move_tail() must be called with IRQ disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
	int pgmoved = 0;

	pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
	__count_vm_events(PGROTATED, pgmoved);
}

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 * inactive list.
 */
void rotate_reclaimable_page(struct page *page)
{
	if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
	    !PageUnevictable(page) && PageLRU(page)) {
		struct pagevec *pvec;
		unsigned long flags;

		page_cache_get(page);
		local_irq_save(flags);
		pvec = &__get_cpu_var(lru_rotate_pvecs);
		if (!pagevec_add(pvec, page))
			pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}
}
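
/*
 * Added note (not part of the original file): the main caller of
 * rotate_reclaimable_page() is the writeback completion path, roughly
 *
 *	end_page_writeback():
 *		if (TestClearPageReclaim(page))
 *			rotate_reclaimable_page(page);
 *
 * so a page that was tagged PG_reclaim while under writeback is moved to
 * the tail of the inactive list as soon as its I/O completes.
 */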

static void update_page_reclaim_stat(struct lruvec *lruvec,
				     int file, int rotated)
{
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;

	reclaim_stat->recent_scanned[file]++;
	if (rotated)
		reclaim_stat->recent_rotated[file]++;
}

static void __activate_page(struct page *page, struct lruvec *lruvec,
			    void *arg)
{
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		int file = page_is_file_cache(page);
		int lru = page_lru_base_type(page);

		del_page_from_lru_list(page, lruvec, lru);
		SetPageActive(page);
		lru += LRU_ACTIVE;
		add_page_to_lru_list(page, lruvec, lru);
		trace_mm_lru_activate(page, page_to_pfn(page));

		__count_vm_event(PGACTIVATE);
		update_page_reclaim_stat(lruvec, file, 1);
	}
}

#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);

static void activate_page_drain(int cpu)
{
	struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);

	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, __activate_page, NULL);
}

void activate_page(struct page *page)
{
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);

		page_cache_get(page);
		if (!pagevec_add(pvec, page))
			pagevec_lru_move_fn(pvec, __activate_page, NULL);
		put_cpu_var(activate_page_pvecs);
	}
}

#else
static inline void activate_page_drain(int cpu)
{
}

void activate_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	__activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
	spin_unlock_irq(&zone->lru_lock);
}
#endif

static void __lru_cache_activate_page(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
	int i;

	/*
	 * Search backwards on the optimistic assumption that the page being
	 * activated has just been added to this pagevec. Note that only
	 * the local pagevec is examined as a !PageLRU page could be in the
	 * process of being released, reclaimed, migrated or on a remote
	 * pagevec that is currently being drained. Furthermore, marking
	 * a remote pagevec's page PageActive potentially hits a race where
	 * a page is marked PageActive just after it is added to the inactive
	 * list causing accounting errors and BUG_ON checks to trigger.
	 */
	for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
		struct page *pagevec_page = pvec->pages[i];

		if (pagevec_page == page) {
			SetPageActive(page);
			break;
		}
	}

	put_cpu_var(lru_add_pvec);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 */
void mark_page_accessed(struct page *page)
{
	if (!PageActive(page) && !PageUnevictable(page) &&
			PageReferenced(page)) {

		/*
		 * If the page is on the LRU, queue it for activation via
		 * activate_page_pvecs. Otherwise, assume the page is on a
		 * pagevec, mark it active and it'll be moved to the active
		 * LRU on the next drain.
		 */
		if (PageLRU(page))
			activate_page(page);
		else
			__lru_cache_activate_page(page);
		ClearPageReferenced(page);
	} else if (!PageReferenced(page)) {
		SetPageReferenced(page);
	}
}
EXPORT_SYMBOL(mark_page_accessed);
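
/*
 * Illustrative sketch (added commentary, not part of the original file):
 * two consecutive calls against a cold page walk it through the state
 * machine documented above.
 *
 *	mark_page_accessed(page);	inactive,unreferenced
 *					-> inactive,referenced
 *	mark_page_accessed(page);	inactive,referenced
 *					-> active,unreferenced (activated
 *					directly if PageLRU, otherwise marked
 *					PageActive while still on a pagevec)
 */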

/*
 * Queue the page for addition to the LRU via pagevec. The decision on whether
 * to add the page to the [in]active [file|anon] list is deferred until the
 * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
 * to have the page added to the active list using mark_page_accessed().
 */
void __lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);

	page_cache_get(page);
	if (!pagevec_space(pvec))
		__pagevec_lru_add(pvec);
	pagevec_add(pvec, page);
	put_cpu_var(lru_add_pvec);
}
EXPORT_SYMBOL(__lru_cache_add);

/**
 * lru_cache_add - add a page to the LRU lists
 * @page: the page to be added to the LRU.
 */
void lru_cache_add(struct page *page)
{
	VM_BUG_ON(PageActive(page) && PageUnevictable(page));
	VM_BUG_ON(PageLRU(page));
	__lru_cache_add(page);
}
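
/*
 * Illustrative sketch (added commentary, not part of the original file):
 * the usual path by which a freshly allocated page reaches the LRU.  Error
 * handling is omitted; in-tree callers normally use the combined helper
 * add_to_page_cache_lru() rather than open-coding this.
 *
 *	page = page_cache_alloc_cold(mapping);
 *	if (add_to_page_cache(page, mapping, index, GFP_KERNEL) == 0)
 *		lru_cache_add(page);	queued on lru_add_pvec, added to
 *					the LRU proper on the next drain
 */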

/**
 * add_page_to_unevictable_list - add a page to the unevictable list
 * @page:  the page to be added to the unevictable list
 *
 * Add page directly to its zone's unevictable list.  To avoid races with
 * tasks that might be making the page evictable, through e.g. munlock,
 * munmap or exit, while it's not on the lru, we want to add the page
 * while it's locked or otherwise "invisible" to other tasks.  This is
 * difficult to do when using the pagevec cache, so bypass that.
 */
void add_page_to_unevictable_list(struct page *page)
{
	struct zone *zone = page_zone(page);
	struct lruvec *lruvec;

	spin_lock_irq(&zone->lru_lock);
	lruvec = mem_cgroup_page_lruvec(page, zone);
	ClearPageActive(page);
	SetPageUnevictable(page);
	SetPageLRU(page);
	add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
	spin_unlock_irq(&zone->lru_lock);
}

/*
 * If the page can not be invalidated, it is moved to the
 * inactive list to speed up its reclaim.  It is moved to the
 * head of the list, rather than the tail, to give the flusher
 * threads some time to write it out, as this is much more
 * effective than the single-page writeout from reclaim.
 *
 * If the page isn't page_mapped and dirty/writeback, the page
 * could be reclaimed asap using PG_reclaim.
 *
 * 1. active, mapped page -> none
 * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
 * 3. inactive, mapped page -> none
 * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
 * 5. inactive, clean -> inactive, tail
 * 6. Others -> none
 *
 * In case 4, the page is moved to the head of the inactive list because
 * the VM expects the flusher threads to write it out, which is much more
 * effective than the single-page writeout from reclaim.
 */
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
			      void *arg)
{
	int lru, file;
	bool active;

	if (!PageLRU(page))
		return;

	if (PageUnevictable(page))
		return;

	/* Some processes are using the page */
	if (page_mapped(page))
		return;

	active = PageActive(page);
	file = page_is_file_cache(page);
	lru = page_lru_base_type(page);

	del_page_from_lru_list(page, lruvec, lru + active);
	ClearPageActive(page);
	ClearPageReferenced(page);
	add_page_to_lru_list(page, lruvec, lru);

	if (PageWriteback(page) || PageDirty(page)) {
		/*
		 * Setting PG_reclaim can race with end_page_writeback(),
		 * which may confuse readahead.  But the race window is
		 * really small and it is a non-critical problem.
		 */
		SetPageReclaim(page);
	} else {
		/*
		 * The page's writeback ended while it was on the pagevec,
		 * so move the page to the tail of the inactive list.
		 */
		list_move_tail(&page->lru, &lruvec->lists[lru]);
		__count_vm_event(PGROTATED);
	}

	if (active)
		__count_vm_event(PGDEACTIVATE);
	update_page_reclaim_stat(lruvec, file, 0);
}

/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
void lru_add_drain_cpu(int cpu)
{
	struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);

	if (pagevec_count(pvec))
		__pagevec_lru_add(pvec);

	pvec = &per_cpu(lru_rotate_pvecs, cpu);
	if (pagevec_count(pvec)) {
		unsigned long flags;

		/* No harm done if a racing interrupt already did this */
		local_irq_save(flags);
		pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}

	pvec = &per_cpu(lru_deactivate_pvecs, cpu);
	if (pagevec_count(pvec))
		pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);

	activate_page_drain(cpu);
}

/**
 * deactivate_page - forcefully deactivate a page
 * @page: page to deactivate
 *
 * This function hints the VM that @page is a good reclaim candidate,
 * for example if its invalidation fails due to the page being dirty
 * or under writeback.
 */
void deactivate_page(struct page *page)
{
	/*
	 * In a workload with many unevictable pages (such as those created
	 * by mprotect), deactivating unevictable pages to accelerate reclaim
	 * is pointless.
	 */
	if (PageUnevictable(page))
		return;

	if (likely(get_page_unless_zero(page))) {
		struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);

		if (!pagevec_add(pvec, page))
			pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
		put_cpu_var(lru_deactivate_pvecs);
	}
}

void lru_add_drain(void)
{
	lru_add_drain_cpu(get_cpu());
	put_cpu();
}

static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
	lru_add_drain();
}

/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
	return schedule_on_each_cpu(lru_add_drain_per_cpu);
}
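
/*
 * Added note (not part of the original file): the typical caller of
 * deactivate_page() is invalidate_mapping_pages(), which roughly does
 *
 *	ret = invalidate_inode_page(page);
 *	unlock_page(page);
 *	if (!ret)
 *		deactivate_page(page);
 *
 * so pages that could not be invalidated (dirty, under writeback, ...) are
 * pushed towards reclaim via the lru_deactivate_pvecs batching above.
 */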

/*
 * Batched page_cache_release().  Decrement the reference count on all the
 * passed pages.  If it fell to zero then remove the page from the LRU and
 * free it.
 *
 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 * for the remainder of the operation.
 *
 * The locking in this function is against shrink_inactive_list(): we recheck
 * the page count inside the lock to see whether shrink_inactive_list()
 * grabbed the page via the LRU.  If it did, give up: shrink_inactive_list()
 * will free it.
 */
void release_pages(struct page **pages, int nr, int cold)
{
	int i;
	LIST_HEAD(pages_to_free);
	struct zone *zone = NULL;
	struct lruvec *lruvec;
	unsigned long uninitialized_var(flags);

	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		if (unlikely(PageCompound(page))) {
			if (zone) {
				spin_unlock_irqrestore(&zone->lru_lock, flags);
				zone = NULL;
			}
			put_compound_page(page);
			continue;
		}

		if (!put_page_testzero(page))
			continue;

		if (PageLRU(page)) {
			struct zone *pagezone = page_zone(page);

			if (pagezone != zone) {
				if (zone)
					spin_unlock_irqrestore(&zone->lru_lock,
									flags);
				zone = pagezone;
				spin_lock_irqsave(&zone->lru_lock, flags);
			}

			lruvec = mem_cgroup_page_lruvec(page, zone);
			VM_BUG_ON(!PageLRU(page));
			__ClearPageLRU(page);
			del_page_from_lru_list(page, lruvec, page_off_lru(page));
		}

		/* Clear Active bit in case of parallel mark_page_accessed */
		ClearPageActive(page);

		list_add(&page->lru, &pages_to_free);
	}
	if (zone)
		spin_unlock_irqrestore(&zone->lru_lock, flags);

	free_hot_cold_page_list(&pages_to_free, cold);
}
EXPORT_SYMBOL(release_pages);

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
	lru_add_drain();
	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
	pagevec_reinit(pvec);
}
EXPORT_SYMBOL(__pagevec_release);
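
/*
 * Illustrative sketch (added commentary, not part of the original file):
 * dropping a batch of references with a single release_pages() call rather
 * than a loop of page_cache_release().  "pages" holds references taken by a
 * gang lookup; the trailing 0 means the pages are treated as cache-hot when
 * handed back to the page allocator.
 *
 *	nr = find_get_pages(mapping, index, ARRAY_SIZE(pages), pages);
 *	... inspect the pages ...
 *	release_pages(pages, nr, 0);
 */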

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* used by __split_huge_page_refcount() */
void lru_add_page_tail(struct page *page, struct page *page_tail,
		       struct lruvec *lruvec, struct list_head *list)
{
	const int file = 0;

	VM_BUG_ON(!PageHead(page));
	VM_BUG_ON(PageCompound(page_tail));
	VM_BUG_ON(PageLRU(page_tail));
	VM_BUG_ON(NR_CPUS != 1 &&
		  !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));

	if (!list)
		SetPageLRU(page_tail);

	if (likely(PageLRU(page)))
		list_add_tail(&page_tail->lru, &page->lru);
	else if (list) {
		/* page reclaim is reclaiming a huge page */
		get_page(page_tail);
		list_add_tail(&page_tail->lru, list);
	} else {
		struct list_head *list_head;
		/*
		 * Head page has not yet been counted, as an hpage,
		 * so we must account for each subpage individually.
		 *
		 * Use the standard add function to put page_tail on the list,
		 * but then correct its position so they all end up in order.
		 */
		add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
		list_head = page_tail->lru.prev;
		list_move_tail(&page_tail->lru, list_head);
	}

	if (!PageUnevictable(page))
		update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	int file = page_is_file_cache(page);
	int active = PageActive(page);
	enum lru_list lru = page_lru(page);

	VM_BUG_ON(PageLRU(page));

	SetPageLRU(page);
	add_page_to_lru_list(page, lruvec, lru);
	update_page_reclaim_stat(lruvec, file, active);
	trace_mm_lru_insertion(page, page_to_pfn(page), lru,
			       trace_pagemap_flags(page));
}

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
	pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
}
EXPORT_SYMBOL(__pagevec_lru_add);

/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec:	Where the resulting pages are placed
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t start, unsigned nr_pages)
{
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup);

unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag,
				      nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_tag);

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
	unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
#ifdef CONFIG_SWAP
	int i;

	bdi_init(swapper_spaces[0].backing_dev_info);
	for (i = 0; i < MAX_SWAPFILES; i++) {
		spin_lock_init(&swapper_spaces[i].tree_lock);
		INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
	}
#endif

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more.
	 */
}
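
/*
 * Illustrative sketch (added commentary, not part of the original file):
 * the common pattern for walking a mapping with pagevec_lookup().  Per-page
 * locking is omitted; process_page() is a hypothetical helper.
 *
 *	struct pagevec pvec;
 *	pgoff_t index = 0;
 *	int i;
 *
 *	pagevec_init(&pvec, 0);
 *	while (pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE)) {
 *		for (i = 0; i < pagevec_count(&pvec); i++) {
 *			struct page *page = pvec.pages[i];
 *
 *			index = page->index + 1;
 *			process_page(page);
 *		}
 *		pagevec_release(&pvec);
 *		cond_resched();
 *	}
 */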