/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm_inline.h>
#include <linux/buffer_head.h>	/* for try_to_release_page() */
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>

#include "internal.h"

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		unsigned long flags;
		struct zone *zone = page_zone(page);

		spin_lock_irqsave(&zone->lru_lock, flags);
		VM_BUG_ON(!PageLRU(page));
		__ClearPageLRU(page);
		del_page_from_lru(zone, page);
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}
	free_hot_page(page);
}

static void put_compound_page(struct page *page)
{
	page = compound_head(page);
	if (put_page_testzero(page)) {
		compound_page_dtor *dtor;

		dtor = get_compound_page_dtor(page);
		(*dtor)(page);
	}
}

void put_page(struct page *page)
{
	if (unlikely(PageCompound(page)))
		put_compound_page(page);
	else if (put_page_testzero(page))
		__page_cache_release(page);
}
EXPORT_SYMBOL(put_page);

/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page.lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct page *victim;

		victim = list_entry(pages->prev, struct page, lru);
		list_del(&victim->lru);
		page_cache_release(victim);
	}
}
EXPORT_SYMBOL(put_pages_list);
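/*
 * Illustrative sketch, not part of this file: a typical caller threads
 * pages together on page->lru and then drops all of its references in one
 * call.  build_page_list() here is a hypothetical helper that takes a
 * reference on each page it adds; read_cache_pages() is the real in-tree
 * user of this pattern.
 *
 *	LIST_HEAD(pages);
 *
 *	build_page_list(&pages);
 *	...
 *	put_pages_list(&pages);
 */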
/*
 * pagevec_move_tail() must be called with IRQ disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
	int i;
	int pgmoved = 0;
	struct zone *zone = NULL;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock(&zone->lru_lock);
			zone = pagezone;
			spin_lock(&zone->lru_lock);
		}
		if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
			int lru = page_is_file_cache(page);
			list_move_tail(&page->lru, &zone->lru[lru].list);
			pgmoved++;
		}
	}
	if (zone)
		spin_unlock(&zone->lru_lock);
	__count_vm_events(PGROTATED, pgmoved);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 * inactive list.
 */
void rotate_reclaimable_page(struct page *page)
{
	if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
	    !PageUnevictable(page) && PageLRU(page)) {
		struct pagevec *pvec;
		unsigned long flags;

		page_cache_get(page);
		local_irq_save(flags);
		pvec = &__get_cpu_var(lru_rotate_pvecs);
		if (!pagevec_add(pvec, page))
			pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}
}

/*
 * FIXME: speed this up?
 */
void activate_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		int file = page_is_file_cache(page);
		int lru = LRU_BASE + file;
		del_page_from_lru_list(zone, page, lru);

		SetPageActive(page);
		lru += LRU_ACTIVE;
		add_page_to_lru_list(zone, page, lru);
		__count_vm_event(PGACTIVATE);
		mem_cgroup_move_lists(page, lru);

		zone->recent_rotated[!!file]++;
		zone->recent_scanned[!!file]++;
	}
	spin_unlock_irq(&zone->lru_lock);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 */
void mark_page_accessed(struct page *page)
{
	if (!PageActive(page) && !PageUnevictable(page) &&
			PageReferenced(page) && PageLRU(page)) {
		activate_page(page);
		ClearPageReferenced(page);
	} else if (!PageReferenced(page)) {
		SetPageReferenced(page);
	}
}

EXPORT_SYMBOL(mark_page_accessed);

void __lru_cache_add(struct page *page, enum lru_list lru)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];

	page_cache_get(page);
	if (!pagevec_add(pvec, page))
		____pagevec_lru_add(pvec, lru);
	put_cpu_var(lru_add_pvecs);
}
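/*
 * Illustrative sketch, not part of this file: how the two-step aging in
 * mark_page_accessed() above plays out for a page cache page that is
 * touched twice.  file_read_page() is a hypothetical read path.
 *
 *	static void file_read_page(struct page *page)
 *	{
 *		...
 *		mark_page_accessed(page);
 *	}
 *
 * The first call only sets PG_referenced (inactive,unreferenced ->
 * inactive,referenced); a second call while the page is still on the
 * inactive list promotes it via activate_page() (inactive,referenced ->
 * active,unreferenced).
 */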
/**
 * lru_cache_add_lru - add a page to a page list
 * @page: the page to be added to the LRU.
 * @lru: the LRU list to which the page is added.
 */
void lru_cache_add_lru(struct page *page, enum lru_list lru)
{
	if (PageActive(page)) {
		VM_BUG_ON(PageUnevictable(page));
		ClearPageActive(page);
	} else if (PageUnevictable(page)) {
		VM_BUG_ON(PageActive(page));
		ClearPageUnevictable(page);
	}

	VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page));
	__lru_cache_add(page, lru);
}

/**
 * add_page_to_unevictable_list - add a page to the unevictable list
 * @page:  the page to be added to the unevictable list
 *
 * Add page directly to its zone's unevictable list.  To avoid races with
 * tasks that might be making the page evictable, through eg. munlock,
 * munmap or exit, while it's not on the lru, we want to add the page
 * while it's locked or otherwise "invisible" to other tasks.  This is
 * difficult to do when using the pagevec cache, so bypass that.
 */
void add_page_to_unevictable_list(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	SetPageUnevictable(page);
	SetPageLRU(page);
	add_page_to_lru_list(zone, page, LRU_UNEVICTABLE);
	spin_unlock_irq(&zone->lru_lock);
}

/**
 * lru_cache_add_active_or_unevictable
 * @page:  the page to be added to LRU
 * @vma:   vma in which page is mapped for determining reclaimability
 *
 * Place @page on the active or unevictable LRU list, depending on
 * page_evictable().  Note that if the page is not evictable, it goes
 * directly back onto its zone's unevictable list.  It does NOT use a
 * per cpu pagevec.
 */
void lru_cache_add_active_or_unevictable(struct page *page,
					struct vm_area_struct *vma)
{
	if (page_evictable(page, vma))
		lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page));
	else
		add_page_to_unevictable_list(page);
}

/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
static void drain_cpu_pagevecs(int cpu)
{
	struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
	struct pagevec *pvec;
	int lru;

	for_each_lru(lru) {
		pvec = &pvecs[lru - LRU_BASE];
		if (pagevec_count(pvec))
			____pagevec_lru_add(pvec, lru);
	}

	pvec = &per_cpu(lru_rotate_pvecs, cpu);
	if (pagevec_count(pvec)) {
		unsigned long flags;

		/* No harm done if a racing interrupt already did this */
		local_irq_save(flags);
		pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}
}

void lru_add_drain(void)
{
	drain_cpu_pagevecs(get_cpu());
	put_cpu();
}

static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
	lru_add_drain();
}

/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
	return schedule_on_each_cpu(lru_add_drain_per_cpu);
}
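/*
 * Illustrative sketch, not part of this file: code that needs recently
 * added pages to actually be on the zone LRU lists (rather than sitting
 * in some CPU's lru_add_pvecs) drains the pagevecs first.  scan_zone_lru()
 * is a hypothetical stand-in for such a scanner.
 *
 *	int err;
 *
 *	err = lru_add_drain_all();
 *	if (err)
 *		return err;
 *	scan_zone_lru(zone);
 *
 * lru_add_drain() is the cheap, current-CPU-only variant for callers that
 * only need their own pagevecs flushed.
 */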
/*
 * Batched page_cache_release().  Decrement the reference count on all the
 * passed pages.  If it fell to zero then remove the page from the LRU and
 * free it.
 *
 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 * for the remainder of the operation.
 *
 * The locking in this function is against shrink_inactive_list(): we recheck
 * the page count inside the lock to see whether shrink_inactive_list()
 * grabbed the page via the LRU.  If it did, give up: shrink_inactive_list()
 * will free it.
 */
void release_pages(struct page **pages, int nr, int cold)
{
	int i;
	struct pagevec pages_to_free;
	struct zone *zone = NULL;
	unsigned long uninitialized_var(flags);

	pagevec_init(&pages_to_free, cold);
	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		if (unlikely(PageCompound(page))) {
			if (zone) {
				spin_unlock_irqrestore(&zone->lru_lock, flags);
				zone = NULL;
			}
			put_compound_page(page);
			continue;
		}

		if (!put_page_testzero(page))
			continue;

		if (PageLRU(page)) {
			struct zone *pagezone = page_zone(page);

			if (pagezone != zone) {
				if (zone)
					spin_unlock_irqrestore(&zone->lru_lock,
									flags);
				zone = pagezone;
				spin_lock_irqsave(&zone->lru_lock, flags);
			}
			VM_BUG_ON(!PageLRU(page));
			__ClearPageLRU(page);
			del_page_from_lru(zone, page);
		}

		if (!pagevec_add(&pages_to_free, page)) {
			if (zone) {
				spin_unlock_irqrestore(&zone->lru_lock, flags);
				zone = NULL;
			}
			__pagevec_free(&pages_to_free);
			pagevec_reinit(&pages_to_free);
		}
	}
	if (zone)
		spin_unlock_irqrestore(&zone->lru_lock, flags);

	pagevec_free(&pages_to_free);
}

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
	lru_add_drain();
	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
	pagevec_reinit(pvec);
}

EXPORT_SYMBOL(__pagevec_release);

/*
 * pagevec_release() for pages which are known to not be on the LRU
 *
 * This function reinitialises the caller's pagevec.
 */
void __pagevec_release_nonlru(struct pagevec *pvec)
{
	int i;
	struct pagevec pages_to_free;

	pagevec_init(&pages_to_free, pvec->cold);
	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		VM_BUG_ON(PageLRU(page));
		if (put_page_testzero(page))
			pagevec_add(&pages_to_free, page);
	}
	pagevec_free(&pages_to_free);
	pagevec_reinit(pvec);
}
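/*
 * Illustrative sketch, not part of this file: callers normally reach the
 * batching above through a stack-allocated pagevec, filling it with
 * referenced pages and letting pagevec_release() (which wraps
 * __pagevec_release()) drop them in bulk.  next_referenced_page() is a
 * hypothetical source of pages that already hold a reference for the
 * caller.
 *
 *	struct pagevec pvec;
 *	struct page *page;
 *
 *	pagevec_init(&pvec, 0);
 *	while ((page = next_referenced_page()) != NULL) {
 *		if (!pagevec_add(&pvec, page))
 *			pagevec_release(&pvec);
 *	}
 *	pagevec_release(&pvec);
 */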
/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
{
	int i;
	struct zone *zone = NULL;
	VM_BUG_ON(is_unevictable_lru(lru));

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);
		int file;

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irq(&zone->lru_lock);
			zone = pagezone;
			spin_lock_irq(&zone->lru_lock);
		}
		VM_BUG_ON(PageActive(page));
		VM_BUG_ON(PageUnevictable(page));
		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);
		file = is_file_lru(lru);
		zone->recent_scanned[file]++;
		if (is_active_lru(lru)) {
			SetPageActive(page);
			zone->recent_rotated[file]++;
		}
		add_page_to_lru_list(zone, page, lru);
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

EXPORT_SYMBOL(____pagevec_lru_add);

/*
 * Try to drop buffers from the pages in a pagevec
 */
void pagevec_strip(struct pagevec *pvec)
{
	int i;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		if (PagePrivate(page) && trylock_page(page)) {
			if (PagePrivate(page))
				try_to_release_page(page, 0);
			unlock_page(page);
		}
	}
}

/**
 * pagevec_swap_free - try to free swap space from the pages in a pagevec
 * @pvec: pagevec with swapcache pages to free the swap space of
 *
 * The caller needs to hold an extra reference to each page and
 * not hold the page lock on the pages.  This function uses a
 * trylock on the page lock so it may not always free the swap
 * space associated with a page.
 */
void pagevec_swap_free(struct pagevec *pvec)
{
	int i;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		if (PageSwapCache(page) && trylock_page(page)) {
			if (PageSwapCache(page))
				remove_exclusive_swap_page_ref(page);
			unlock_page(page);
		}
	}
}
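/*
 * Illustrative sketch, not part of this file: reclaim-like code that has
 * collected pages into a pagevec (holding a reference on each, with the
 * pages unlocked) can call the two trylock-based helpers above
 * opportunistically, e.g. to shed buffer heads and stale swap cache before
 * putting the pages back:
 *
 *	pagevec_strip(&pvec);
 *	pagevec_swap_free(&pvec);
 *	__pagevec_release(&pvec);
 *
 * Both helpers skip any page whose lock cannot be taken, so neither call
 * is guaranteed to free anything.
 */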
/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec:	Where the resulting pages are placed
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t start, unsigned nr_pages)
{
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
	return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup);

unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag,
					nr_pages, pvec->pages);
	return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup_tag);

#ifdef CONFIG_SMP
/*
 * We tolerate a little inaccuracy to avoid ping-ponging the counter between
 * CPUs
 */
#define ACCT_THRESHOLD	max(16, NR_CPUS * 2)

static DEFINE_PER_CPU(long, committed_space);

void vm_acct_memory(long pages)
{
	long *local;

	preempt_disable();
	local = &__get_cpu_var(committed_space);
	*local += pages;
	if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
		atomic_long_add(*local, &vm_committed_space);
		*local = 0;
	}
	preempt_enable();
}

#ifdef CONFIG_HOTPLUG_CPU

/* Drop the CPU's cached committed space back into the central pool. */
static int cpu_swap_callback(struct notifier_block *nfb,
			     unsigned long action,
			     void *hcpu)
{
	long *committed;

	committed = &per_cpu(committed_space, (long)hcpu);
	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
		atomic_long_add(*committed, &vm_committed_space);
		*committed = 0;
		drain_cpu_pagevecs((long)hcpu);
	}
	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
#endif /* CONFIG_SMP */

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
	unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);

#ifdef CONFIG_SWAP
	bdi_init(swapper_space.backing_dev_info);
#endif

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more
	 */
#ifdef CONFIG_HOTPLUG_CPU
	hotcpu_notifier(cpu_swap_callback, 0);
#endif
}
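/*
 * Illustrative sketch, not part of this file: the usual way to walk all
 * pages of a mapping with pagevec_lookup() above is a batched loop of the
 * kind used by truncate_inode_pages().  process_page() is a hypothetical
 * per-page operation.
 *
 *	struct pagevec pvec;
 *	pgoff_t index = 0;
 *	int i;
 *
 *	pagevec_init(&pvec, 0);
 *	while (pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE)) {
 *		for (i = 0; i < pagevec_count(&pvec); i++) {
 *			struct page *page = pvec.pages[i];
 *
 *			index = page->index + 1;
 *			process_page(page);
 *		}
 *		pagevec_release(&pvec);
 *	}
 */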