/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm_inline.h>
#include <linux/buffer_head.h>	/* for try_to_release_page() */
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, };

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void fastcall __page_cache_release(struct page *page)
{
	if (PageLRU(page)) {
		unsigned long flags;
		struct zone *zone = page_zone(page);

		spin_lock_irqsave(&zone->lru_lock, flags);
		VM_BUG_ON(!PageLRU(page));
		__ClearPageLRU(page);
		del_page_from_lru(zone, page);
		spin_unlock_irqrestore(&zone->lru_lock, flags);
	}
	free_hot_page(page);
}

static void put_compound_page(struct page *page)
{
	page = compound_head(page);
	if (put_page_testzero(page)) {
		compound_page_dtor *dtor;

		dtor = get_compound_page_dtor(page);
		(*dtor)(page);
	}
}

void put_page(struct page *page)
{
	if (unlikely(PageCompound(page)))
		put_compound_page(page);
	else if (put_page_testzero(page))
		__page_cache_release(page);
}
EXPORT_SYMBOL(put_page);

/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page->lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
	while (!list_empty(pages)) {
		struct page *victim;

		victim = list_entry(pages->prev, struct page, lru);
		list_del(&victim->lru);
		page_cache_release(victim);
	}
}
EXPORT_SYMBOL(put_pages_list);

/*
 * pagevec_move_tail() must be called with IRQ disabled.
 * Otherwise this may cause nasty races.
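 *
 * The per-cpu lru_rotate_pvecs pagevec drained here is also filled from
 * rotate_reclaimable_page(), which runs when writeback completes and so
 * may be called from interrupt context; both that producer and this
 * consumer therefore touch the pagevec with IRQs off.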
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
	int i;
	int pgmoved = 0;
	struct zone *zone = NULL;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock(&zone->lru_lock);
			zone = pagezone;
			spin_lock(&zone->lru_lock);
		}
		if (PageLRU(page) && !PageActive(page)) {
			list_move_tail(&page->lru, &zone->inactive_list);
			pgmoved++;
		}
	}
	if (zone)
		spin_unlock(&zone->lru_lock);
	__count_vm_events(PGROTATED, pgmoved);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 * inactive list.
 *
 * Returns zero if it cleared PG_writeback.
 */
int rotate_reclaimable_page(struct page *page)
{
	struct pagevec *pvec;
	unsigned long flags;

	if (PageLocked(page))
		return 1;
	if (PageDirty(page))
		return 1;
	if (PageActive(page))
		return 1;
	if (!PageLRU(page))
		return 1;

	page_cache_get(page);
	local_irq_save(flags);
	pvec = &__get_cpu_var(lru_rotate_pvecs);
	if (!pagevec_add(pvec, page))
		pagevec_move_tail(pvec);
	local_irq_restore(flags);

	if (!test_clear_page_writeback(page))
		BUG();

	return 0;
}

/*
 * FIXME: speed this up?
 */
void fastcall activate_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	if (PageLRU(page) && !PageActive(page)) {
		del_page_from_inactive_list(zone, page);
		SetPageActive(page);
		add_page_to_active_list(zone, page);
		__count_vm_event(PGACTIVATE);
	}
	spin_unlock_irq(&zone->lru_lock);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 */
void fastcall mark_page_accessed(struct page *page)
{
	if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
		activate_page(page);
		ClearPageReferenced(page);
	} else if (!PageReferenced(page)) {
		SetPageReferenced(page);
	}
}

EXPORT_SYMBOL(mark_page_accessed);

/**
 * lru_cache_add - add a page to the page lists
 * @page: the page to add
 */
void fastcall lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);

	page_cache_get(page);
	if (!pagevec_add(pvec, page))
		__pagevec_lru_add(pvec);
	put_cpu_var(lru_add_pvecs);
}

void fastcall lru_cache_add_active(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);

	page_cache_get(page);
	if (!pagevec_add(pvec, page))
		__pagevec_lru_add_active(pvec);
	put_cpu_var(lru_add_active_pvecs);
}

/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
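 *
 * Illustrative call patterns (a sketch only; the real callers are
 * lru_add_drain() and cpu_swap_callback() below):
 *
 *	cpu = get_cpu();
 *	drain_cpu_pagevecs(cpu);
 *	put_cpu();
 *
 * or, from the CPU-hotplug notifier once the CPU is already dead:
 *
 *	drain_cpu_pagevecs((long)hcpu);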
 */
static void drain_cpu_pagevecs(int cpu)
{
	struct pagevec *pvec;

	pvec = &per_cpu(lru_add_pvecs, cpu);
	if (pagevec_count(pvec))
		__pagevec_lru_add(pvec);

	pvec = &per_cpu(lru_add_active_pvecs, cpu);
	if (pagevec_count(pvec))
		__pagevec_lru_add_active(pvec);

	pvec = &per_cpu(lru_rotate_pvecs, cpu);
	if (pagevec_count(pvec)) {
		unsigned long flags;

		/* No harm done if a racing interrupt already did this */
		local_irq_save(flags);
		pagevec_move_tail(pvec);
		local_irq_restore(flags);
	}
}

void lru_add_drain(void)
{
	drain_cpu_pagevecs(get_cpu());
	put_cpu();
}

#ifdef CONFIG_NUMA
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
	lru_add_drain();
}

/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
	return schedule_on_each_cpu(lru_add_drain_per_cpu);
}

#else

/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
	lru_add_drain();
	return 0;
}
#endif

/*
 * Batched page_cache_release().  Decrement the reference count on all the
 * passed pages.  If it fell to zero then remove the page from the LRU and
 * free it.
 *
 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 * for the remainder of the operation.
 *
 * The locking in this function is against shrink_cache(): we recheck the
 * page count inside the lock to see whether shrink_cache grabbed the page
 * via the LRU.  If it did, give up: shrink_cache will free it.
 */
void release_pages(struct page **pages, int nr, int cold)
{
	int i;
	struct pagevec pages_to_free;
	struct zone *zone = NULL;
	unsigned long uninitialized_var(flags);

	pagevec_init(&pages_to_free, cold);
	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		if (unlikely(PageCompound(page))) {
			if (zone) {
				spin_unlock_irqrestore(&zone->lru_lock, flags);
				zone = NULL;
			}
			put_compound_page(page);
			continue;
		}

		if (!put_page_testzero(page))
			continue;

		if (PageLRU(page)) {
			struct zone *pagezone = page_zone(page);
			if (pagezone != zone) {
				if (zone)
					spin_unlock_irqrestore(&zone->lru_lock,
									flags);
				zone = pagezone;
				spin_lock_irqsave(&zone->lru_lock, flags);
			}
			VM_BUG_ON(!PageLRU(page));
			__ClearPageLRU(page);
			del_page_from_lru(zone, page);
		}

		if (!pagevec_add(&pages_to_free, page)) {
			if (zone) {
				spin_unlock_irqrestore(&zone->lru_lock, flags);
				zone = NULL;
			}
			__pagevec_free(&pages_to_free);
			pagevec_reinit(&pages_to_free);
		}
	}
	if (zone)
		spin_unlock_irqrestore(&zone->lru_lock, flags);

	pagevec_free(&pages_to_free);
}

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
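 *
 * Most callers reach this through the pagevec_release() wrapper in
 * <linux/pagevec.h>, which (roughly) does:
 *
 *	static inline void pagevec_release(struct pagevec *pvec)
 *	{
 *		if (pagevec_count(pvec))
 *			__pagevec_release(pvec);
 *	}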
 */
void __pagevec_release(struct pagevec *pvec)
{
	lru_add_drain();
	release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
	pagevec_reinit(pvec);
}

EXPORT_SYMBOL(__pagevec_release);

/*
 * pagevec_release() for pages which are known not to be on the LRU
 *
 * This function reinitialises the caller's pagevec.
 */
void __pagevec_release_nonlru(struct pagevec *pvec)
{
	int i;
	struct pagevec pages_to_free;

	pagevec_init(&pages_to_free, pvec->cold);
	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		VM_BUG_ON(PageLRU(page));
		if (put_page_testzero(page))
			pagevec_add(&pages_to_free, page);
	}
	pagevec_free(&pages_to_free);
	pagevec_reinit(pvec);
}

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
	int i;
	struct zone *zone = NULL;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irq(&zone->lru_lock);
			zone = pagezone;
			spin_lock_irq(&zone->lru_lock);
		}
		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);
		add_page_to_inactive_list(zone, page);
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

EXPORT_SYMBOL(__pagevec_lru_add);

void __pagevec_lru_add_active(struct pagevec *pvec)
{
	int i;
	struct zone *zone = NULL;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irq(&zone->lru_lock);
			zone = pagezone;
			spin_lock_irq(&zone->lru_lock);
		}
		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);
		VM_BUG_ON(PageActive(page));
		SetPageActive(page);
		add_page_to_active_list(zone, page);
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);
	release_pages(pvec->pages, pvec->nr, pvec->cold);
	pagevec_reinit(pvec);
}

/*
 * Try to drop buffers from the pages in a pagevec
 */
void pagevec_strip(struct pagevec *pvec)
{
	int i;

	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		if (PagePrivate(page) && !TestSetPageLocked(page)) {
			if (PagePrivate(page))
				try_to_release_page(page, 0);
			unlock_page(page);
		}
	}
}

/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec: Where the resulting pages are placed
 * @mapping: The address_space to search
 * @start: The starting page index
 * @nr_pages: The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indices.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
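 *
 * A typical caller (an illustrative sketch, not code from this file) walks
 * a mapping in batches and drops the references via pagevec_release():
 *
 *	struct pagevec pvec;
 *	pgoff_t index = 0;
 *	int i;
 *
 *	pagevec_init(&pvec, 0);
 *	while (pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE)) {
 *		for (i = 0; i < pagevec_count(&pvec); i++) {
 *			struct page *page = pvec.pages[i];
 *
 *			index = page->index + 1;
 *			... operate on page ...
 *		}
 *		pagevec_release(&pvec);
 *	}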
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t start, unsigned nr_pages)
{
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
	return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup);

unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag,
					nr_pages, pvec->pages);
	return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup_tag);

#ifdef CONFIG_SMP
/*
 * We tolerate a little inaccuracy to avoid ping-ponging the counter between
 * CPUs
 */
#define ACCT_THRESHOLD	max(16, NR_CPUS * 2)

static DEFINE_PER_CPU(long, committed_space) = 0;

void vm_acct_memory(long pages)
{
	long *local;

	preempt_disable();
	local = &__get_cpu_var(committed_space);
	*local += pages;
	if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
		atomic_add(*local, &vm_committed_space);
		*local = 0;
	}
	preempt_enable();
}

#ifdef CONFIG_HOTPLUG_CPU

/* Drop the CPU's cached committed space back into the central pool. */
static int cpu_swap_callback(struct notifier_block *nfb,
			     unsigned long action,
			     void *hcpu)
{
	long *committed;

	committed = &per_cpu(committed_space, (long)hcpu);
	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
		atomic_add(*committed, &vm_committed_space);
		*committed = 0;
		drain_cpu_pagevecs((long)hcpu);
	}
	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
#endif /* CONFIG_SMP */

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
	unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);

#ifdef CONFIG_SWAP
	bdi_init(swapper_space.backing_dev_info);
#endif

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more
	 */
#ifdef CONFIG_HOTPLUG_CPU
	hotcpu_notifier(cpu_swap_callback, 0);
#endif
}