// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/admin-guide/sysctl/vm.rst.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/memremap.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
#include <linux/local_lock.h>
#include <linux/buffer_head.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>

/* How many pages do we try to swap or page in/out together? As a power of 2 */
int page_cluster;
const int page_cluster_max = 31;

/* Protecting only lru_rotate.fbatch which requires disabling interrupts */
struct lru_rotate {
	local_lock_t lock;
	struct folio_batch fbatch;
};
static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

/*
 * The following folio batches are grouped together because they are protected
 * by disabling preemption (and interrupts remain enabled).
 */
struct cpu_fbatches {
	local_lock_t lock;
	struct folio_batch lru_add;
	struct folio_batch lru_deactivate_file;
	struct folio_batch lru_deactivate;
	struct folio_batch lru_lazyfree;
#ifdef CONFIG_SMP
	struct folio_batch activate;
#endif
};
static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

/*
 * This path almost never happens for VM activity - pages are normally freed
 * in batches.  But it gets used by networking - and for compound pages.
 */
static void __page_cache_release(struct folio *folio)
{
	if (folio_test_lru(folio)) {
		struct lruvec *lruvec;
		unsigned long flags;

		lruvec = folio_lruvec_lock_irqsave(folio, &flags);
		lruvec_del_folio(lruvec, folio);
		__folio_clear_lru_flags(folio);
		unlock_page_lruvec_irqrestore(lruvec, flags);
	}
	/* See comment on folio_test_mlocked in release_pages() */
	if (unlikely(folio_test_mlocked(folio))) {
		long nr_pages = folio_nr_pages(folio);

		__folio_clear_mlocked(folio);
		zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
		count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
	}
}

static void __folio_put_small(struct folio *folio)
{
	__page_cache_release(folio);
	mem_cgroup_uncharge(folio);
	free_unref_page(&folio->page, 0);
}

static void __folio_put_large(struct folio *folio)
{
	/*
	 * __page_cache_release() is supposed to be called for thp, not for
	 * hugetlb. This is because a hugetlb page never has PageLRU set
	 * (it is never placed on any LRU list) and no memcg routines should
	 * be called for hugetlb (it has a separate hugetlb_cgroup.)
	 */
	if (!folio_test_hugetlb(folio))
		__page_cache_release(folio);
	destroy_large_folio(folio);
}

void __folio_put(struct folio *folio)
{
	if (unlikely(folio_is_zone_device(folio)))
		free_zone_device_page(&folio->page);
	else if (unlikely(folio_test_large(folio)))
		__folio_put_large(folio);
	else
		__folio_put_small(folio);
}
EXPORT_SYMBOL(__folio_put);

/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page->lru.
 */
void put_pages_list(struct list_head *pages)
{
	struct folio *folio, *next;

	list_for_each_entry_safe(folio, next, pages, lru) {
		if (!folio_put_testzero(folio)) {
			list_del(&folio->lru);
			continue;
		}
		if (folio_test_large(folio)) {
			list_del(&folio->lru);
			__folio_put_large(folio);
			continue;
		}
		/* LRU flag must be clear because it's passed using the lru */
	}

	free_unref_page_list(pages);
	INIT_LIST_HEAD(pages);
}
EXPORT_SYMBOL(put_pages_list);

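/*
 * Illustrative sketch (not part of the original file): a typical caller
 * threads folios together on folio->lru and releases the whole list in one
 * batch.  The folios[] array and nr below are hypothetical.
 *
 *	LIST_HEAD(folio_list);
 *	int i;
 *
 *	for (i = 0; i < nr; i++)
 *		list_add(&folios[i]->lru, &folio_list);
 *	put_pages_list(&folio_list);
 *
 * put_pages_list() reinitialises the list head before returning, as above.
 */
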
typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio);

static void lru_add_fn(struct lruvec *lruvec, struct folio *folio)
{
	int was_unevictable = folio_test_clear_unevictable(folio);
	long nr_pages = folio_nr_pages(folio);

	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

	/*
	 * Is an smp_mb__after_atomic() still required here, before
	 * folio_evictable() tests the mlocked flag, to rule out the possibility
	 * of stranding an evictable folio on an unevictable LRU?  I think
	 * not, because __munlock_folio() only clears the mlocked flag
	 * while the LRU lock is held.
	 *
	 * (That is not true of __page_cache_release(), and not necessarily
	 * true of release_pages(): but those only clear the mlocked flag after
	 * folio_put_testzero() has excluded any other users of the folio.)
	 */
	if (folio_evictable(folio)) {
		if (was_unevictable)
			__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
	} else {
		folio_clear_active(folio);
		folio_set_unevictable(folio);
		/*
		 * folio->mlock_count = !!folio_test_mlocked(folio)?
		 * But that leaves __mlock_folio() in doubt whether another
		 * actor has already counted the mlock or not.  Err on the
		 * safe side, underestimate, let page reclaim fix it, rather
		 * than leaving a page on the unevictable LRU indefinitely.
		 */
		folio->mlock_count = 0;
		if (!was_unevictable)
			__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
	}

	lruvec_add_folio(lruvec, folio);
	trace_mm_lru_insertion(folio);
}

static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
{
	int i;
	struct lruvec *lruvec = NULL;
	unsigned long flags = 0;

	for (i = 0; i < folio_batch_count(fbatch); i++) {
		struct folio *folio = fbatch->folios[i];

		/* block memcg migration while the folio moves between lru */
		if (move_fn != lru_add_fn && !folio_test_clear_lru(folio))
			continue;

		lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
		move_fn(lruvec, folio);

		folio_set_lru(folio);
	}

	if (lruvec)
		unlock_page_lruvec_irqrestore(lruvec, flags);
	folios_put(fbatch->folios, folio_batch_count(fbatch));
	folio_batch_reinit(fbatch);
}

static void folio_batch_add_and_move(struct folio_batch *fbatch,
		struct folio *folio, move_fn_t move_fn)
{
	if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) &&
	    !lru_cache_disabled())
		return;
	folio_batch_move_lru(fbatch, move_fn);
}

static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio)
{
	if (!folio_test_unevictable(folio)) {
		lruvec_del_folio(lruvec, folio);
		folio_clear_active(folio);
		lruvec_add_folio_tail(lruvec, folio);
		__count_vm_events(PGROTATED, folio_nr_pages(folio));
	}
}

/*
 * Writeback is about to end against a folio which has been marked for
 * immediate reclaim.  If it still appears to be reclaimable, move it
 * to the tail of the inactive list.
 *
 * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races.
 */
void folio_rotate_reclaimable(struct folio *folio)
{
	if (!folio_test_locked(folio) && !folio_test_dirty(folio) &&
	    !folio_test_unevictable(folio) && folio_test_lru(folio)) {
		struct folio_batch *fbatch;
		unsigned long flags;

		folio_get(folio);
		local_lock_irqsave(&lru_rotate.lock, flags);
		fbatch = this_cpu_ptr(&lru_rotate.fbatch);
		folio_batch_add_and_move(fbatch, folio, lru_move_tail_fn);
		local_unlock_irqrestore(&lru_rotate.lock, flags);
	}
}

void lru_note_cost(struct lruvec *lruvec, bool file,
		   unsigned int nr_io, unsigned int nr_rotated)
{
	unsigned long cost;

	/*
	 * Reflect the relative cost of incurring IO and spending CPU
	 * time on rotations. This doesn't attempt to make a precise
	 * comparison, it just says: if reloads are about comparable
	 * between the LRU lists, or rotations are overwhelmingly
	 * different between them, adjust scan balance for CPU work.
	 */
	cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated;

	do {
		unsigned long lrusize;

		/*
		 * Holding lruvec->lru_lock is safe here, since we come either
		 * 1) from reclaim, which has the lruvec pinned, or
		 * 2) from a pre-LRU page during refault (which also holds the
		 *    rcu lock, so would be safe even if the page was on the LRU
		 *    and could move simultaneously to a new lruvec).
		 */
		spin_lock_irq(&lruvec->lru_lock);
		/* Record cost event */
		if (file)
			lruvec->file_cost += cost;
		else
			lruvec->anon_cost += cost;

		/*
		 * Decay previous events
		 *
		 * Because workloads change over time (and to avoid
		 * overflow) we keep these statistics as a floating
		 * average, which ends up weighing recent refaults
		 * more than old ones.
		 */
		lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
			  lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
			  lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
			  lruvec_page_state(lruvec, NR_ACTIVE_FILE);

		if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
			lruvec->file_cost /= 2;
			lruvec->anon_cost /= 2;
		}
		spin_unlock_irq(&lruvec->lru_lock);
	} while ((lruvec = parent_lruvec(lruvec)));
}

void lru_note_cost_refault(struct folio *folio)
{
	lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
		      folio_nr_pages(folio), 0);
}
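
/*
 * Worked example (illustrative, assuming SWAP_CLUSTER_MAX == 32): refaulting
 * a single-page file folio charges cost = 1 * 32 + 0 = 32 to file_cost, while
 * rotating 8 pages without any IO charges only 8, so IO is weighted at
 * roughly one cluster per page.  Once file_cost + anon_cost grows past a
 * quarter of the lruvec's LRU size, both are halved, so older events decay
 * geometrically and recent pressure dominates the anon/file scan balance.
 */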

static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio)
{
	if (!folio_test_active(folio) && !folio_test_unevictable(folio)) {
		long nr_pages = folio_nr_pages(folio);

		lruvec_del_folio(lruvec, folio);
		folio_set_active(folio);
		lruvec_add_folio(lruvec, folio);
		trace_mm_lru_activate(folio);

		__count_vm_events(PGACTIVATE, nr_pages);
		__count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE,
				     nr_pages);
	}
}

#ifdef CONFIG_SMP
static void folio_activate_drain(int cpu)
{
	struct folio_batch *fbatch = &per_cpu(cpu_fbatches.activate, cpu);

	if (folio_batch_count(fbatch))
		folio_batch_move_lru(fbatch, folio_activate_fn);
}

void folio_activate(struct folio *folio)
{
	if (folio_test_lru(folio) && !folio_test_active(folio) &&
	    !folio_test_unevictable(folio)) {
		struct folio_batch *fbatch;

		folio_get(folio);
		local_lock(&cpu_fbatches.lock);
		fbatch = this_cpu_ptr(&cpu_fbatches.activate);
		folio_batch_add_and_move(fbatch, folio, folio_activate_fn);
		local_unlock(&cpu_fbatches.lock);
	}
}

#else
static inline void folio_activate_drain(int cpu)
{
}

void folio_activate(struct folio *folio)
{
	struct lruvec *lruvec;

	if (folio_test_clear_lru(folio)) {
		lruvec = folio_lruvec_lock_irq(folio);
		folio_activate_fn(lruvec, folio);
		unlock_page_lruvec_irq(lruvec);
		folio_set_lru(folio);
	}
}
#endif

static void __lru_cache_activate_folio(struct folio *folio)
{
	struct folio_batch *fbatch;
	int i;

	local_lock(&cpu_fbatches.lock);
	fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);

	/*
	 * Search backwards on the optimistic assumption that the folio being
	 * activated has just been added to this batch. Note that only
	 * the local batch is examined as a !LRU folio could be in the
	 * process of being released, reclaimed, migrated or on a remote
	 * batch that is currently being drained. Furthermore, marking
	 * a remote batch's folio active potentially hits a race where
	 * a folio is marked active just after it is added to the inactive
	 * list causing accounting errors and BUG_ON checks to trigger.
	 */
	for (i = folio_batch_count(fbatch) - 1; i >= 0; i--) {
		struct folio *batch_folio = fbatch->folios[i];

		if (batch_folio == folio) {
			folio_set_active(folio);
			break;
		}
	}

	local_unlock(&cpu_fbatches.lock);
}

#ifdef CONFIG_LRU_GEN
static void folio_inc_refs(struct folio *folio)
{
	unsigned long new_flags, old_flags = READ_ONCE(folio->flags);

	if (folio_test_unevictable(folio))
		return;

	if (!folio_test_referenced(folio)) {
		folio_set_referenced(folio);
		return;
	}

	if (!folio_test_workingset(folio)) {
		folio_set_workingset(folio);
		return;
	}

	/* see the comment on MAX_NR_TIERS */
	do {
		new_flags = old_flags & LRU_REFS_MASK;
		if (new_flags == LRU_REFS_MASK)
			break;

		new_flags += BIT(LRU_REFS_PGOFF);
		new_flags |= old_flags & ~LRU_REFS_MASK;
	} while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
}
#else
static void folio_inc_refs(struct folio *folio)
{
}
#endif /* CONFIG_LRU_GEN */

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 *
 * When a newly allocated page is not yet visible, so safe for non-atomic ops,
 * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
 */
void folio_mark_accessed(struct folio *folio)
{
	if (lru_gen_enabled()) {
		folio_inc_refs(folio);
		return;
	}

	if (!folio_test_referenced(folio)) {
		folio_set_referenced(folio);
	} else if (folio_test_unevictable(folio)) {
		/*
		 * Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
		 * this list is never rotated or maintained, so marking an
		 * unevictable page accessed has no effect.
		 */
	} else if (!folio_test_active(folio)) {
		/*
		 * If the folio is on the LRU, queue it for activation via
		 * cpu_fbatches.activate. Otherwise, assume the folio is in a
		 * folio_batch, mark it active and it'll be moved to the active
		 * LRU on the next drain.
		 */
		if (folio_test_lru(folio))
			folio_activate(folio);
		else
			__lru_cache_activate_folio(folio);
		folio_clear_referenced(folio);
		workingset_activation(folio);
	}
	if (folio_test_idle(folio))
		folio_clear_idle(folio);
}
EXPORT_SYMBOL(folio_mark_accessed);

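/*
 * Illustrative sketch (not part of the original file): the transitions listed
 * above mean a folio is normally touched twice before it becomes hot.  With a
 * hypothetical folio:
 *
 *	folio_mark_accessed(folio);	first call sets the referenced flag
 *	folio_mark_accessed(folio);	second call activates it (or marks a
 *					still-batched folio active for the
 *					next drain)
 */
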
/**
 * folio_add_lru - Add a folio to an LRU list.
 * @folio: The folio to be added to the LRU.
 *
 * Queue the folio for addition to the LRU. The decision on whether
 * to add the page to the [in]active [file|anon] list is deferred until the
 * folio_batch is drained. This gives a chance for the caller of folio_add_lru()
 * to have the folio added to the active list using folio_mark_accessed().
 */
void folio_add_lru(struct folio *folio)
{
	struct folio_batch *fbatch;

	VM_BUG_ON_FOLIO(folio_test_active(folio) &&
			folio_test_unevictable(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

	/* see the comment in lru_gen_add_folio() */
	if (lru_gen_enabled() && !folio_test_unevictable(folio) &&
	    lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
		folio_set_active(folio);

	folio_get(folio);
	local_lock(&cpu_fbatches.lock);
	fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);
	folio_batch_add_and_move(fbatch, folio, lru_add_fn);
	local_unlock(&cpu_fbatches.lock);
}
EXPORT_SYMBOL(folio_add_lru);

/**
 * folio_add_lru_vma() - Add a folio to the appropriate LRU list for this VMA.
 * @folio: The folio to be added to the LRU.
 * @vma: VMA in which the folio is mapped.
 *
 * If the VMA is mlocked, @folio is added to the unevictable list.
 * Otherwise, it is treated the same way as folio_add_lru().
 */
void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma)
{
	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

	if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED))
		mlock_new_folio(folio);
	else
		folio_add_lru(folio);
}

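/*
 * Illustrative sketch (not part of the original file): a fault handler that
 * has just created a new anonymous folio maps it and then calls
 * folio_add_lru_vma() so the folio ends up on the correct (evictable or
 * unevictable) LRU once the per-CPU batch is drained.  This loosely follows
 * the anonymous-fault pattern; error handling and locking are omitted.
 *
 *	folio = vma_alloc_zeroed_movable_folio(vma, addr);
 *	folio_add_new_anon_rmap(folio, vma, addr);
 *	folio_add_lru_vma(folio, vma);
 */
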
/*
 * If the folio cannot be invalidated, it is moved to the
 * inactive list to speed up its reclaim.  It is moved to the
 * head of the list, rather than the tail, to give the flusher
 * threads some time to write it out, as this is much more
 * effective than the single-page writeout from reclaim.
 *
 * If the folio isn't mapped and dirty/writeback, the folio
 * could be reclaimed asap using the reclaim flag.
 *
 * 1. active, mapped folio -> none
 * 2. active, dirty/writeback folio -> inactive, head, reclaim
 * 3. inactive, mapped folio -> none
 * 4. inactive, dirty/writeback folio -> inactive, head, reclaim
 * 5. inactive, clean -> inactive, tail
 * 6. Others -> none
 *
 * In 4, it moves to the head of the inactive list so the folio is
 * written out by flusher threads as this is much more efficient
 * than the single-page writeout from reclaim.
 */
static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio)
{
	bool active = folio_test_active(folio);
	long nr_pages = folio_nr_pages(folio);

	if (folio_test_unevictable(folio))
		return;

	/* Some processes are using the folio */
	if (folio_mapped(folio))
		return;

	lruvec_del_folio(lruvec, folio);
	folio_clear_active(folio);
	folio_clear_referenced(folio);

	if (folio_test_writeback(folio) || folio_test_dirty(folio)) {
		/*
		 * Setting the reclaim flag could race with
		 * folio_end_writeback() and confuse readahead.  But the
		 * race window is _really_ small and it's not a critical
		 * problem.
		 */
		lruvec_add_folio(lruvec, folio);
		folio_set_reclaim(folio);
	} else {
		/*
		 * The folio's writeback ended while it was in the batch.
		 * We move that folio to the tail of the inactive list.
		 */
		lruvec_add_folio_tail(lruvec, folio);
		__count_vm_events(PGROTATED, nr_pages);
	}

	if (active) {
		__count_vm_events(PGDEACTIVATE, nr_pages);
		__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
				     nr_pages);
	}
}

static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio)
{
	if (!folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) {
		long nr_pages = folio_nr_pages(folio);

		lruvec_del_folio(lruvec, folio);
		folio_clear_active(folio);
		folio_clear_referenced(folio);
		lruvec_add_folio(lruvec, folio);

		__count_vm_events(PGDEACTIVATE, nr_pages);
		__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
				     nr_pages);
	}
}

static void lru_lazyfree_fn(struct lruvec *lruvec, struct folio *folio)
{
	if (folio_test_anon(folio) && folio_test_swapbacked(folio) &&
	    !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) {
		long nr_pages = folio_nr_pages(folio);

		lruvec_del_folio(lruvec, folio);
		folio_clear_active(folio);
		folio_clear_referenced(folio);
		/*
		 * Lazyfree folios are clean anonymous folios.  They have
		 * the swapbacked flag cleared, to distinguish them from normal
		 * anonymous folios
		 */
		folio_clear_swapbacked(folio);
		lruvec_add_folio(lruvec, folio);

		__count_vm_events(PGLAZYFREE, nr_pages);
		__count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE,
				     nr_pages);
	}
}

/*
 * Drain pages out of the cpu's folio_batch.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
void lru_add_drain_cpu(int cpu)
{
	struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);
	struct folio_batch *fbatch = &fbatches->lru_add;

	if (folio_batch_count(fbatch))
		folio_batch_move_lru(fbatch, lru_add_fn);

	fbatch = &per_cpu(lru_rotate.fbatch, cpu);
	/* Disabling interrupts below acts as a compiler barrier. */
	if (data_race(folio_batch_count(fbatch))) {
		unsigned long flags;

		/* No harm done if a racing interrupt already did this */
		local_lock_irqsave(&lru_rotate.lock, flags);
		folio_batch_move_lru(fbatch, lru_move_tail_fn);
		local_unlock_irqrestore(&lru_rotate.lock, flags);
	}

	fbatch = &fbatches->lru_deactivate_file;
	if (folio_batch_count(fbatch))
		folio_batch_move_lru(fbatch, lru_deactivate_file_fn);

	fbatch = &fbatches->lru_deactivate;
	if (folio_batch_count(fbatch))
		folio_batch_move_lru(fbatch, lru_deactivate_fn);

	fbatch = &fbatches->lru_lazyfree;
	if (folio_batch_count(fbatch))
		folio_batch_move_lru(fbatch, lru_lazyfree_fn);

	folio_activate_drain(cpu);
}

/**
 * deactivate_file_folio() - Deactivate a file folio.
 * @folio: Folio to deactivate.
 *
 * This function hints to the VM that @folio is a good reclaim candidate,
 * for example if its invalidation fails due to the folio being dirty
 * or under writeback.
 *
 * Context: Caller holds a reference on the folio.
 */
void deactivate_file_folio(struct folio *folio)
{
	struct folio_batch *fbatch;

	/* Deactivating an unevictable folio will not accelerate reclaim */
	if (folio_test_unevictable(folio))
		return;

	folio_get(folio);
	local_lock(&cpu_fbatches.lock);
	fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file);
	folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn);
	local_unlock(&cpu_fbatches.lock);
}

/*
 * folio_deactivate - deactivate a folio
 * @folio: folio to deactivate
 *
 * folio_deactivate() moves @folio to the inactive list if @folio was on the
 * active list and was not unevictable. This is done to accelerate the
 * reclaim of @folio.
 */
void folio_deactivate(struct folio *folio)
{
	if (folio_test_lru(folio) && !folio_test_unevictable(folio) &&
	    (folio_test_active(folio) || lru_gen_enabled())) {
		struct folio_batch *fbatch;

		folio_get(folio);
		local_lock(&cpu_fbatches.lock);
		fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate);
		folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn);
		local_unlock(&cpu_fbatches.lock);
	}
}

/**
 * folio_mark_lazyfree - make an anon folio lazyfree
 * @folio: folio to deactivate
 *
 * folio_mark_lazyfree() moves @folio to the inactive file list.
 * This is done to accelerate the reclaim of @folio.
 */
void folio_mark_lazyfree(struct folio *folio)
{
	if (folio_test_lru(folio) && folio_test_anon(folio) &&
	    folio_test_swapbacked(folio) && !folio_test_swapcache(folio) &&
	    !folio_test_unevictable(folio)) {
		struct folio_batch *fbatch;

		folio_get(folio);
		local_lock(&cpu_fbatches.lock);
		fbatch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree);
		folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn);
		local_unlock(&cpu_fbatches.lock);
	}
}

void lru_add_drain(void)
{
	local_lock(&cpu_fbatches.lock);
	lru_add_drain_cpu(smp_processor_id());
	local_unlock(&cpu_fbatches.lock);
	mlock_drain_local();
}

/*
 * It's called from per-cpu workqueue context in the SMP case, so
 * lru_add_drain_cpu() and invalidate_bh_lrus_cpu() should run on
 * the same CPU. It shouldn't be a problem in the !SMP case since
 * there is only one core and the locks disable preemption.
 */
static void lru_add_and_bh_lrus_drain(void)
{
	local_lock(&cpu_fbatches.lock);
	lru_add_drain_cpu(smp_processor_id());
	local_unlock(&cpu_fbatches.lock);
	invalidate_bh_lrus_cpu();
	mlock_drain_local();
}

void lru_add_drain_cpu_zone(struct zone *zone)
{
	local_lock(&cpu_fbatches.lock);
	lru_add_drain_cpu(smp_processor_id());
	drain_local_pages(zone);
	local_unlock(&cpu_fbatches.lock);
	mlock_drain_local();
}

#ifdef CONFIG_SMP

static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);

static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
	lru_add_and_bh_lrus_drain();
}

static bool cpu_needs_drain(unsigned int cpu)
{
	struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);

	/* Check these in order of likelihood that they're not zero */
	return folio_batch_count(&fbatches->lru_add) ||
		data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) ||
		folio_batch_count(&fbatches->lru_deactivate_file) ||
		folio_batch_count(&fbatches->lru_deactivate) ||
		folio_batch_count(&fbatches->lru_lazyfree) ||
		folio_batch_count(&fbatches->activate) ||
		need_mlock_drain(cpu) ||
		has_bh_in_lru(cpu, NULL);
}

/*
 * Doesn't need any cpu hotplug locking because we do rely on per-cpu
 * kworkers being shut down before our page_alloc_cpu_dead callback is
 * executed on the offlined cpu.
 * Calling this function with cpu hotplug locks held can actually lead
 * to obscure indirect dependencies via WQ context.
 */
static inline void __lru_add_drain_all(bool force_all_cpus)
{
	/*
	 * lru_drain_gen - Global pages generation number
	 *
	 * (A) Definition: global lru_drain_gen = x implies that all generations
	 *     0 < n <= x are already *scheduled* for draining.
	 *
	 * This is an optimization for the highly-contended use case where a
	 * user space workload keeps constantly generating a flow of pages for
	 * each CPU.
	 */
	static unsigned int lru_drain_gen;
	static struct cpumask has_work;
	static DEFINE_MUTEX(lock);
	unsigned cpu, this_gen;

	/*
	 * Make sure nobody triggers this path before mm_percpu_wq is fully
	 * initialized.
	 */
	if (WARN_ON(!mm_percpu_wq))
		return;

	/*
	 * Guarantee folio_batch counter stores visible by this CPU
	 * are visible to other CPUs before loading the current drain
	 * generation.
	 */
	smp_mb();

	/*
	 * (B) Locally cache global LRU draining generation number
	 *
	 * The read barrier ensures that the counter is loaded before the mutex
	 * is taken. It pairs with smp_mb() inside the mutex critical section
	 * at (D).
	 */
	this_gen = smp_load_acquire(&lru_drain_gen);

	mutex_lock(&lock);

	/*
	 * (C) Exit the draining operation if a newer generation, from another
	 * lru_add_drain_all(), was already scheduled for draining. Check (A).
	 */
	if (unlikely(this_gen != lru_drain_gen && !force_all_cpus))
		goto done;

	/*
	 * (D) Increment global generation number
	 *
	 * Pairs with smp_load_acquire() at (B), outside of the critical
	 * section. Use a full memory barrier to guarantee that the
	 * new global drain generation number is stored before loading
	 * folio_batch counters.
	 *
	 * This pairing must be done here, before the for_each_online_cpu loop
	 * below which drains the page vectors.
	 *
	 * Let x, y, and z represent some system CPU numbers, where x < y < z.
	 * Assume CPU #z is in the middle of the for_each_online_cpu loop
	 * below and has already reached CPU #y's per-cpu data. CPU #x comes
	 * along, adds some pages to its per-cpu vectors, then calls
	 * lru_add_drain_all().
	 *
	 * If the paired barrier is done at any later step, e.g. after the
	 * loop, CPU #x will just exit at (C) and miss flushing out all of its
	 * added pages.
	 */
	WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1);
	smp_mb();

	cpumask_clear(&has_work);
	for_each_online_cpu(cpu) {
		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);

		if (cpu_needs_drain(cpu)) {
			INIT_WORK(work, lru_add_drain_per_cpu);
			queue_work_on(cpu, mm_percpu_wq, work);
			__cpumask_set_cpu(cpu, &has_work);
		}
	}

	for_each_cpu(cpu, &has_work)
		flush_work(&per_cpu(lru_add_drain_work, cpu));

done:
	mutex_unlock(&lock);
}

void lru_add_drain_all(void)
{
	__lru_add_drain_all(false);
}
#else
void lru_add_drain_all(void)
{
	lru_add_drain();
}
#endif /* CONFIG_SMP */

atomic_t lru_disable_count = ATOMIC_INIT(0);

/*
 * lru_cache_disable() needs to be called before we start compiling
 * a list of pages to be migrated using isolate_lru_page().
 * It drains the pages on the LRU caches and then disables them on all
 * CPUs until lru_cache_enable() is called.
 *
 * Must be paired with a call to lru_cache_enable().
 */
void lru_cache_disable(void)
{
	atomic_inc(&lru_disable_count);
	/*
	 * Readers of lru_disable_count are protected by either disabling
	 * preemption or rcu_read_lock:
	 *
	 * preempt_disable, local_irq_disable  [bh_lru_lock()]
	 * rcu_read_lock		       [rt_spin_lock CONFIG_PREEMPT_RT]
	 * preempt_disable		       [local_lock !CONFIG_PREEMPT_RT]
	 *
	 * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on
	 * preempt_disable() regions of code. So any CPU which sees
	 * lru_disable_count = 0 will have exited the critical
	 * section when synchronize_rcu() returns.
	 */
	synchronize_rcu_expedited();
#ifdef CONFIG_SMP
	__lru_add_drain_all(true);
#else
	lru_add_and_bh_lrus_drain();
#endif
}

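/*
 * Illustrative sketch (not part of the original file): callers such as page
 * migration or contiguous allocation bracket their LRU-isolation phase with
 * the disable/enable pair so that no folio can linger in a per-CPU batch
 * while it is being isolated.  The middle step is elided.
 *
 *	lru_cache_disable();
 *	... isolate folios from the LRU and migrate them ...
 *	lru_cache_enable();
 */
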
/**
 * release_pages - batched put_page()
 * @arg: array of pages to release
 * @nr: number of pages
 *
 * Decrement the reference count on all the pages in @arg.  If it
 * fell to zero, remove the page from the LRU and free it.
 *
 * Note that the argument can be an array of pages, encoded pages,
 * or folio pointers. We ignore any encoded bits, and turn any of
 * them into just a folio that gets free'd.
 */
void release_pages(release_pages_arg arg, int nr)
{
	int i;
	struct encoded_page **encoded = arg.encoded_pages;
	LIST_HEAD(pages_to_free);
	struct lruvec *lruvec = NULL;
	unsigned long flags = 0;
	unsigned int lock_batch;

	for (i = 0; i < nr; i++) {
		struct folio *folio;

		/* Turn any of the argument types into a folio */
		folio = page_folio(encoded_page_ptr(encoded[i]));

		/*
		 * Make sure the IRQ-safe lock-holding time does not get
		 * excessive with a continuous string of pages from the
		 * same lruvec. The lock is held only if lruvec != NULL.
		 */
		if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
			unlock_page_lruvec_irqrestore(lruvec, flags);
			lruvec = NULL;
		}

		if (is_huge_zero_page(&folio->page))
			continue;

		if (folio_is_zone_device(folio)) {
			if (lruvec) {
				unlock_page_lruvec_irqrestore(lruvec, flags);
				lruvec = NULL;
			}
			if (put_devmap_managed_page(&folio->page))
				continue;
			if (folio_put_testzero(folio))
				free_zone_device_page(&folio->page);
			continue;
		}

		if (!folio_put_testzero(folio))
			continue;

		if (folio_test_large(folio)) {
			if (lruvec) {
				unlock_page_lruvec_irqrestore(lruvec, flags);
				lruvec = NULL;
			}
			__folio_put_large(folio);
			continue;
		}

		if (folio_test_lru(folio)) {
			struct lruvec *prev_lruvec = lruvec;

			lruvec = folio_lruvec_relock_irqsave(folio, lruvec,
								&flags);
			if (prev_lruvec != lruvec)
				lock_batch = 0;

			lruvec_del_folio(lruvec, folio);
			__folio_clear_lru_flags(folio);
		}

		/*
		 * In rare cases, when truncation or holepunching raced with
		 * munlock after VM_LOCKED was cleared, Mlocked may still be
		 * found set here.  This does not indicate a problem, unless
		 * "unevictable_pgs_cleared" appears worryingly large.
		 */
		if (unlikely(folio_test_mlocked(folio))) {
			__folio_clear_mlocked(folio);
			zone_stat_sub_folio(folio, NR_MLOCK);
			count_vm_event(UNEVICTABLE_PGCLEARED);
		}

		list_add(&folio->lru, &pages_to_free);
	}
	if (lruvec)
		unlock_page_lruvec_irqrestore(lruvec, flags);

	mem_cgroup_uncharge_list(&pages_to_free);
	free_unref_page_list(&pages_to_free);
}
EXPORT_SYMBOL(release_pages);

/*
 * The folios which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those folios may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __folio_batch_release() will drain those queues here.
 * folio_batch_move_lru() calls folios_put() directly to avoid
 * mutual recursion.
 */
void __folio_batch_release(struct folio_batch *fbatch)
{
	if (!fbatch->percpu_pvec_drained) {
		lru_add_drain();
		fbatch->percpu_pvec_drained = true;
	}
	release_pages(fbatch->folios, folio_batch_count(fbatch));
	folio_batch_reinit(fbatch);
}
EXPORT_SYMBOL(__folio_batch_release);

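/*
 * Illustrative sketch (not part of the original file): the usual folio_batch
 * pattern collects references and then drops them all at once through
 * folio_batch_release(), which lands in __folio_batch_release() above when
 * the batch is non-empty.  The fill step is elided.
 *
 *	struct folio_batch fbatch;
 *
 *	folio_batch_init(&fbatch);
 *	... add folios with folio_batch_add(&fbatch, folio) ...
 *	folio_batch_release(&fbatch);
 */
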
/**
 * folio_batch_remove_exceptionals() - Prune non-folios from a batch.
 * @fbatch: The batch to prune
 *
 * find_get_entries() fills a batch with both folios and shadow/swap/DAX
 * entries.  This function prunes all the non-folio entries from @fbatch
 * without leaving holes, so that it can be passed on to folio-only batch
 * operations.
 */
void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
{
	unsigned int i, j;

	for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) {
		struct folio *folio = fbatch->folios[i];
		if (!xa_is_value(folio))
			fbatch->folios[j++] = folio;
	}
	fbatch->nr = j;
}

/*
 * Perform any setup for the swap system.
 */
void __init swap_setup(void)
{
	unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);

	/* Use a smaller cluster for small-memory machines */
	if (megs < 16)
		page_cluster = 2;
	else
		page_cluster = 3;
	/*
	 * Right now other parts of the system mean that we
	 * _really_ don't want to cluster much more.
	 */
}
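
/*
 * Worked example (illustrative): page_cluster is the log2 of the swap
 * readahead window, so the default of 3 chosen above reads up to
 * 1 << 3 = 8 pages around a swap fault, and the small-memory value of 2
 * reads up to 4.  The value can be changed at runtime through the
 * vm.page-cluster sysctl documented in
 * Documentation/admin-guide/sysctl/vm.rst.
 */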