1 /* 2 * mm/rmap.c - physical to virtual reverse mappings 3 * 4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br> 5 * Released under the General Public License (GPL). 6 * 7 * Simple, low overhead reverse mapping scheme. 8 * Please try to keep this thing as modular as possible. 9 * 10 * Provides methods for unmapping each kind of mapped page: 11 * the anon methods track anonymous pages, and 12 * the file methods track pages belonging to an inode. 13 * 14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001 15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 17 * Contributions by Hugh Dickins 2003, 2004 18 */ 19 20 /* 21 * Lock ordering in mm: 22 * 23 * inode->i_mutex (while writing or truncating, not reading or faulting) 24 * mm->mmap_sem 25 * page->flags PG_locked (lock_page) 26 * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) 27 * mapping->i_mmap_rwsem 28 * anon_vma->rwsem 29 * mm->page_table_lock or pte_lock 30 * pgdat->lru_lock (in mark_page_accessed, isolate_lru_page) 31 * swap_lock (in swap_duplicate, swap_info_get) 32 * mmlist_lock (in mmput, drain_mmlist and others) 33 * mapping->private_lock (in __set_page_dirty_buffers) 34 * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) 35 * i_pages lock (widely used) 36 * inode->i_lock (in set_page_dirty's __mark_inode_dirty) 37 * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) 38 * sb_lock (within inode_lock in fs/fs-writeback.c) 39 * i_pages lock (widely used, in set_page_dirty, 40 * in arch-dependent flush_dcache_mmap_lock, 41 * within bdi.wb->list_lock in __sync_single_inode) 42 * 43 * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) 44 * ->tasklist_lock 45 * pte map lock 46 */ 47 48 #include <linux/mm.h> 49 #include <linux/sched/mm.h> 50 #include <linux/sched/task.h> 51 #include <linux/pagemap.h> 52 #include <linux/swap.h> 53 #include <linux/swapops.h> 54 #include <linux/slab.h> 55 #include <linux/init.h> 56 #include <linux/ksm.h> 57 #include <linux/rmap.h> 58 #include <linux/rcupdate.h> 59 #include <linux/export.h> 60 #include <linux/memcontrol.h> 61 #include <linux/mmu_notifier.h> 62 #include <linux/migrate.h> 63 #include <linux/hugetlb.h> 64 #include <linux/huge_mm.h> 65 #include <linux/backing-dev.h> 66 #include <linux/page_idle.h> 67 #include <linux/memremap.h> 68 #include <linux/userfaultfd_k.h> 69 70 #include <asm/tlbflush.h> 71 72 #include <trace/events/tlb.h> 73 74 #include "internal.h" 75 76 static struct kmem_cache *anon_vma_cachep; 77 static struct kmem_cache *anon_vma_chain_cachep; 78 79 static inline struct anon_vma *anon_vma_alloc(void) 80 { 81 struct anon_vma *anon_vma; 82 83 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); 84 if (anon_vma) { 85 atomic_set(&anon_vma->refcount, 1); 86 anon_vma->degree = 1; /* Reference for first vma */ 87 anon_vma->parent = anon_vma; 88 /* 89 * Initialise the anon_vma root to point to itself. If called 90 * from fork, the root will be reset to the parents anon_vma. 91 */ 92 anon_vma->root = anon_vma; 93 } 94 95 return anon_vma; 96 } 97 98 static inline void anon_vma_free(struct anon_vma *anon_vma) 99 { 100 VM_BUG_ON(atomic_read(&anon_vma->refcount)); 101 102 /* 103 * Synchronize against page_lock_anon_vma_read() such that 104 * we can safely hold the lock without the anon_vma getting 105 * freed. 
106 * 107 * Relies on the full mb implied by the atomic_dec_and_test() from 108 * put_anon_vma() against the acquire barrier implied by 109 * down_read_trylock() from page_lock_anon_vma_read(). This orders: 110 * 111 * page_lock_anon_vma_read() VS put_anon_vma() 112 * down_read_trylock() atomic_dec_and_test() 113 * LOCK MB 114 * atomic_read() rwsem_is_locked() 115 * 116 * LOCK should suffice since the actual taking of the lock must 117 * happen _before_ what follows. 118 */ 119 might_sleep(); 120 if (rwsem_is_locked(&anon_vma->root->rwsem)) { 121 anon_vma_lock_write(anon_vma); 122 anon_vma_unlock_write(anon_vma); 123 } 124 125 kmem_cache_free(anon_vma_cachep, anon_vma); 126 } 127 128 static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp) 129 { 130 return kmem_cache_alloc(anon_vma_chain_cachep, gfp); 131 } 132 133 static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) 134 { 135 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); 136 } 137 138 static void anon_vma_chain_link(struct vm_area_struct *vma, 139 struct anon_vma_chain *avc, 140 struct anon_vma *anon_vma) 141 { 142 avc->vma = vma; 143 avc->anon_vma = anon_vma; 144 list_add(&avc->same_vma, &vma->anon_vma_chain); 145 anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); 146 } 147 148 /** 149 * __anon_vma_prepare - attach an anon_vma to a memory region 150 * @vma: the memory region in question 151 * 152 * This makes sure the memory mapping described by 'vma' has 153 * an 'anon_vma' attached to it, so that we can associate the 154 * anonymous pages mapped into it with that anon_vma. 155 * 156 * The common case will be that we already have one, which 157 * is handled inline by anon_vma_prepare(). But if 158 * not we either need to find an adjacent mapping that we 159 * can re-use the anon_vma from (very common when the only 160 * reason for splitting a vma has been mprotect()), or we 161 * allocate a new one. 162 * 163 * Anon-vma allocations are very subtle, because we may have 164 * optimistically looked up an anon_vma in page_lock_anon_vma_read() 165 * and that may actually touch the spinlock even in the newly 166 * allocated vma (it depends on RCU to make sure that the 167 * anon_vma isn't actually destroyed). 168 * 169 * As a result, we need to do proper anon_vma locking even 170 * for the new allocation. At the same time, we do not want 171 * to do any locking for the common case of already having 172 * an anon_vma. 173 * 174 * This must be called with the mmap_sem held for reading. 
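 *
 * For reference, the common case is handled by the inline wrapper in
 * <linux/rmap.h>, which looks roughly like this sketch:
 *
 *	static inline int anon_vma_prepare(struct vm_area_struct *vma)
 *	{
 *		if (likely(vma->anon_vma))
 *			return 0;
 *		return __anon_vma_prepare(vma);
 *	}
 *
 * so __anon_vma_prepare() itself only runs on the first anonymous fault
 * in a vma.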
175 */ 176 int __anon_vma_prepare(struct vm_area_struct *vma) 177 { 178 struct mm_struct *mm = vma->vm_mm; 179 struct anon_vma *anon_vma, *allocated; 180 struct anon_vma_chain *avc; 181 182 might_sleep(); 183 184 avc = anon_vma_chain_alloc(GFP_KERNEL); 185 if (!avc) 186 goto out_enomem; 187 188 anon_vma = find_mergeable_anon_vma(vma); 189 allocated = NULL; 190 if (!anon_vma) { 191 anon_vma = anon_vma_alloc(); 192 if (unlikely(!anon_vma)) 193 goto out_enomem_free_avc; 194 allocated = anon_vma; 195 } 196 197 anon_vma_lock_write(anon_vma); 198 /* page_table_lock to protect against threads */ 199 spin_lock(&mm->page_table_lock); 200 if (likely(!vma->anon_vma)) { 201 vma->anon_vma = anon_vma; 202 anon_vma_chain_link(vma, avc, anon_vma); 203 /* vma reference or self-parent link for new root */ 204 anon_vma->degree++; 205 allocated = NULL; 206 avc = NULL; 207 } 208 spin_unlock(&mm->page_table_lock); 209 anon_vma_unlock_write(anon_vma); 210 211 if (unlikely(allocated)) 212 put_anon_vma(allocated); 213 if (unlikely(avc)) 214 anon_vma_chain_free(avc); 215 216 return 0; 217 218 out_enomem_free_avc: 219 anon_vma_chain_free(avc); 220 out_enomem: 221 return -ENOMEM; 222 } 223 224 /* 225 * This is a useful helper function for locking the anon_vma root as 226 * we traverse the vma->anon_vma_chain, looping over anon_vma's that 227 * have the same vma. 228 * 229 * Such anon_vma's should have the same root, so you'd expect to see 230 * just a single mutex_lock for the whole traversal. 231 */ 232 static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma) 233 { 234 struct anon_vma *new_root = anon_vma->root; 235 if (new_root != root) { 236 if (WARN_ON_ONCE(root)) 237 up_write(&root->rwsem); 238 root = new_root; 239 down_write(&root->rwsem); 240 } 241 return root; 242 } 243 244 static inline void unlock_anon_vma_root(struct anon_vma *root) 245 { 246 if (root) 247 up_write(&root->rwsem); 248 } 249 250 /* 251 * Attach the anon_vmas from src to dst. 252 * Returns 0 on success, -ENOMEM on failure. 253 * 254 * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and 255 * anon_vma_fork(). The first three want an exact copy of src, while the last 256 * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent 257 * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call, 258 * we can identify this case by checking (!dst->anon_vma && src->anon_vma). 259 * 260 * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find 261 * and reuse existing anon_vma which has no vmas and only one child anon_vma. 262 * This prevents degradation of anon_vma hierarchy to endless linear chain in 263 * case of constantly forking task. On the other hand, an anon_vma with more 264 * than one child isn't reused even if there was no alive vma, thus rmap 265 * walker has a good chance of avoiding scanning the whole hierarchy when it 266 * searches where page is mapped. 267 */ 268 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) 269 { 270 struct anon_vma_chain *avc, *pavc; 271 struct anon_vma *root = NULL; 272 struct vm_area_struct *prev = dst->vm_prev, *pprev = src->vm_prev; 273 274 /* 275 * If parent share anon_vma with its vm_prev, keep this sharing in in 276 * child. 277 * 278 * 1. Parent has vm_prev, which implies we have vm_prev. 279 * 2. Parent and its vm_prev have the same anon_vma. 
280 */ 281 if (!dst->anon_vma && src->anon_vma && 282 pprev && pprev->anon_vma == src->anon_vma) 283 dst->anon_vma = prev->anon_vma; 284 285 286 list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { 287 struct anon_vma *anon_vma; 288 289 avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN); 290 if (unlikely(!avc)) { 291 unlock_anon_vma_root(root); 292 root = NULL; 293 avc = anon_vma_chain_alloc(GFP_KERNEL); 294 if (!avc) 295 goto enomem_failure; 296 } 297 anon_vma = pavc->anon_vma; 298 root = lock_anon_vma_root(root, anon_vma); 299 anon_vma_chain_link(dst, avc, anon_vma); 300 301 /* 302 * Reuse existing anon_vma if its degree lower than two, 303 * that means it has no vma and only one anon_vma child. 304 * 305 * Do not chose parent anon_vma, otherwise first child 306 * will always reuse it. Root anon_vma is never reused: 307 * it has self-parent reference and at least one child. 308 */ 309 if (!dst->anon_vma && src->anon_vma && 310 anon_vma != src->anon_vma && anon_vma->degree < 2) 311 dst->anon_vma = anon_vma; 312 } 313 if (dst->anon_vma) 314 dst->anon_vma->degree++; 315 unlock_anon_vma_root(root); 316 return 0; 317 318 enomem_failure: 319 /* 320 * dst->anon_vma is dropped here otherwise its degree can be incorrectly 321 * decremented in unlink_anon_vmas(). 322 * We can safely do this because callers of anon_vma_clone() don't care 323 * about dst->anon_vma if anon_vma_clone() failed. 324 */ 325 dst->anon_vma = NULL; 326 unlink_anon_vmas(dst); 327 return -ENOMEM; 328 } 329 330 /* 331 * Attach vma to its own anon_vma, as well as to the anon_vmas that 332 * the corresponding VMA in the parent process is attached to. 333 * Returns 0 on success, non-zero on failure. 334 */ 335 int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) 336 { 337 struct anon_vma_chain *avc; 338 struct anon_vma *anon_vma; 339 int error; 340 341 /* Don't bother if the parent process has no anon_vma here. */ 342 if (!pvma->anon_vma) 343 return 0; 344 345 /* Drop inherited anon_vma, we'll reuse existing or allocate new. */ 346 vma->anon_vma = NULL; 347 348 /* 349 * First, attach the new VMA to the parent VMA's anon_vmas, 350 * so rmap can find non-COWed pages in child processes. 351 */ 352 error = anon_vma_clone(vma, pvma); 353 if (error) 354 return error; 355 356 /* An existing anon_vma has been reused, all done then. */ 357 if (vma->anon_vma) 358 return 0; 359 360 /* Then add our own anon_vma. */ 361 anon_vma = anon_vma_alloc(); 362 if (!anon_vma) 363 goto out_error; 364 avc = anon_vma_chain_alloc(GFP_KERNEL); 365 if (!avc) 366 goto out_error_free_anon_vma; 367 368 /* 369 * The root anon_vma's spinlock is the lock actually used when we 370 * lock any of the anon_vmas in this anon_vma tree. 371 */ 372 anon_vma->root = pvma->anon_vma->root; 373 anon_vma->parent = pvma->anon_vma; 374 /* 375 * With refcounts, an anon_vma can stay around longer than the 376 * process it belongs to. The root anon_vma needs to be pinned until 377 * this anon_vma is freed, because the lock lives in the root. 378 */ 379 get_anon_vma(anon_vma->root); 380 /* Mark this anon_vma as the one where our new (COWed) pages go. 
*/ 381 vma->anon_vma = anon_vma; 382 anon_vma_lock_write(anon_vma); 383 anon_vma_chain_link(vma, avc, anon_vma); 384 anon_vma->parent->degree++; 385 anon_vma_unlock_write(anon_vma); 386 387 return 0; 388 389 out_error_free_anon_vma: 390 put_anon_vma(anon_vma); 391 out_error: 392 unlink_anon_vmas(vma); 393 return -ENOMEM; 394 } 395 396 void unlink_anon_vmas(struct vm_area_struct *vma) 397 { 398 struct anon_vma_chain *avc, *next; 399 struct anon_vma *root = NULL; 400 401 /* 402 * Unlink each anon_vma chained to the VMA. This list is ordered 403 * from newest to oldest, ensuring the root anon_vma gets freed last. 404 */ 405 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 406 struct anon_vma *anon_vma = avc->anon_vma; 407 408 root = lock_anon_vma_root(root, anon_vma); 409 anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); 410 411 /* 412 * Leave empty anon_vmas on the list - we'll need 413 * to free them outside the lock. 414 */ 415 if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) { 416 anon_vma->parent->degree--; 417 continue; 418 } 419 420 list_del(&avc->same_vma); 421 anon_vma_chain_free(avc); 422 } 423 if (vma->anon_vma) 424 vma->anon_vma->degree--; 425 unlock_anon_vma_root(root); 426 427 /* 428 * Iterate the list once more, it now only contains empty and unlinked 429 * anon_vmas, destroy them. Could not do before due to __put_anon_vma() 430 * needing to write-acquire the anon_vma->root->rwsem. 431 */ 432 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 433 struct anon_vma *anon_vma = avc->anon_vma; 434 435 VM_WARN_ON(anon_vma->degree); 436 put_anon_vma(anon_vma); 437 438 list_del(&avc->same_vma); 439 anon_vma_chain_free(avc); 440 } 441 } 442 443 static void anon_vma_ctor(void *data) 444 { 445 struct anon_vma *anon_vma = data; 446 447 init_rwsem(&anon_vma->rwsem); 448 atomic_set(&anon_vma->refcount, 0); 449 anon_vma->rb_root = RB_ROOT_CACHED; 450 } 451 452 void __init anon_vma_init(void) 453 { 454 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 455 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, 456 anon_vma_ctor); 457 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, 458 SLAB_PANIC|SLAB_ACCOUNT); 459 } 460 461 /* 462 * Getting a lock on a stable anon_vma from a page off the LRU is tricky! 463 * 464 * Since there is no serialization what so ever against page_remove_rmap() 465 * the best this function can do is return a locked anon_vma that might 466 * have been relevant to this page. 467 * 468 * The page might have been remapped to a different anon_vma or the anon_vma 469 * returned may already be freed (and even reused). 470 * 471 * In case it was remapped to a different anon_vma, the new anon_vma will be a 472 * child of the old anon_vma, and the anon_vma lifetime rules will therefore 473 * ensure that any anon_vma obtained from the page will still be valid for as 474 * long as we observe page_mapped() [ hence all those page_mapped() tests ]. 475 * 476 * All users of this function must be very careful when walking the anon_vma 477 * chain and verify that the page in question is indeed mapped in it 478 * [ something equivalent to page_mapped_in_vma() ]. 479 * 480 * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from 481 * page_remove_rmap() that the anon_vma pointer from page->mapping is valid 482 * if there is a mapcount, we can dereference the anon_vma after observing 483 * those. 
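 *
 * A typical caller pattern is, as a rough sketch (details vary per caller):
 *
 *	anon_vma = page_get_anon_vma(page);
 *	if (anon_vma) {
 *		anon_vma_lock_read(anon_vma);
 *		... walk the interval tree, re-checking page_mapped() ...
 *		anon_vma_unlock_read(anon_vma);
 *		put_anon_vma(anon_vma);
 *	}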
484 */ 485 struct anon_vma *page_get_anon_vma(struct page *page) 486 { 487 struct anon_vma *anon_vma = NULL; 488 unsigned long anon_mapping; 489 490 rcu_read_lock(); 491 anon_mapping = (unsigned long)READ_ONCE(page->mapping); 492 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) 493 goto out; 494 if (!page_mapped(page)) 495 goto out; 496 497 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 498 if (!atomic_inc_not_zero(&anon_vma->refcount)) { 499 anon_vma = NULL; 500 goto out; 501 } 502 503 /* 504 * If this page is still mapped, then its anon_vma cannot have been 505 * freed. But if it has been unmapped, we have no security against the 506 * anon_vma structure being freed and reused (for another anon_vma: 507 * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero() 508 * above cannot corrupt). 509 */ 510 if (!page_mapped(page)) { 511 rcu_read_unlock(); 512 put_anon_vma(anon_vma); 513 return NULL; 514 } 515 out: 516 rcu_read_unlock(); 517 518 return anon_vma; 519 } 520 521 /* 522 * Similar to page_get_anon_vma() except it locks the anon_vma. 523 * 524 * Its a little more complex as it tries to keep the fast path to a single 525 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a 526 * reference like with page_get_anon_vma() and then block on the mutex. 527 */ 528 struct anon_vma *page_lock_anon_vma_read(struct page *page) 529 { 530 struct anon_vma *anon_vma = NULL; 531 struct anon_vma *root_anon_vma; 532 unsigned long anon_mapping; 533 534 rcu_read_lock(); 535 anon_mapping = (unsigned long)READ_ONCE(page->mapping); 536 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) 537 goto out; 538 if (!page_mapped(page)) 539 goto out; 540 541 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 542 root_anon_vma = READ_ONCE(anon_vma->root); 543 if (down_read_trylock(&root_anon_vma->rwsem)) { 544 /* 545 * If the page is still mapped, then this anon_vma is still 546 * its anon_vma, and holding the mutex ensures that it will 547 * not go away, see anon_vma_free(). 548 */ 549 if (!page_mapped(page)) { 550 up_read(&root_anon_vma->rwsem); 551 anon_vma = NULL; 552 } 553 goto out; 554 } 555 556 /* trylock failed, we got to sleep */ 557 if (!atomic_inc_not_zero(&anon_vma->refcount)) { 558 anon_vma = NULL; 559 goto out; 560 } 561 562 if (!page_mapped(page)) { 563 rcu_read_unlock(); 564 put_anon_vma(anon_vma); 565 return NULL; 566 } 567 568 /* we pinned the anon_vma, its safe to sleep */ 569 rcu_read_unlock(); 570 anon_vma_lock_read(anon_vma); 571 572 if (atomic_dec_and_test(&anon_vma->refcount)) { 573 /* 574 * Oops, we held the last refcount, release the lock 575 * and bail -- can't simply use put_anon_vma() because 576 * we'll deadlock on the anon_vma_lock_write() recursion. 577 */ 578 anon_vma_unlock_read(anon_vma); 579 __put_anon_vma(anon_vma); 580 anon_vma = NULL; 581 } 582 583 return anon_vma; 584 585 out: 586 rcu_read_unlock(); 587 return anon_vma; 588 } 589 590 void page_unlock_anon_vma_read(struct anon_vma *anon_vma) 591 { 592 anon_vma_unlock_read(anon_vma); 593 } 594 595 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH 596 /* 597 * Flush TLB entries for recently unmapped pages from remote CPUs. It is 598 * important if a PTE was dirty when it was unmapped that it's flushed 599 * before any IO is initiated on the page to prevent lost writes. Similarly, 600 * it must be flushed before freeing to prevent data leakage. 
601 */ 602 void try_to_unmap_flush(void) 603 { 604 struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; 605 606 if (!tlb_ubc->flush_required) 607 return; 608 609 arch_tlbbatch_flush(&tlb_ubc->arch); 610 tlb_ubc->flush_required = false; 611 tlb_ubc->writable = false; 612 } 613 614 /* Flush iff there are potentially writable TLB entries that can race with IO */ 615 void try_to_unmap_flush_dirty(void) 616 { 617 struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; 618 619 if (tlb_ubc->writable) 620 try_to_unmap_flush(); 621 } 622 623 static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) 624 { 625 struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; 626 627 arch_tlbbatch_add_mm(&tlb_ubc->arch, mm); 628 tlb_ubc->flush_required = true; 629 630 /* 631 * Ensure compiler does not re-order the setting of tlb_flush_batched 632 * before the PTE is cleared. 633 */ 634 barrier(); 635 mm->tlb_flush_batched = true; 636 637 /* 638 * If the PTE was dirty then it's best to assume it's writable. The 639 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush() 640 * before the page is queued for IO. 641 */ 642 if (writable) 643 tlb_ubc->writable = true; 644 } 645 646 /* 647 * Returns true if the TLB flush should be deferred to the end of a batch of 648 * unmap operations to reduce IPIs. 649 */ 650 static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) 651 { 652 bool should_defer = false; 653 654 if (!(flags & TTU_BATCH_FLUSH)) 655 return false; 656 657 /* If remote CPUs need to be flushed then defer batch the flush */ 658 if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) 659 should_defer = true; 660 put_cpu(); 661 662 return should_defer; 663 } 664 665 /* 666 * Reclaim unmaps pages under the PTL but do not flush the TLB prior to 667 * releasing the PTL if TLB flushes are batched. It's possible for a parallel 668 * operation such as mprotect or munmap to race between reclaim unmapping 669 * the page and flushing the page. If this race occurs, it potentially allows 670 * access to data via a stale TLB entry. Tracking all mm's that have TLB 671 * batching in flight would be expensive during reclaim so instead track 672 * whether TLB batching occurred in the past and if so then do a flush here 673 * if required. This will cost one additional flush per reclaim cycle paid 674 * by the first operation at risk such as mprotect and mumap. 675 * 676 * This must be called under the PTL so that an access to tlb_flush_batched 677 * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise 678 * via the PTL. 679 */ 680 void flush_tlb_batched_pending(struct mm_struct *mm) 681 { 682 if (mm->tlb_flush_batched) { 683 flush_tlb_mm(mm); 684 685 /* 686 * Do not allow the compiler to re-order the clearing of 687 * tlb_flush_batched before the tlb is flushed. 688 */ 689 barrier(); 690 mm->tlb_flush_batched = false; 691 } 692 } 693 #else 694 static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) 695 { 696 } 697 698 static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) 699 { 700 return false; 701 } 702 #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ 703 704 /* 705 * At what user virtual address is page expected in vma? 706 * Caller should check the page is actually part of the vma. 
707 */ 708 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 709 { 710 unsigned long address; 711 if (PageAnon(page)) { 712 struct anon_vma *page__anon_vma = page_anon_vma(page); 713 /* 714 * Note: swapoff's unuse_vma() is more efficient with this 715 * check, and needs it to match anon_vma when KSM is active. 716 */ 717 if (!vma->anon_vma || !page__anon_vma || 718 vma->anon_vma->root != page__anon_vma->root) 719 return -EFAULT; 720 } else if (page->mapping) { 721 if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping) 722 return -EFAULT; 723 } else 724 return -EFAULT; 725 address = __vma_address(page, vma); 726 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) 727 return -EFAULT; 728 return address; 729 } 730 731 pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) 732 { 733 pgd_t *pgd; 734 p4d_t *p4d; 735 pud_t *pud; 736 pmd_t *pmd = NULL; 737 pmd_t pmde; 738 739 pgd = pgd_offset(mm, address); 740 if (!pgd_present(*pgd)) 741 goto out; 742 743 p4d = p4d_offset(pgd, address); 744 if (!p4d_present(*p4d)) 745 goto out; 746 747 pud = pud_offset(p4d, address); 748 if (!pud_present(*pud)) 749 goto out; 750 751 pmd = pmd_offset(pud, address); 752 /* 753 * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() 754 * without holding anon_vma lock for write. So when looking for a 755 * genuine pmde (in which to find pte), test present and !THP together. 756 */ 757 pmde = *pmd; 758 barrier(); 759 if (!pmd_present(pmde) || pmd_trans_huge(pmde)) 760 pmd = NULL; 761 out: 762 return pmd; 763 } 764 765 struct page_referenced_arg { 766 int mapcount; 767 int referenced; 768 unsigned long vm_flags; 769 struct mem_cgroup *memcg; 770 }; 771 /* 772 * arg: page_referenced_arg will be passed 773 */ 774 static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, 775 unsigned long address, void *arg) 776 { 777 struct page_referenced_arg *pra = arg; 778 struct page_vma_mapped_walk pvmw = { 779 .page = page, 780 .vma = vma, 781 .address = address, 782 }; 783 int referenced = 0; 784 785 while (page_vma_mapped_walk(&pvmw)) { 786 address = pvmw.address; 787 788 if (vma->vm_flags & VM_LOCKED) { 789 page_vma_mapped_walk_done(&pvmw); 790 pra->vm_flags |= VM_LOCKED; 791 return false; /* To break the loop */ 792 } 793 794 if (pvmw.pte) { 795 if (ptep_clear_flush_young_notify(vma, address, 796 pvmw.pte)) { 797 /* 798 * Don't treat a reference through 799 * a sequentially read mapping as such. 800 * If the page has been used in another mapping, 801 * we will catch it; if this other mapping is 802 * already gone, the unmap path will have set 803 * PG_referenced or activated the page. 804 */ 805 if (likely(!(vma->vm_flags & VM_SEQ_READ))) 806 referenced++; 807 } 808 } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { 809 if (pmdp_clear_flush_young_notify(vma, address, 810 pvmw.pmd)) 811 referenced++; 812 } else { 813 /* unexpected pmd-mapped page? 
*/ 814 WARN_ON_ONCE(1); 815 } 816 817 pra->mapcount--; 818 } 819 820 if (referenced) 821 clear_page_idle(page); 822 if (test_and_clear_page_young(page)) 823 referenced++; 824 825 if (referenced) { 826 pra->referenced++; 827 pra->vm_flags |= vma->vm_flags; 828 } 829 830 if (!pra->mapcount) 831 return false; /* To break the loop */ 832 833 return true; 834 } 835 836 static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) 837 { 838 struct page_referenced_arg *pra = arg; 839 struct mem_cgroup *memcg = pra->memcg; 840 841 if (!mm_match_cgroup(vma->vm_mm, memcg)) 842 return true; 843 844 return false; 845 } 846 847 /** 848 * page_referenced - test if the page was referenced 849 * @page: the page to test 850 * @is_locked: caller holds lock on the page 851 * @memcg: target memory cgroup 852 * @vm_flags: collect encountered vma->vm_flags who actually referenced the page 853 * 854 * Quick test_and_clear_referenced for all mappings to a page, 855 * returns the number of ptes which referenced the page. 856 */ 857 int page_referenced(struct page *page, 858 int is_locked, 859 struct mem_cgroup *memcg, 860 unsigned long *vm_flags) 861 { 862 int we_locked = 0; 863 struct page_referenced_arg pra = { 864 .mapcount = total_mapcount(page), 865 .memcg = memcg, 866 }; 867 struct rmap_walk_control rwc = { 868 .rmap_one = page_referenced_one, 869 .arg = (void *)&pra, 870 .anon_lock = page_lock_anon_vma_read, 871 }; 872 873 *vm_flags = 0; 874 if (!pra.mapcount) 875 return 0; 876 877 if (!page_rmapping(page)) 878 return 0; 879 880 if (!is_locked && (!PageAnon(page) || PageKsm(page))) { 881 we_locked = trylock_page(page); 882 if (!we_locked) 883 return 1; 884 } 885 886 /* 887 * If we are reclaiming on behalf of a cgroup, skip 888 * counting on behalf of references from different 889 * cgroups 890 */ 891 if (memcg) { 892 rwc.invalid_vma = invalid_page_referenced_vma; 893 } 894 895 rmap_walk(page, &rwc); 896 *vm_flags = pra.vm_flags; 897 898 if (we_locked) 899 unlock_page(page); 900 901 return pra.referenced; 902 } 903 904 static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, 905 unsigned long address, void *arg) 906 { 907 struct page_vma_mapped_walk pvmw = { 908 .page = page, 909 .vma = vma, 910 .address = address, 911 .flags = PVMW_SYNC, 912 }; 913 struct mmu_notifier_range range; 914 int *cleaned = arg; 915 916 /* 917 * We have to assume the worse case ie pmd for invalidation. Note that 918 * the page can not be free from this function. 
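	 *
	 * Concretely, the range passed below is
	 *	[address, min(vma->vm_end, address + page_size(page)))
	 * so a PMD-mapped THP is covered in full.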
919 */ 920 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 921 0, vma, vma->vm_mm, address, 922 min(vma->vm_end, address + page_size(page))); 923 mmu_notifier_invalidate_range_start(&range); 924 925 while (page_vma_mapped_walk(&pvmw)) { 926 int ret = 0; 927 928 address = pvmw.address; 929 if (pvmw.pte) { 930 pte_t entry; 931 pte_t *pte = pvmw.pte; 932 933 if (!pte_dirty(*pte) && !pte_write(*pte)) 934 continue; 935 936 flush_cache_page(vma, address, pte_pfn(*pte)); 937 entry = ptep_clear_flush(vma, address, pte); 938 entry = pte_wrprotect(entry); 939 entry = pte_mkclean(entry); 940 set_pte_at(vma->vm_mm, address, pte, entry); 941 ret = 1; 942 } else { 943 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 944 pmd_t *pmd = pvmw.pmd; 945 pmd_t entry; 946 947 if (!pmd_dirty(*pmd) && !pmd_write(*pmd)) 948 continue; 949 950 flush_cache_page(vma, address, page_to_pfn(page)); 951 entry = pmdp_invalidate(vma, address, pmd); 952 entry = pmd_wrprotect(entry); 953 entry = pmd_mkclean(entry); 954 set_pmd_at(vma->vm_mm, address, pmd, entry); 955 ret = 1; 956 #else 957 /* unexpected pmd-mapped page? */ 958 WARN_ON_ONCE(1); 959 #endif 960 } 961 962 /* 963 * No need to call mmu_notifier_invalidate_range() as we are 964 * downgrading page table protection not changing it to point 965 * to a new page. 966 * 967 * See Documentation/vm/mmu_notifier.rst 968 */ 969 if (ret) 970 (*cleaned)++; 971 } 972 973 mmu_notifier_invalidate_range_end(&range); 974 975 return true; 976 } 977 978 static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) 979 { 980 if (vma->vm_flags & VM_SHARED) 981 return false; 982 983 return true; 984 } 985 986 int page_mkclean(struct page *page) 987 { 988 int cleaned = 0; 989 struct address_space *mapping; 990 struct rmap_walk_control rwc = { 991 .arg = (void *)&cleaned, 992 .rmap_one = page_mkclean_one, 993 .invalid_vma = invalid_mkclean_vma, 994 }; 995 996 BUG_ON(!PageLocked(page)); 997 998 if (!page_mapped(page)) 999 return 0; 1000 1001 mapping = page_mapping(page); 1002 if (!mapping) 1003 return 0; 1004 1005 rmap_walk(page, &rwc); 1006 1007 return cleaned; 1008 } 1009 EXPORT_SYMBOL_GPL(page_mkclean); 1010 1011 /** 1012 * page_move_anon_rmap - move a page to our anon_vma 1013 * @page: the page to move to our anon_vma 1014 * @vma: the vma the page belongs to 1015 * 1016 * When a page belongs exclusively to one process after a COW event, 1017 * that page can be moved into the anon_vma that belongs to just that 1018 * process, so the rmap code will not search the parent or sibling 1019 * processes. 1020 */ 1021 void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) 1022 { 1023 struct anon_vma *anon_vma = vma->anon_vma; 1024 1025 page = compound_head(page); 1026 1027 VM_BUG_ON_PAGE(!PageLocked(page), page); 1028 VM_BUG_ON_VMA(!anon_vma, vma); 1029 1030 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 1031 /* 1032 * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written 1033 * simultaneously, so a concurrent reader (eg page_referenced()'s 1034 * PageAnon()) will not see one without the other. 1035 */ 1036 WRITE_ONCE(page->mapping, (struct address_space *) anon_vma); 1037 } 1038 1039 /** 1040 * __page_set_anon_rmap - set up new anonymous rmap 1041 * @page: Page or Hugepage to add to rmap 1042 * @vma: VM area to add page to. 
1043 * @address: User virtual address of the mapping 1044 * @exclusive: the page is exclusively owned by the current process 1045 */ 1046 static void __page_set_anon_rmap(struct page *page, 1047 struct vm_area_struct *vma, unsigned long address, int exclusive) 1048 { 1049 struct anon_vma *anon_vma = vma->anon_vma; 1050 1051 BUG_ON(!anon_vma); 1052 1053 if (PageAnon(page)) 1054 return; 1055 1056 /* 1057 * If the page isn't exclusively mapped into this vma, 1058 * we must use the _oldest_ possible anon_vma for the 1059 * page mapping! 1060 */ 1061 if (!exclusive) 1062 anon_vma = anon_vma->root; 1063 1064 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 1065 page->mapping = (struct address_space *) anon_vma; 1066 page->index = linear_page_index(vma, address); 1067 } 1068 1069 /** 1070 * __page_check_anon_rmap - sanity check anonymous rmap addition 1071 * @page: the page to add the mapping to 1072 * @vma: the vm area in which the mapping is added 1073 * @address: the user virtual address mapped 1074 */ 1075 static void __page_check_anon_rmap(struct page *page, 1076 struct vm_area_struct *vma, unsigned long address) 1077 { 1078 /* 1079 * The page's anon-rmap details (mapping and index) are guaranteed to 1080 * be set up correctly at this point. 1081 * 1082 * We have exclusion against page_add_anon_rmap because the caller 1083 * always holds the page locked, except if called from page_dup_rmap, 1084 * in which case the page is already known to be setup. 1085 * 1086 * We have exclusion against page_add_new_anon_rmap because those pages 1087 * are initially only visible via the pagetables, and the pte is locked 1088 * over the call to page_add_new_anon_rmap. 1089 */ 1090 VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page); 1091 VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address), 1092 page); 1093 } 1094 1095 /** 1096 * page_add_anon_rmap - add pte mapping to an anonymous page 1097 * @page: the page to add the mapping to 1098 * @vma: the vm area in which the mapping is added 1099 * @address: the user virtual address mapped 1100 * @compound: charge the page as compound or small page 1101 * 1102 * The caller needs to hold the pte lock, and the page must be locked in 1103 * the anon_vma case: to serialize mapping,index checking after setting, 1104 * and to ensure that PageAnon is not being upgraded racily to PageKsm 1105 * (but PageKsm is never downgraded to PageAnon). 1106 */ 1107 void page_add_anon_rmap(struct page *page, 1108 struct vm_area_struct *vma, unsigned long address, bool compound) 1109 { 1110 do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0); 1111 } 1112 1113 /* 1114 * Special version of the above for do_swap_page, which often runs 1115 * into pages that are exclusively owned by the current process. 1116 * Everybody else should continue to use page_add_anon_rmap above. 1117 */ 1118 void do_page_add_anon_rmap(struct page *page, 1119 struct vm_area_struct *vma, unsigned long address, int flags) 1120 { 1121 bool compound = flags & RMAP_COMPOUND; 1122 bool first; 1123 1124 if (compound) { 1125 atomic_t *mapcount; 1126 VM_BUG_ON_PAGE(!PageLocked(page), page); 1127 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 1128 mapcount = compound_mapcount_ptr(page); 1129 first = atomic_inc_and_test(mapcount); 1130 } else { 1131 first = atomic_inc_and_test(&page->_mapcount); 1132 } 1133 1134 if (first) { 1135 int nr = compound ? 
hpage_nr_pages(page) : 1; 1136 /* 1137 * We use the irq-unsafe __{inc|mod}_zone_page_stat because 1138 * these counters are not modified in interrupt context, and 1139 * pte lock(a spinlock) is held, which implies preemption 1140 * disabled. 1141 */ 1142 if (compound) 1143 __inc_node_page_state(page, NR_ANON_THPS); 1144 __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); 1145 } 1146 if (unlikely(PageKsm(page))) 1147 return; 1148 1149 VM_BUG_ON_PAGE(!PageLocked(page), page); 1150 1151 /* address might be in next vma when migration races vma_adjust */ 1152 if (first) 1153 __page_set_anon_rmap(page, vma, address, 1154 flags & RMAP_EXCLUSIVE); 1155 else 1156 __page_check_anon_rmap(page, vma, address); 1157 } 1158 1159 /** 1160 * page_add_new_anon_rmap - add pte mapping to a new anonymous page 1161 * @page: the page to add the mapping to 1162 * @vma: the vm area in which the mapping is added 1163 * @address: the user virtual address mapped 1164 * @compound: charge the page as compound or small page 1165 * 1166 * Same as page_add_anon_rmap but must only be called on *new* pages. 1167 * This means the inc-and-test can be bypassed. 1168 * Page does not have to be locked. 1169 */ 1170 void page_add_new_anon_rmap(struct page *page, 1171 struct vm_area_struct *vma, unsigned long address, bool compound) 1172 { 1173 int nr = compound ? hpage_nr_pages(page) : 1; 1174 1175 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); 1176 __SetPageSwapBacked(page); 1177 if (compound) { 1178 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 1179 /* increment count (starts at -1) */ 1180 atomic_set(compound_mapcount_ptr(page), 0); 1181 __inc_node_page_state(page, NR_ANON_THPS); 1182 } else { 1183 /* Anon THP always mapped first with PMD */ 1184 VM_BUG_ON_PAGE(PageTransCompound(page), page); 1185 /* increment count (starts at -1) */ 1186 atomic_set(&page->_mapcount, 0); 1187 } 1188 __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); 1189 __page_set_anon_rmap(page, vma, address, 1); 1190 } 1191 1192 /** 1193 * page_add_file_rmap - add pte mapping to a file page 1194 * @page: the page to add the mapping to 1195 * @compound: charge the page as compound or small page 1196 * 1197 * The caller needs to hold the pte lock. 1198 */ 1199 void page_add_file_rmap(struct page *page, bool compound) 1200 { 1201 int i, nr = 1; 1202 1203 VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); 1204 lock_page_memcg(page); 1205 if (compound && PageTransHuge(page)) { 1206 for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { 1207 if (atomic_inc_and_test(&page[i]._mapcount)) 1208 nr++; 1209 } 1210 if (!atomic_inc_and_test(compound_mapcount_ptr(page))) 1211 goto out; 1212 if (PageSwapBacked(page)) 1213 __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); 1214 else 1215 __inc_node_page_state(page, NR_FILE_PMDMAPPED); 1216 } else { 1217 if (PageTransCompound(page) && page_mapping(page)) { 1218 VM_WARN_ON_ONCE(!PageLocked(page)); 1219 1220 SetPageDoubleMap(compound_head(page)); 1221 if (PageMlocked(page)) 1222 clear_page_mlock(compound_head(page)); 1223 } 1224 if (!atomic_inc_and_test(&page->_mapcount)) 1225 goto out; 1226 } 1227 __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); 1228 out: 1229 unlock_page_memcg(page); 1230 } 1231 1232 static void page_remove_file_rmap(struct page *page, bool compound) 1233 { 1234 int i, nr = 1; 1235 1236 VM_BUG_ON_PAGE(compound && !PageHead(page), page); 1237 lock_page_memcg(page); 1238 1239 /* Hugepages are not counted in NR_FILE_MAPPED for now. 
*/ 1240 if (unlikely(PageHuge(page))) { 1241 /* hugetlb pages are always mapped with pmds */ 1242 atomic_dec(compound_mapcount_ptr(page)); 1243 goto out; 1244 } 1245 1246 /* page still mapped by someone else? */ 1247 if (compound && PageTransHuge(page)) { 1248 for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { 1249 if (atomic_add_negative(-1, &page[i]._mapcount)) 1250 nr++; 1251 } 1252 if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) 1253 goto out; 1254 if (PageSwapBacked(page)) 1255 __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); 1256 else 1257 __dec_node_page_state(page, NR_FILE_PMDMAPPED); 1258 } else { 1259 if (!atomic_add_negative(-1, &page->_mapcount)) 1260 goto out; 1261 } 1262 1263 /* 1264 * We use the irq-unsafe __{inc|mod}_lruvec_page_state because 1265 * these counters are not modified in interrupt context, and 1266 * pte lock(a spinlock) is held, which implies preemption disabled. 1267 */ 1268 __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); 1269 1270 if (unlikely(PageMlocked(page))) 1271 clear_page_mlock(page); 1272 out: 1273 unlock_page_memcg(page); 1274 } 1275 1276 static void page_remove_anon_compound_rmap(struct page *page) 1277 { 1278 int i, nr; 1279 1280 if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) 1281 return; 1282 1283 /* Hugepages are not counted in NR_ANON_PAGES for now. */ 1284 if (unlikely(PageHuge(page))) 1285 return; 1286 1287 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) 1288 return; 1289 1290 __dec_node_page_state(page, NR_ANON_THPS); 1291 1292 if (TestClearPageDoubleMap(page)) { 1293 /* 1294 * Subpages can be mapped with PTEs too. Check how many of 1295 * them are still mapped. 1296 */ 1297 for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { 1298 if (atomic_add_negative(-1, &page[i]._mapcount)) 1299 nr++; 1300 } 1301 1302 /* 1303 * Queue the page for deferred split if at least one small 1304 * page of the compound page is unmapped, but at least one 1305 * small page is still mapped. 1306 */ 1307 if (nr && nr < HPAGE_PMD_NR) 1308 deferred_split_huge_page(page); 1309 } else { 1310 nr = HPAGE_PMD_NR; 1311 } 1312 1313 if (unlikely(PageMlocked(page))) 1314 clear_page_mlock(page); 1315 1316 if (nr) 1317 __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr); 1318 } 1319 1320 /** 1321 * page_remove_rmap - take down pte mapping from a page 1322 * @page: page to remove mapping from 1323 * @compound: uncharge the page as compound or small page 1324 * 1325 * The caller needs to hold the pte lock. 1326 */ 1327 void page_remove_rmap(struct page *page, bool compound) 1328 { 1329 if (!PageAnon(page)) 1330 return page_remove_file_rmap(page, compound); 1331 1332 if (compound) 1333 return page_remove_anon_compound_rmap(page); 1334 1335 /* page still mapped by someone else? */ 1336 if (!atomic_add_negative(-1, &page->_mapcount)) 1337 return; 1338 1339 /* 1340 * We use the irq-unsafe __{inc|mod}_zone_page_stat because 1341 * these counters are not modified in interrupt context, and 1342 * pte lock(a spinlock) is held, which implies preemption disabled. 
1343 */ 1344 __dec_node_page_state(page, NR_ANON_MAPPED); 1345 1346 if (unlikely(PageMlocked(page))) 1347 clear_page_mlock(page); 1348 1349 if (PageTransCompound(page)) 1350 deferred_split_huge_page(compound_head(page)); 1351 1352 /* 1353 * It would be tidy to reset the PageAnon mapping here, 1354 * but that might overwrite a racing page_add_anon_rmap 1355 * which increments mapcount after us but sets mapping 1356 * before us: so leave the reset to free_unref_page, 1357 * and remember that it's only reliable while mapped. 1358 * Leaving it set also helps swapoff to reinstate ptes 1359 * faster for those pages still in swapcache. 1360 */ 1361 } 1362 1363 /* 1364 * @arg: enum ttu_flags will be passed to this argument 1365 */ 1366 static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 1367 unsigned long address, void *arg) 1368 { 1369 struct mm_struct *mm = vma->vm_mm; 1370 struct page_vma_mapped_walk pvmw = { 1371 .page = page, 1372 .vma = vma, 1373 .address = address, 1374 }; 1375 pte_t pteval; 1376 struct page *subpage; 1377 bool ret = true; 1378 struct mmu_notifier_range range; 1379 enum ttu_flags flags = (enum ttu_flags)arg; 1380 1381 /* munlock has nothing to gain from examining un-locked vmas */ 1382 if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) 1383 return true; 1384 1385 if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && 1386 is_zone_device_page(page) && !is_device_private_page(page)) 1387 return true; 1388 1389 if (flags & TTU_SPLIT_HUGE_PMD) { 1390 split_huge_pmd_address(vma, address, 1391 flags & TTU_SPLIT_FREEZE, page); 1392 } 1393 1394 /* 1395 * For THP, we have to assume the worse case ie pmd for invalidation. 1396 * For hugetlb, it could be much worse if we need to do pud 1397 * invalidation in the case of pmd sharing. 1398 * 1399 * Note that the page can not be free in this function as call of 1400 * try_to_unmap() must hold a reference on the page. 1401 */ 1402 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, 1403 address, 1404 min(vma->vm_end, address + page_size(page))); 1405 if (PageHuge(page)) { 1406 /* 1407 * If sharing is possible, start and end will be adjusted 1408 * accordingly. 1409 */ 1410 adjust_range_if_pmd_sharing_possible(vma, &range.start, 1411 &range.end); 1412 } 1413 mmu_notifier_invalidate_range_start(&range); 1414 1415 while (page_vma_mapped_walk(&pvmw)) { 1416 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 1417 /* PMD-mapped THP migration entry */ 1418 if (!pvmw.pte && (flags & TTU_MIGRATION)) { 1419 VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); 1420 1421 set_pmd_migration_entry(&pvmw, page); 1422 continue; 1423 } 1424 #endif 1425 1426 /* 1427 * If the page is mlock()d, we cannot swap it out. 1428 * If it's recently referenced (perhaps page_referenced 1429 * skipped over this mm) then we should reactivate it. 1430 */ 1431 if (!(flags & TTU_IGNORE_MLOCK)) { 1432 if (vma->vm_flags & VM_LOCKED) { 1433 /* PTE-mapped THP are never mlocked */ 1434 if (!PageTransCompound(page)) { 1435 /* 1436 * Holding pte lock, we do *not* need 1437 * mmap_sem here 1438 */ 1439 mlock_vma_page(page); 1440 } 1441 ret = false; 1442 page_vma_mapped_walk_done(&pvmw); 1443 break; 1444 } 1445 if (flags & TTU_MUNLOCK) 1446 continue; 1447 } 1448 1449 /* Unexpected PMD-mapped THP? 
*/ 1450 VM_BUG_ON_PAGE(!pvmw.pte, page); 1451 1452 subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); 1453 address = pvmw.address; 1454 1455 if (PageHuge(page)) { 1456 if (huge_pmd_unshare(mm, &address, pvmw.pte)) { 1457 /* 1458 * huge_pmd_unshare unmapped an entire PMD 1459 * page. There is no way of knowing exactly 1460 * which PMDs may be cached for this mm, so 1461 * we must flush them all. start/end were 1462 * already adjusted above to cover this range. 1463 */ 1464 flush_cache_range(vma, range.start, range.end); 1465 flush_tlb_range(vma, range.start, range.end); 1466 mmu_notifier_invalidate_range(mm, range.start, 1467 range.end); 1468 1469 /* 1470 * The ref count of the PMD page was dropped 1471 * which is part of the way map counting 1472 * is done for shared PMDs. Return 'true' 1473 * here. When there is no other sharing, 1474 * huge_pmd_unshare returns false and we will 1475 * unmap the actual page and drop map count 1476 * to zero. 1477 */ 1478 page_vma_mapped_walk_done(&pvmw); 1479 break; 1480 } 1481 } 1482 1483 if (IS_ENABLED(CONFIG_MIGRATION) && 1484 (flags & TTU_MIGRATION) && 1485 is_zone_device_page(page)) { 1486 swp_entry_t entry; 1487 pte_t swp_pte; 1488 1489 pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte); 1490 1491 /* 1492 * Store the pfn of the page in a special migration 1493 * pte. do_swap_page() will wait until the migration 1494 * pte is removed and then restart fault handling. 1495 */ 1496 entry = make_migration_entry(page, 0); 1497 swp_pte = swp_entry_to_pte(entry); 1498 if (pte_soft_dirty(pteval)) 1499 swp_pte = pte_swp_mksoft_dirty(swp_pte); 1500 set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); 1501 /* 1502 * No need to invalidate here it will synchronize on 1503 * against the special swap migration pte. 1504 * 1505 * The assignment to subpage above was computed from a 1506 * swap PTE which results in an invalid pointer. 1507 * Since only PAGE_SIZE pages can currently be 1508 * migrated, just set it to page. This will need to be 1509 * changed when hugepage migrations to device private 1510 * memory are supported. 1511 */ 1512 subpage = page; 1513 goto discard; 1514 } 1515 1516 if (!(flags & TTU_IGNORE_ACCESS)) { 1517 if (ptep_clear_flush_young_notify(vma, address, 1518 pvmw.pte)) { 1519 ret = false; 1520 page_vma_mapped_walk_done(&pvmw); 1521 break; 1522 } 1523 } 1524 1525 /* Nuke the page table entry. */ 1526 flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); 1527 if (should_defer_flush(mm, flags)) { 1528 /* 1529 * We clear the PTE but do not flush so potentially 1530 * a remote CPU could still be writing to the page. 1531 * If the entry was previously clean then the 1532 * architecture must guarantee that a clear->dirty 1533 * transition on a cached TLB entry is written through 1534 * and traps if the PTE is unmapped. 1535 */ 1536 pteval = ptep_get_and_clear(mm, address, pvmw.pte); 1537 1538 set_tlb_ubc_flush_pending(mm, pte_dirty(pteval)); 1539 } else { 1540 pteval = ptep_clear_flush(vma, address, pvmw.pte); 1541 } 1542 1543 /* Move the dirty bit to the page. Now the pte is gone. 
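		 * If the PTE was dirty, the dirtiness must be transferred to
		 * the struct page before the mapping disappears, otherwise a
		 * write done through the old mapping could be lost when the
		 * page is cleaned or reclaimed.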
*/ 1544 if (pte_dirty(pteval)) 1545 set_page_dirty(page); 1546 1547 /* Update high watermark before we lower rss */ 1548 update_hiwater_rss(mm); 1549 1550 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { 1551 pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); 1552 if (PageHuge(page)) { 1553 hugetlb_count_sub(compound_nr(page), mm); 1554 set_huge_swap_pte_at(mm, address, 1555 pvmw.pte, pteval, 1556 vma_mmu_pagesize(vma)); 1557 } else { 1558 dec_mm_counter(mm, mm_counter(page)); 1559 set_pte_at(mm, address, pvmw.pte, pteval); 1560 } 1561 1562 } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { 1563 /* 1564 * The guest indicated that the page content is of no 1565 * interest anymore. Simply discard the pte, vmscan 1566 * will take care of the rest. 1567 * A future reference will then fault in a new zero 1568 * page. When userfaultfd is active, we must not drop 1569 * this page though, as its main user (postcopy 1570 * migration) will not expect userfaults on already 1571 * copied pages. 1572 */ 1573 dec_mm_counter(mm, mm_counter(page)); 1574 /* We have to invalidate as we cleared the pte */ 1575 mmu_notifier_invalidate_range(mm, address, 1576 address + PAGE_SIZE); 1577 } else if (IS_ENABLED(CONFIG_MIGRATION) && 1578 (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) { 1579 swp_entry_t entry; 1580 pte_t swp_pte; 1581 1582 if (arch_unmap_one(mm, vma, address, pteval) < 0) { 1583 set_pte_at(mm, address, pvmw.pte, pteval); 1584 ret = false; 1585 page_vma_mapped_walk_done(&pvmw); 1586 break; 1587 } 1588 1589 /* 1590 * Store the pfn of the page in a special migration 1591 * pte. do_swap_page() will wait until the migration 1592 * pte is removed and then restart fault handling. 1593 */ 1594 entry = make_migration_entry(subpage, 1595 pte_write(pteval)); 1596 swp_pte = swp_entry_to_pte(entry); 1597 if (pte_soft_dirty(pteval)) 1598 swp_pte = pte_swp_mksoft_dirty(swp_pte); 1599 set_pte_at(mm, address, pvmw.pte, swp_pte); 1600 /* 1601 * No need to invalidate here it will synchronize on 1602 * against the special swap migration pte. 1603 */ 1604 } else if (PageAnon(page)) { 1605 swp_entry_t entry = { .val = page_private(subpage) }; 1606 pte_t swp_pte; 1607 /* 1608 * Store the swap location in the pte. 1609 * See handle_pte_fault() ... 1610 */ 1611 if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) { 1612 WARN_ON_ONCE(1); 1613 ret = false; 1614 /* We have to invalidate as we cleared the pte */ 1615 mmu_notifier_invalidate_range(mm, address, 1616 address + PAGE_SIZE); 1617 page_vma_mapped_walk_done(&pvmw); 1618 break; 1619 } 1620 1621 /* MADV_FREE page check */ 1622 if (!PageSwapBacked(page)) { 1623 if (!PageDirty(page)) { 1624 /* Invalidate as we cleared the pte */ 1625 mmu_notifier_invalidate_range(mm, 1626 address, address + PAGE_SIZE); 1627 dec_mm_counter(mm, MM_ANONPAGES); 1628 goto discard; 1629 } 1630 1631 /* 1632 * If the page was redirtied, it cannot be 1633 * discarded. Remap the page to page table. 
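				 * (This is the MADV_FREE case: the page was
				 * marked lazily freeable via
				 * madvise(addr, len, MADV_FREE) but has been
				 * written to since, so it must be treated as
				 * normal anonymous memory again.)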
1634 */ 1635 set_pte_at(mm, address, pvmw.pte, pteval); 1636 SetPageSwapBacked(page); 1637 ret = false; 1638 page_vma_mapped_walk_done(&pvmw); 1639 break; 1640 } 1641 1642 if (swap_duplicate(entry) < 0) { 1643 set_pte_at(mm, address, pvmw.pte, pteval); 1644 ret = false; 1645 page_vma_mapped_walk_done(&pvmw); 1646 break; 1647 } 1648 if (arch_unmap_one(mm, vma, address, pteval) < 0) { 1649 set_pte_at(mm, address, pvmw.pte, pteval); 1650 ret = false; 1651 page_vma_mapped_walk_done(&pvmw); 1652 break; 1653 } 1654 if (list_empty(&mm->mmlist)) { 1655 spin_lock(&mmlist_lock); 1656 if (list_empty(&mm->mmlist)) 1657 list_add(&mm->mmlist, &init_mm.mmlist); 1658 spin_unlock(&mmlist_lock); 1659 } 1660 dec_mm_counter(mm, MM_ANONPAGES); 1661 inc_mm_counter(mm, MM_SWAPENTS); 1662 swp_pte = swp_entry_to_pte(entry); 1663 if (pte_soft_dirty(pteval)) 1664 swp_pte = pte_swp_mksoft_dirty(swp_pte); 1665 set_pte_at(mm, address, pvmw.pte, swp_pte); 1666 /* Invalidate as we cleared the pte */ 1667 mmu_notifier_invalidate_range(mm, address, 1668 address + PAGE_SIZE); 1669 } else { 1670 /* 1671 * This is a locked file-backed page, thus it cannot 1672 * be removed from the page cache and replaced by a new 1673 * page before mmu_notifier_invalidate_range_end, so no 1674 * concurrent thread might update its page table to 1675 * point at new page while a device still is using this 1676 * page. 1677 * 1678 * See Documentation/vm/mmu_notifier.rst 1679 */ 1680 dec_mm_counter(mm, mm_counter_file(page)); 1681 } 1682 discard: 1683 /* 1684 * No need to call mmu_notifier_invalidate_range() it has be 1685 * done above for all cases requiring it to happen under page 1686 * table lock before mmu_notifier_invalidate_range_end() 1687 * 1688 * See Documentation/vm/mmu_notifier.rst 1689 */ 1690 page_remove_rmap(subpage, PageHuge(page)); 1691 put_page(page); 1692 } 1693 1694 mmu_notifier_invalidate_range_end(&range); 1695 1696 return ret; 1697 } 1698 1699 bool is_vma_temporary_stack(struct vm_area_struct *vma) 1700 { 1701 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); 1702 1703 if (!maybe_stack) 1704 return false; 1705 1706 if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == 1707 VM_STACK_INCOMPLETE_SETUP) 1708 return true; 1709 1710 return false; 1711 } 1712 1713 static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) 1714 { 1715 return is_vma_temporary_stack(vma); 1716 } 1717 1718 static int page_mapcount_is_zero(struct page *page) 1719 { 1720 return !total_mapcount(page); 1721 } 1722 1723 /** 1724 * try_to_unmap - try to remove all page table mappings to a page 1725 * @page: the page to get unmapped 1726 * @flags: action and flags 1727 * 1728 * Tries to remove all the page table entries which are mapping this 1729 * page, used in the pageout path. Caller must hold the page lock. 1730 * 1731 * If unmap is successful, return true. Otherwise, false. 1732 */ 1733 bool try_to_unmap(struct page *page, enum ttu_flags flags) 1734 { 1735 struct rmap_walk_control rwc = { 1736 .rmap_one = try_to_unmap_one, 1737 .arg = (void *)flags, 1738 .done = page_mapcount_is_zero, 1739 .anon_lock = page_lock_anon_vma_read, 1740 }; 1741 1742 /* 1743 * During exec, a temporary VMA is setup and later moved. 1744 * The VMA is moved under the anon_vma lock but not the 1745 * page tables leading to a race where migration cannot 1746 * find the migration ptes. Rather than increasing the 1747 * locking requirements of exec(), migration skips 1748 * temporary VMAs until after exec() completes. 
1749 */ 1750 if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE)) 1751 && !PageKsm(page) && PageAnon(page)) 1752 rwc.invalid_vma = invalid_migration_vma; 1753 1754 if (flags & TTU_RMAP_LOCKED) 1755 rmap_walk_locked(page, &rwc); 1756 else 1757 rmap_walk(page, &rwc); 1758 1759 return !page_mapcount(page) ? true : false; 1760 } 1761 1762 static int page_not_mapped(struct page *page) 1763 { 1764 return !page_mapped(page); 1765 }; 1766 1767 /** 1768 * try_to_munlock - try to munlock a page 1769 * @page: the page to be munlocked 1770 * 1771 * Called from munlock code. Checks all of the VMAs mapping the page 1772 * to make sure nobody else has this page mlocked. The page will be 1773 * returned with PG_mlocked cleared if no other vmas have it mlocked. 1774 */ 1775 1776 void try_to_munlock(struct page *page) 1777 { 1778 struct rmap_walk_control rwc = { 1779 .rmap_one = try_to_unmap_one, 1780 .arg = (void *)TTU_MUNLOCK, 1781 .done = page_not_mapped, 1782 .anon_lock = page_lock_anon_vma_read, 1783 1784 }; 1785 1786 VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); 1787 VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); 1788 1789 rmap_walk(page, &rwc); 1790 } 1791 1792 void __put_anon_vma(struct anon_vma *anon_vma) 1793 { 1794 struct anon_vma *root = anon_vma->root; 1795 1796 anon_vma_free(anon_vma); 1797 if (root != anon_vma && atomic_dec_and_test(&root->refcount)) 1798 anon_vma_free(root); 1799 } 1800 1801 static struct anon_vma *rmap_walk_anon_lock(struct page *page, 1802 struct rmap_walk_control *rwc) 1803 { 1804 struct anon_vma *anon_vma; 1805 1806 if (rwc->anon_lock) 1807 return rwc->anon_lock(page); 1808 1809 /* 1810 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() 1811 * because that depends on page_mapped(); but not all its usages 1812 * are holding mmap_sem. Users without mmap_sem are required to 1813 * take a reference count to prevent the anon_vma disappearing 1814 */ 1815 anon_vma = page_anon_vma(page); 1816 if (!anon_vma) 1817 return NULL; 1818 1819 anon_vma_lock_read(anon_vma); 1820 return anon_vma; 1821 } 1822 1823 /* 1824 * rmap_walk_anon - do something to anonymous page using the object-based 1825 * rmap method 1826 * @page: the page to be handled 1827 * @rwc: control variable according to each walk type 1828 * 1829 * Find all the mappings of a page using the mapping pointer and the vma chains 1830 * contained in the anon_vma struct it points to. 1831 * 1832 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma 1833 * where the page was found will be held for write. So, we won't recheck 1834 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1835 * LOCKED. 1836 */ 1837 static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, 1838 bool locked) 1839 { 1840 struct anon_vma *anon_vma; 1841 pgoff_t pgoff_start, pgoff_end; 1842 struct anon_vma_chain *avc; 1843 1844 if (locked) { 1845 anon_vma = page_anon_vma(page); 1846 /* anon_vma disappear under us? 
*/ 1847 VM_BUG_ON_PAGE(!anon_vma, page); 1848 } else { 1849 anon_vma = rmap_walk_anon_lock(page, rwc); 1850 } 1851 if (!anon_vma) 1852 return; 1853 1854 pgoff_start = page_to_pgoff(page); 1855 pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; 1856 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, 1857 pgoff_start, pgoff_end) { 1858 struct vm_area_struct *vma = avc->vma; 1859 unsigned long address = vma_address(page, vma); 1860 1861 cond_resched(); 1862 1863 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) 1864 continue; 1865 1866 if (!rwc->rmap_one(page, vma, address, rwc->arg)) 1867 break; 1868 if (rwc->done && rwc->done(page)) 1869 break; 1870 } 1871 1872 if (!locked) 1873 anon_vma_unlock_read(anon_vma); 1874 } 1875 1876 /* 1877 * rmap_walk_file - do something to file page using the object-based rmap method 1878 * @page: the page to be handled 1879 * @rwc: control variable according to each walk type 1880 * 1881 * Find all the mappings of a page using the mapping pointer and the vma chains 1882 * contained in the address_space struct it points to. 1883 * 1884 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma 1885 * where the page was found will be held for write. So, we won't recheck 1886 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1887 * LOCKED. 1888 */ 1889 static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, 1890 bool locked) 1891 { 1892 struct address_space *mapping = page_mapping(page); 1893 pgoff_t pgoff_start, pgoff_end; 1894 struct vm_area_struct *vma; 1895 1896 /* 1897 * The page lock not only makes sure that page->mapping cannot 1898 * suddenly be NULLified by truncation, it makes sure that the 1899 * structure at mapping cannot be freed and reused yet, 1900 * so we can safely take mapping->i_mmap_rwsem. 1901 */ 1902 VM_BUG_ON_PAGE(!PageLocked(page), page); 1903 1904 if (!mapping) 1905 return; 1906 1907 pgoff_start = page_to_pgoff(page); 1908 pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; 1909 if (!locked) 1910 i_mmap_lock_read(mapping); 1911 vma_interval_tree_foreach(vma, &mapping->i_mmap, 1912 pgoff_start, pgoff_end) { 1913 unsigned long address = vma_address(page, vma); 1914 1915 cond_resched(); 1916 1917 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) 1918 continue; 1919 1920 if (!rwc->rmap_one(page, vma, address, rwc->arg)) 1921 goto done; 1922 if (rwc->done && rwc->done(page)) 1923 goto done; 1924 } 1925 1926 done: 1927 if (!locked) 1928 i_mmap_unlock_read(mapping); 1929 } 1930 1931 void rmap_walk(struct page *page, struct rmap_walk_control *rwc) 1932 { 1933 if (unlikely(PageKsm(page))) 1934 rmap_walk_ksm(page, rwc); 1935 else if (PageAnon(page)) 1936 rmap_walk_anon(page, rwc, false); 1937 else 1938 rmap_walk_file(page, rwc, false); 1939 } 1940 1941 /* Like rmap_walk, but caller holds relevant rmap lock */ 1942 void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) 1943 { 1944 /* no ksm support for now */ 1945 VM_BUG_ON_PAGE(PageKsm(page), page); 1946 if (PageAnon(page)) 1947 rmap_walk_anon(page, rwc, true); 1948 else 1949 rmap_walk_file(page, rwc, true); 1950 } 1951 1952 #ifdef CONFIG_HUGETLB_PAGE 1953 /* 1954 * The following two functions are for anonymous (private mapped) hugepages. 1955 * Unlike common anonymous pages, anonymous hugepages have no accounting code 1956 * and no lru code, because we handle hugepages differently from common pages. 
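 *
 * As a rough sketch of the intended use (the call sites live in the hugetlb
 * fault/COW paths and in hugetlb page migration):
 *
 *	hugepage_add_new_anon_rmap(new_page, vma, haddr);
 *		for a freshly allocated anonymous hugepage
 *	hugepage_add_anon_rmap(page, vma, address);
 *		when re-establishing a mapping, e.g. from a migration entry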
1957 */ 1958 void hugepage_add_anon_rmap(struct page *page, 1959 struct vm_area_struct *vma, unsigned long address) 1960 { 1961 struct anon_vma *anon_vma = vma->anon_vma; 1962 int first; 1963 1964 BUG_ON(!PageLocked(page)); 1965 BUG_ON(!anon_vma); 1966 /* address might be in next vma when migration races vma_adjust */ 1967 first = atomic_inc_and_test(compound_mapcount_ptr(page)); 1968 if (first) 1969 __page_set_anon_rmap(page, vma, address, 0); 1970 } 1971 1972 void hugepage_add_new_anon_rmap(struct page *page, 1973 struct vm_area_struct *vma, unsigned long address) 1974 { 1975 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1976 atomic_set(compound_mapcount_ptr(page), 0); 1977 __page_set_anon_rmap(page, vma, address, 1); 1978 } 1979 #endif /* CONFIG_HUGETLB_PAGE */ 1980