/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins 2003, 2004
 */

/*
 * Lock ordering in mm:
 *
 * inode->i_mutex	(while writing or truncating, not reading or faulting)
 *   mm->mmap_sem
 *     page->flags PG_locked (lock_page)
 *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
 *         mapping->i_mmap_rwsem
 *           anon_vma->rwsem
 *             mm->page_table_lock or pte_lock
 *               zone_lru_lock (in mark_page_accessed, isolate_lru_page)
 *               swap_lock (in swap_duplicate, swap_info_get)
 *                 mmlist_lock (in mmput, drain_mmlist and others)
 *                 mapping->private_lock (in __set_page_dirty_buffers)
 *                   mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
 *                     mapping->tree_lock (widely used)
 *                 inode->i_lock (in set_page_dirty's __mark_inode_dirty)
 *                 bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
 *                   sb_lock (within inode_lock in fs/fs-writeback.c)
 *                   mapping->tree_lock (widely used, in set_page_dirty,
 *                             in arch-dependent flush_dcache_mmap_lock,
 *                             within bdi.wb->list_lock in __sync_single_inode)
 *
 * anon_vma->rwsem,mapping->i_mutex      (memory_failure, collect_procs_anon)
 *   ->tasklist_lock
 *     pte map lock
 */

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/backing-dev.h>
#include <linux/page_idle.h>

#include <asm/tlbflush.h>

#include <trace/events/tlb.h>

#include "internal.h"

static struct kmem_cache *anon_vma_cachep;
static struct kmem_cache *anon_vma_chain_cachep;

static inline struct anon_vma *anon_vma_alloc(void)
{
	struct anon_vma *anon_vma;

	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
	if (anon_vma) {
		atomic_set(&anon_vma->refcount, 1);
		anon_vma->degree = 1;	/* Reference for first vma */
		anon_vma->parent = anon_vma;
		/*
		 * Initialise the anon_vma root to point to itself. If called
		 * from fork, the root will be reset to the parent's anon_vma.
		 */
		anon_vma->root = anon_vma;
	}

	return anon_vma;
}

static inline void anon_vma_free(struct anon_vma *anon_vma)
{
	VM_BUG_ON(atomic_read(&anon_vma->refcount));

	/*
	 * Synchronize against page_lock_anon_vma_read() such that
	 * we can safely hold the lock without the anon_vma getting
	 * freed.
	 *
	 * Relies on the full mb implied by the atomic_dec_and_test() from
	 * put_anon_vma() against the acquire barrier implied by
	 * down_read_trylock() from page_lock_anon_vma_read(). This orders:
	 *
	 * page_lock_anon_vma_read()	VS	put_anon_vma()
	 *   down_read_trylock()		  atomic_dec_and_test()
	 *   LOCK				  MB
	 *   atomic_read()			  rwsem_is_locked()
	 *
	 * LOCK should suffice since the actual taking of the lock must
	 * happen _before_ what follows.
	 */
	might_sleep();
	if (rwsem_is_locked(&anon_vma->root->rwsem)) {
		anon_vma_lock_write(anon_vma);
		anon_vma_unlock_write(anon_vma);
	}

	kmem_cache_free(anon_vma_cachep, anon_vma);
}

static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
{
	return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
}

static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
{
	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}

static void anon_vma_chain_link(struct vm_area_struct *vma,
				struct anon_vma_chain *avc,
				struct anon_vma *anon_vma)
{
	avc->vma = vma;
	avc->anon_vma = anon_vma;
	list_add(&avc->same_vma, &vma->anon_vma_chain);
	anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}

/**
 * __anon_vma_prepare - attach an anon_vma to a memory region
 * @vma: the memory region in question
 *
 * This makes sure the memory mapping described by 'vma' has
 * an 'anon_vma' attached to it, so that we can associate the
 * anonymous pages mapped into it with that anon_vma.
 *
 * The common case will be that we already have one, which
 * is handled inline by anon_vma_prepare(). But if
 * not we either need to find an adjacent mapping that we
 * can re-use the anon_vma from (very common when the only
 * reason for splitting a vma has been mprotect()), or we
 * allocate a new one.
 *
 * Anon-vma allocations are very subtle, because we may have
 * optimistically looked up an anon_vma in page_lock_anon_vma_read()
 * and that may actually touch the spinlock even in the newly
 * allocated vma (it depends on RCU to make sure that the
 * anon_vma isn't actually destroyed).
 *
 * As a result, we need to do proper anon_vma locking even
 * for the new allocation. At the same time, we do not want
 * to do any locking for the common case of already having
 * an anon_vma.
 *
 * This must be called with the mmap_sem held for reading.
 */
int __anon_vma_prepare(struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;
	struct anon_vma *anon_vma, *allocated;
	struct anon_vma_chain *avc;

	might_sleep();

	avc = anon_vma_chain_alloc(GFP_KERNEL);
	if (!avc)
		goto out_enomem;

	anon_vma = find_mergeable_anon_vma(vma);
	allocated = NULL;
	if (!anon_vma) {
		anon_vma = anon_vma_alloc();
		if (unlikely(!anon_vma))
			goto out_enomem_free_avc;
		allocated = anon_vma;
	}

	anon_vma_lock_write(anon_vma);
	/* page_table_lock to protect against threads */
	spin_lock(&mm->page_table_lock);
	if (likely(!vma->anon_vma)) {
		vma->anon_vma = anon_vma;
		anon_vma_chain_link(vma, avc, anon_vma);
		/* vma reference or self-parent link for new root */
		anon_vma->degree++;
		allocated = NULL;
		avc = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	anon_vma_unlock_write(anon_vma);

	if (unlikely(allocated))
		put_anon_vma(allocated);
	if (unlikely(avc))
		anon_vma_chain_free(avc);

	return 0;

out_enomem_free_avc:
	anon_vma_chain_free(avc);
out_enomem:
	return -ENOMEM;
}

/*
 * This is a useful helper function for locking the anon_vma root as
 * we traverse the vma->anon_vma_chain, looping over anon_vma's that
 * have the same vma.
 *
 * Such anon_vma's should have the same root, so you'd expect to see
 * just a single mutex_lock for the whole traversal.
 */
static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
{
	struct anon_vma *new_root = anon_vma->root;
	if (new_root != root) {
		if (WARN_ON_ONCE(root))
			up_write(&root->rwsem);
		root = new_root;
		down_write(&root->rwsem);
	}
	return root;
}

static inline void unlock_anon_vma_root(struct anon_vma *root)
{
	if (root)
		up_write(&root->rwsem);
}

/*
 * Attach the anon_vmas from src to dst.
 * Returns 0 on success, -ENOMEM on failure.
 *
 * If dst->anon_vma is NULL this function tries to find and reuse an existing
 * anon_vma which has no vmas and only one child anon_vma. This prevents
 * degradation of the anon_vma hierarchy to an endless linear chain in case of
 * a constantly forking task. On the other hand, an anon_vma with more than one
 * child isn't reused even if there is no alive vma, thus the rmap walker has a
 * good chance of avoiding scanning the whole hierarchy when it searches where
 * the page is mapped.
 */
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
	struct anon_vma_chain *avc, *pavc;
	struct anon_vma *root = NULL;

	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma;

		avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
		if (unlikely(!avc)) {
			unlock_anon_vma_root(root);
			root = NULL;
			avc = anon_vma_chain_alloc(GFP_KERNEL);
			if (!avc)
				goto enomem_failure;
		}
		anon_vma = pavc->anon_vma;
		root = lock_anon_vma_root(root, anon_vma);
		anon_vma_chain_link(dst, avc, anon_vma);

		/*
		 * Reuse an existing anon_vma if its degree is lower than two,
		 * which means it has no vma and only one anon_vma child.
		 *
		 * Do not choose the parent anon_vma, otherwise the first child
		 * will always reuse it. The root anon_vma is never reused:
		 * it has a self-parent reference and at least one child.
		 */
		if (!dst->anon_vma && anon_vma != src->anon_vma &&
				anon_vma->degree < 2)
			dst->anon_vma = anon_vma;
	}
	if (dst->anon_vma)
		dst->anon_vma->degree++;
	unlock_anon_vma_root(root);
	return 0;

enomem_failure:
	/*
	 * dst->anon_vma is dropped here otherwise its degree can be incorrectly
	 * decremented in unlink_anon_vmas().
	 * We can safely do this because callers of anon_vma_clone() don't care
	 * about dst->anon_vma if anon_vma_clone() failed.
	 */
	dst->anon_vma = NULL;
	unlink_anon_vmas(dst);
	return -ENOMEM;
}

/*
 * Attach vma to its own anon_vma, as well as to the anon_vmas that
 * the corresponding VMA in the parent process is attached to.
 * Returns 0 on success, non-zero on failure.
 */
int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
	struct anon_vma_chain *avc;
	struct anon_vma *anon_vma;
	int error;

	/* Don't bother if the parent process has no anon_vma here. */
	if (!pvma->anon_vma)
		return 0;

	/* Drop inherited anon_vma, we'll reuse existing or allocate new. */
	vma->anon_vma = NULL;

	/*
	 * First, attach the new VMA to the parent VMA's anon_vmas,
	 * so rmap can find non-COWed pages in child processes.
	 */
	error = anon_vma_clone(vma, pvma);
	if (error)
		return error;

	/* An existing anon_vma has been reused, all done then. */
	if (vma->anon_vma)
		return 0;

	/* Then add our own anon_vma. */
	anon_vma = anon_vma_alloc();
	if (!anon_vma)
		goto out_error;
	avc = anon_vma_chain_alloc(GFP_KERNEL);
	if (!avc)
		goto out_error_free_anon_vma;

	/*
	 * The root anon_vma's spinlock is the lock actually used when we
	 * lock any of the anon_vmas in this anon_vma tree.
	 */
	anon_vma->root = pvma->anon_vma->root;
	anon_vma->parent = pvma->anon_vma;
	/*
	 * With refcounts, an anon_vma can stay around longer than the
	 * process it belongs to. The root anon_vma needs to be pinned until
	 * this anon_vma is freed, because the lock lives in the root.
	 */
	get_anon_vma(anon_vma->root);
	/* Mark this anon_vma as the one where our new (COWed) pages go. */
	vma->anon_vma = anon_vma;
	anon_vma_lock_write(anon_vma);
	anon_vma_chain_link(vma, avc, anon_vma);
	anon_vma->parent->degree++;
	anon_vma_unlock_write(anon_vma);

	return 0;

out_error_free_anon_vma:
	put_anon_vma(anon_vma);
out_error:
	unlink_anon_vmas(vma);
	return -ENOMEM;
}

void unlink_anon_vmas(struct vm_area_struct *vma)
{
	struct anon_vma_chain *avc, *next;
	struct anon_vma *root = NULL;

	/*
	 * Unlink each anon_vma chained to the VMA. This list is ordered
	 * from newest to oldest, ensuring the root anon_vma gets freed last.
	 */
	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma = avc->anon_vma;

		root = lock_anon_vma_root(root, anon_vma);
		anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);

		/*
		 * Leave empty anon_vmas on the list - we'll need
		 * to free them outside the lock.
		 */
		if (RB_EMPTY_ROOT(&anon_vma->rb_root)) {
			anon_vma->parent->degree--;
			continue;
		}

		list_del(&avc->same_vma);
		anon_vma_chain_free(avc);
	}
	if (vma->anon_vma)
		vma->anon_vma->degree--;
	unlock_anon_vma_root(root);

	/*
	 * Iterate the list once more, it now only contains empty and unlinked
	 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
	 * needing to write-acquire the anon_vma->root->rwsem.
	 */
	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma = avc->anon_vma;

		VM_WARN_ON(anon_vma->degree);
		put_anon_vma(anon_vma);

		list_del(&avc->same_vma);
		anon_vma_chain_free(avc);
	}
}

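/*
 * Slab object constructor: initialise the rwsem, the refcount and an empty
 * interval tree root for each anon_vma object the cache hands out.
 */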
static void anon_vma_ctor(void *data)
{
	struct anon_vma *anon_vma = data;

	init_rwsem(&anon_vma->rwsem);
	atomic_set(&anon_vma->refcount, 0);
	anon_vma->rb_root = RB_ROOT;
}

void __init anon_vma_init(void)
{
	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
			anon_vma_ctor);
	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
			SLAB_PANIC|SLAB_ACCOUNT);
}

/*
 * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
 *
 * Since there is no serialization whatsoever against page_remove_rmap()
 * the best this function can do is return a locked anon_vma that might
 * have been relevant to this page.
 *
 * The page might have been remapped to a different anon_vma or the anon_vma
 * returned may already be freed (and even reused).
 *
 * In case it was remapped to a different anon_vma, the new anon_vma will be a
 * child of the old anon_vma, and the anon_vma lifetime rules will therefore
 * ensure that any anon_vma obtained from the page will still be valid for as
 * long as we observe page_mapped() [ hence all those page_mapped() tests ].
 *
 * All users of this function must be very careful when walking the anon_vma
 * chain and verify that the page in question is indeed mapped in it
 * [ something equivalent to page_mapped_in_vma() ].
 *
 * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
 * that the anon_vma pointer from page->mapping is valid if there is a
 * mapcount, we can dereference the anon_vma after observing those.
 */
struct anon_vma *page_get_anon_vma(struct page *page)
{
	struct anon_vma *anon_vma = NULL;
	unsigned long anon_mapping;

	rcu_read_lock();
	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		goto out;
	if (!page_mapped(page))
		goto out;

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
		anon_vma = NULL;
		goto out;
	}

	/*
	 * If this page is still mapped, then its anon_vma cannot have been
	 * freed. But if it has been unmapped, we have no security against the
	 * anon_vma structure being freed and reused (for another anon_vma:
	 * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero()
	 * above cannot corrupt).
	 */
	if (!page_mapped(page)) {
		rcu_read_unlock();
		put_anon_vma(anon_vma);
		return NULL;
	}
out:
	rcu_read_unlock();

	return anon_vma;
}

/*
 * Similar to page_get_anon_vma() except it locks the anon_vma.
 *
 * It's a little more complex as it tries to keep the fast path to a single
 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
 * reference like with page_get_anon_vma() and then block on the mutex.
 */
struct anon_vma *page_lock_anon_vma_read(struct page *page)
{
	struct anon_vma *anon_vma = NULL;
	struct anon_vma *root_anon_vma;
	unsigned long anon_mapping;

	rcu_read_lock();
	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		goto out;
	if (!page_mapped(page))
		goto out;

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	root_anon_vma = READ_ONCE(anon_vma->root);
	if (down_read_trylock(&root_anon_vma->rwsem)) {
		/*
		 * If the page is still mapped, then this anon_vma is still
		 * its anon_vma, and holding the mutex ensures that it will
		 * not go away, see anon_vma_free().
		 */
		if (!page_mapped(page)) {
			up_read(&root_anon_vma->rwsem);
			anon_vma = NULL;
		}
		goto out;
	}

	/* trylock failed, we've got to sleep */
	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
		anon_vma = NULL;
		goto out;
	}

	if (!page_mapped(page)) {
		rcu_read_unlock();
		put_anon_vma(anon_vma);
		return NULL;
	}

	/* we pinned the anon_vma, it's safe to sleep */
	rcu_read_unlock();
	anon_vma_lock_read(anon_vma);

	if (atomic_dec_and_test(&anon_vma->refcount)) {
		/*
		 * Oops, we held the last refcount, release the lock
		 * and bail -- can't simply use put_anon_vma() because
		 * we'll deadlock on the anon_vma_lock_write() recursion.
		 */
		anon_vma_unlock_read(anon_vma);
		__put_anon_vma(anon_vma);
		anon_vma = NULL;
	}

	return anon_vma;

out:
	rcu_read_unlock();
	return anon_vma;
}

void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
{
	anon_vma_unlock_read(anon_vma);
}

#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
 * Flush TLB entries for recently unmapped pages from remote CPUs. It is
 * important if a PTE was dirty when it was unmapped that it's flushed
 * before any IO is initiated on the page to prevent lost writes. Similarly,
 * it must be flushed before freeing to prevent data leakage.
 */
void try_to_unmap_flush(void)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
	int cpu;

	if (!tlb_ubc->flush_required)
		return;

	cpu = get_cpu();

	if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) {
		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
		local_flush_tlb();
		trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
	}

	if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids)
		flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL);
	cpumask_clear(&tlb_ubc->cpumask);
	tlb_ubc->flush_required = false;
	tlb_ubc->writable = false;
	put_cpu();
}

/* Flush iff there are potentially writable TLB entries that can race with IO */
void try_to_unmap_flush_dirty(void)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

	if (tlb_ubc->writable)
		try_to_unmap_flush();
}

static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

	cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
	tlb_ubc->flush_required = true;

	/*
	 * If the PTE was dirty then it's best to assume it's writable. The
	 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
	 * before the page is queued for IO.
	 */
	if (writable)
		tlb_ubc->writable = true;
}

/*
 * Returns true if the TLB flush should be deferred to the end of a batch of
 * unmap operations to reduce IPIs.
 */
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
	bool should_defer = false;

	if (!(flags & TTU_BATCH_FLUSH))
		return false;

	/* If remote CPUs need to be flushed then defer the flush */
	if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
		should_defer = true;
	put_cpu();

	return should_defer;
}
#else
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
}

static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
	return false;
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */

/*
 * At what user virtual address is page expected in vma?
 * Caller should check the page is actually part of the vma.
 */
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
	unsigned long address;
	if (PageAnon(page)) {
		struct anon_vma *page__anon_vma = page_anon_vma(page);
		/*
		 * Note: swapoff's unuse_vma() is more efficient with this
		 * check, and needs it to match anon_vma when KSM is active.
		 */
		if (!vma->anon_vma || !page__anon_vma ||
		    vma->anon_vma->root != page__anon_vma->root)
			return -EFAULT;
	} else if (page->mapping) {
		if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
			return -EFAULT;
	} else
		return -EFAULT;
	address = __vma_address(page, vma);
	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
		return -EFAULT;
	return address;
}

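/*
 * Walk the page tables of @mm and return a pointer to the pmd covering
 * @address, or NULL if no pmd is present or if it currently maps a
 * transparent huge page (in which case there is no pte level to walk).
 */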
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd = NULL;
	pmd_t pmde;

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	/*
	 * Some THP functions use the sequence pmdp_huge_clear_flush(),
	 * set_pmd_at() without holding anon_vma lock for write. So when
	 * looking for a genuine pmde (in which to find pte), test present
	 * and !THP together.
	 */
	pmde = *pmd;
	barrier();
	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
		pmd = NULL;
out:
	return pmd;
}

struct page_referenced_arg {
	int mapcount;
	int referenced;
	unsigned long vm_flags;
	struct mem_cgroup *memcg;
};
/*
 * arg: page_referenced_arg will be passed
 */
static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
			       unsigned long address, void *arg)
{
	struct page_referenced_arg *pra = arg;
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
		.address = address,
	};
	int referenced = 0;

	while (page_vma_mapped_walk(&pvmw)) {
		address = pvmw.address;

		if (vma->vm_flags & VM_LOCKED) {
			page_vma_mapped_walk_done(&pvmw);
			pra->vm_flags |= VM_LOCKED;
			return SWAP_FAIL; /* To break the loop */
		}

		if (pvmw.pte) {
			if (ptep_clear_flush_young_notify(vma, address,
						pvmw.pte)) {
				/*
				 * Don't treat a reference through
				 * a sequentially read mapping as such.
				 * If the page has been used in another mapping,
				 * we will catch it; if this other mapping is
				 * already gone, the unmap path will have set
				 * PG_referenced or activated the page.
				 */
				if (likely(!(vma->vm_flags & VM_SEQ_READ)))
					referenced++;
			}
		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
			if (pmdp_clear_flush_young_notify(vma, address,
						pvmw.pmd))
				referenced++;
		} else {
			/* unexpected pmd-mapped page? */
			WARN_ON_ONCE(1);
		}

		pra->mapcount--;
	}

	if (referenced)
		clear_page_idle(page);
	if (test_and_clear_page_young(page))
		referenced++;

	if (referenced) {
		pra->referenced++;
		pra->vm_flags |= vma->vm_flags;
	}

	if (!pra->mapcount)
		return SWAP_SUCCESS; /* To break the loop */

	return SWAP_AGAIN;
}

static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
{
	struct page_referenced_arg *pra = arg;
	struct mem_cgroup *memcg = pra->memcg;

	if (!mm_match_cgroup(vma->vm_mm, memcg))
		return true;

	return false;
}

/**
 * page_referenced - test if the page was referenced
 * @page: the page to test
 * @is_locked: caller holds lock on the page
 * @memcg: target memory cgroup
 * @vm_flags: collect the vm_flags of the vmas which actually referenced the page
 *
 * Quick test_and_clear_referenced for all mappings to a page,
 * returns the number of ptes which referenced the page.
 */
int page_referenced(struct page *page,
		    int is_locked,
		    struct mem_cgroup *memcg,
		    unsigned long *vm_flags)
{
	int ret;
	int we_locked = 0;
	struct page_referenced_arg pra = {
		.mapcount = total_mapcount(page),
		.memcg = memcg,
	};
	struct rmap_walk_control rwc = {
		.rmap_one = page_referenced_one,
		.arg = (void *)&pra,
		.anon_lock = page_lock_anon_vma_read,
	};

	*vm_flags = 0;
	if (!page_mapped(page))
		return 0;

	if (!page_rmapping(page))
		return 0;

	if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
		we_locked = trylock_page(page);
		if (!we_locked)
			return 1;
	}

	/*
	 * If we are reclaiming on behalf of a cgroup, skip
	 * counting on behalf of references from different
	 * cgroups.
	 */
	if (memcg) {
		rwc.invalid_vma = invalid_page_referenced_vma;
	}

	ret = rmap_walk(page, &rwc);
	*vm_flags = pra.vm_flags;

	if (we_locked)
		unlock_page(page);

	return pra.referenced;
}

static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
			    unsigned long address, void *arg)
{
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
		.address = address,
		.flags = PVMW_SYNC,
	};
	int *cleaned = arg;

	while (page_vma_mapped_walk(&pvmw)) {
		int ret = 0;
		address = pvmw.address;
		if (pvmw.pte) {
			pte_t entry;
			pte_t *pte = pvmw.pte;

			if (!pte_dirty(*pte) && !pte_write(*pte))
				continue;

			flush_cache_page(vma, address, pte_pfn(*pte));
			entry = ptep_clear_flush(vma, address, pte);
			entry = pte_wrprotect(entry);
			entry = pte_mkclean(entry);
			set_pte_at(vma->vm_mm, address, pte, entry);
			ret = 1;
		} else {
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
			pmd_t *pmd = pvmw.pmd;
			pmd_t entry;

			if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
				continue;

			flush_cache_page(vma, address, page_to_pfn(page));
			entry = pmdp_huge_clear_flush(vma, address, pmd);
			entry = pmd_wrprotect(entry);
			entry = pmd_mkclean(entry);
			set_pmd_at(vma->vm_mm, address, pmd, entry);
			ret = 1;
#else
			/* unexpected pmd-mapped page? */
			WARN_ON_ONCE(1);
#endif
		}

		if (ret) {
			mmu_notifier_invalidate_page(vma->vm_mm, address);
			(*cleaned)++;
		}
	}

	return SWAP_AGAIN;
}

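/*
 * Filter for the page_mkclean() rmap walk: writes through a non-shared
 * (private, COW) mapping never dirty the file page itself, so only VMAs
 * with VM_SHARED are worth visiting.
 */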
static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
{
	if (vma->vm_flags & VM_SHARED)
		return false;

	return true;
}

int page_mkclean(struct page *page)
{
	int cleaned = 0;
	struct address_space *mapping;
	struct rmap_walk_control rwc = {
		.arg = (void *)&cleaned,
		.rmap_one = page_mkclean_one,
		.invalid_vma = invalid_mkclean_vma,
	};

	BUG_ON(!PageLocked(page));

	if (!page_mapped(page))
		return 0;

	mapping = page_mapping(page);
	if (!mapping)
		return 0;

	rmap_walk(page, &rwc);

	return cleaned;
}
EXPORT_SYMBOL_GPL(page_mkclean);

/**
 * page_move_anon_rmap - move a page to our anon_vma
 * @page:	the page to move to our anon_vma
 * @vma:	the vma the page belongs to
 *
 * When a page belongs exclusively to one process after a COW event,
 * that page can be moved into the anon_vma that belongs to just that
 * process, so the rmap code will not search the parent or sibling
 * processes.
 */
void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	page = compound_head(page);

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_VMA(!anon_vma, vma);

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	/*
	 * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
	 * simultaneously, so a concurrent reader (e.g. page_referenced()'s
	 * PageAnon()) will not see one without the other.
	 */
	WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
}

/**
 * __page_set_anon_rmap - set up new anonymous rmap
 * @page:	Page to add to rmap
 * @vma:	VM area to add page to.
 * @address:	User virtual address of the mapping
 * @exclusive:	the page is exclusively owned by the current process
 */
static void __page_set_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int exclusive)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	BUG_ON(!anon_vma);

	if (PageAnon(page))
		return;

	/*
	 * If the page isn't exclusively mapped into this vma,
	 * we must use the _oldest_ possible anon_vma for the
	 * page mapping!
	 */
	if (!exclusive)
		anon_vma = anon_vma->root;

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;
	page->index = linear_page_index(vma, address);
}

/**
 * __page_check_anon_rmap - sanity check anonymous rmap addition
 * @page:	the page to add the mapping to
 * @vma:	the vm area in which the mapping is added
 * @address:	the user virtual address mapped
 */
static void __page_check_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
#ifdef CONFIG_DEBUG_VM
	/*
	 * The page's anon-rmap details (mapping and index) are guaranteed to
	 * be set up correctly at this point.
	 *
	 * We have exclusion against page_add_anon_rmap because the caller
	 * always holds the page locked, except if called from page_dup_rmap,
	 * in which case the page is already known to be set up.
	 *
	 * We have exclusion against page_add_new_anon_rmap because those pages
	 * are initially only visible via the pagetables, and the pte is locked
	 * over the call to page_add_new_anon_rmap.
	 */
	BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
	BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
#endif
}

/**
 * page_add_anon_rmap - add pte mapping to an anonymous page
 * @page:	the page to add the mapping to
 * @vma:	the vm area in which the mapping is added
 * @address:	the user virtual address mapped
 * @compound:	charge the page as compound or small page
 *
 * The caller needs to hold the pte lock, and the page must be locked in
 * the anon_vma case: to serialize mapping,index checking after setting,
 * and to ensure that PageAnon is not being upgraded racily to PageKsm
 * (but PageKsm is never downgraded to PageAnon).
 */
void page_add_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, bool compound)
{
	do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
}

/*
 * Special version of the above for do_swap_page, which often runs
 * into pages that are exclusively owned by the current process.
 * Everybody else should continue to use page_add_anon_rmap above.
 */
void do_page_add_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int flags)
{
	bool compound = flags & RMAP_COMPOUND;
	bool first;

	if (compound) {
		atomic_t *mapcount;
		VM_BUG_ON_PAGE(!PageLocked(page), page);
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
		mapcount = compound_mapcount_ptr(page);
		first = atomic_inc_and_test(mapcount);
	} else {
		first = atomic_inc_and_test(&page->_mapcount);
	}

	if (first) {
		int nr = compound ? hpage_nr_pages(page) : 1;
		/*
		 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
		 * these counters are not modified in interrupt context, and
		 * pte lock (a spinlock) is held, which implies preemption
		 * disabled.
		 */
		if (compound)
			__inc_node_page_state(page, NR_ANON_THPS);
		__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
	}
	if (unlikely(PageKsm(page)))
		return;

	VM_BUG_ON_PAGE(!PageLocked(page), page);

	/* address might be in next vma when migration races vma_adjust */
	if (first)
		__page_set_anon_rmap(page, vma, address,
				flags & RMAP_EXCLUSIVE);
	else
		__page_check_anon_rmap(page, vma, address);
}

/**
 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
 * @page:	the page to add the mapping to
 * @vma:	the vm area in which the mapping is added
 * @address:	the user virtual address mapped
 * @compound:	charge the page as compound or small page
 *
 * Same as page_add_anon_rmap but must only be called on *new* pages.
 * This means the inc-and-test can be bypassed.
 * Page does not have to be locked.
 */
void page_add_new_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, bool compound)
{
	int nr = compound ? hpage_nr_pages(page) : 1;

	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
	__SetPageSwapBacked(page);
	if (compound) {
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
		/* increment count (starts at -1) */
		atomic_set(compound_mapcount_ptr(page), 0);
		__inc_node_page_state(page, NR_ANON_THPS);
	} else {
		/* Anon THP always mapped first with PMD */
		VM_BUG_ON_PAGE(PageTransCompound(page), page);
		/* increment count (starts at -1) */
		atomic_set(&page->_mapcount, 0);
	}
	__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
	__page_set_anon_rmap(page, vma, address, 1);
}

/**
 * page_add_file_rmap - add pte mapping to a file page
 * @page:	the page to add the mapping to
 * @compound:	charge the page as compound or small page
 *
 * The caller needs to hold the pte lock.
 */
void page_add_file_rmap(struct page *page, bool compound)
{
	int i, nr = 1;

	VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
	lock_page_memcg(page);
	if (compound && PageTransHuge(page)) {
		for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
			if (atomic_inc_and_test(&page[i]._mapcount))
				nr++;
		}
		if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
			goto out;
		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
		__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
	} else {
		if (PageTransCompound(page) && page_mapping(page)) {
			VM_WARN_ON_ONCE(!PageLocked(page));

			SetPageDoubleMap(compound_head(page));
			if (PageMlocked(page))
				clear_page_mlock(compound_head(page));
		}
		if (!atomic_inc_and_test(&page->_mapcount))
			goto out;
	}
	__mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr);
	mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
out:
	unlock_page_memcg(page);
}

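/*
 * Counterpart of page_add_file_rmap(): drop one pte (or pmd) mapping of a
 * file page and, once the last mapping of that kind is gone, update the
 * NR_FILE_MAPPED / NR_SHMEM_PMDMAPPED statistics. Called with the pte lock
 * held, via page_remove_rmap().
 */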
static void page_remove_file_rmap(struct page *page, bool compound)
{
	int i, nr = 1;

	VM_BUG_ON_PAGE(compound && !PageHead(page), page);
	lock_page_memcg(page);

	/* Hugepages are not counted in NR_FILE_MAPPED for now. */
	if (unlikely(PageHuge(page))) {
		/* hugetlb pages are always mapped with pmds */
		atomic_dec(compound_mapcount_ptr(page));
		goto out;
	}

	/* page still mapped by someone else? */
	if (compound && PageTransHuge(page)) {
		for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
			if (atomic_add_negative(-1, &page[i]._mapcount))
				nr++;
		}
		if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
			goto out;
		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
		__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
	} else {
		if (!atomic_add_negative(-1, &page->_mapcount))
			goto out;
	}

	/*
	 * We use the irq-unsafe __{inc|mod}_zone_page_state because
	 * these counters are not modified in interrupt context, and
	 * pte lock (a spinlock) is held, which implies preemption disabled.
	 */
	__mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr);
	mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);

	if (unlikely(PageMlocked(page)))
		clear_page_mlock(page);
out:
	unlock_page_memcg(page);
}

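/*
 * Drop the compound (pmd-level) mapping of an anonymous THP. If subpages
 * remain pte-mapped (PageDoubleMap), the subpages that are still mapped stay
 * counted in NR_ANON_MAPPED and the page is queued for deferred splitting.
 */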
static void page_remove_anon_compound_rmap(struct page *page)
{
	int i, nr;

	if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
		return;

	/* Hugepages are not counted in NR_ANON_PAGES for now. */
	if (unlikely(PageHuge(page)))
		return;

	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
		return;

	__dec_node_page_state(page, NR_ANON_THPS);

	if (TestClearPageDoubleMap(page)) {
		/*
		 * Subpages can be mapped with PTEs too. Check how many of
		 * them are still mapped.
		 */
		for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
			if (atomic_add_negative(-1, &page[i]._mapcount))
				nr++;
		}
	} else {
		nr = HPAGE_PMD_NR;
	}

	if (unlikely(PageMlocked(page)))
		clear_page_mlock(page);

	if (nr) {
		__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
		deferred_split_huge_page(page);
	}
}

/**
 * page_remove_rmap - take down pte mapping from a page
 * @page:	page to remove mapping from
 * @compound:	uncharge the page as compound or small page
 *
 * The caller needs to hold the pte lock.
 */
void page_remove_rmap(struct page *page, bool compound)
{
	if (!PageAnon(page))
		return page_remove_file_rmap(page, compound);

	if (compound)
		return page_remove_anon_compound_rmap(page);

	/* page still mapped by someone else? */
	if (!atomic_add_negative(-1, &page->_mapcount))
		return;

	/*
	 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
	 * these counters are not modified in interrupt context, and
	 * pte lock (a spinlock) is held, which implies preemption disabled.
	 */
	__dec_node_page_state(page, NR_ANON_MAPPED);

	if (unlikely(PageMlocked(page)))
		clear_page_mlock(page);

	if (PageTransCompound(page))
		deferred_split_huge_page(compound_head(page));

	/*
	 * It would be tidy to reset the PageAnon mapping here,
	 * but that might overwrite a racing page_add_anon_rmap
	 * which increments mapcount after us but sets mapping
	 * before us: so leave the reset to free_hot_cold_page,
	 * and remember that it's only reliable while mapped.
	 * Leaving it set also helps swapoff to reinstate ptes
	 * faster for those pages still in swapcache.
	 */
}

struct rmap_private {
	enum ttu_flags flags;
	int lazyfreed;
};

/*
 * @arg: enum ttu_flags will be passed to this argument
 */
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
			    unsigned long address, void *arg)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
		.address = address,
	};
	pte_t pteval;
	struct page *subpage;
	int ret = SWAP_AGAIN;
	struct rmap_private *rp = arg;
	enum ttu_flags flags = rp->flags;

	/* munlock has nothing to gain from examining un-locked vmas */
	if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
		return SWAP_AGAIN;

	if (flags & TTU_SPLIT_HUGE_PMD) {
		split_huge_pmd_address(vma, address,
				flags & TTU_MIGRATION, page);
	}

	while (page_vma_mapped_walk(&pvmw)) {
		subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
		address = pvmw.address;

		/* Unexpected PMD-mapped THP? */
		VM_BUG_ON_PAGE(!pvmw.pte, page);

		/*
		 * If the page is mlock()d, we cannot swap it out.
		 * If it's recently referenced (perhaps page_referenced
		 * skipped over this mm) then we should reactivate it.
		 */
		if (!(flags & TTU_IGNORE_MLOCK)) {
			if (vma->vm_flags & VM_LOCKED) {
				/* PTE-mapped THP are never mlocked */
				if (!PageTransCompound(page)) {
					/*
					 * Holding pte lock, we do *not* need
					 * mmap_sem here
					 */
					mlock_vma_page(page);
				}
				ret = SWAP_MLOCK;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
			if (flags & TTU_MUNLOCK)
				continue;
		}

		if (!(flags & TTU_IGNORE_ACCESS)) {
			if (ptep_clear_flush_young_notify(vma, address,
						pvmw.pte)) {
				ret = SWAP_FAIL;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
		}

		/* Nuke the page table entry. */
		flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
		if (should_defer_flush(mm, flags)) {
			/*
			 * We clear the PTE but do not flush so potentially
			 * a remote CPU could still be writing to the page.
			 * If the entry was previously clean then the
			 * architecture must guarantee that a clear->dirty
			 * transition on a cached TLB entry is written through
			 * and traps if the PTE is unmapped.
			 */
			pteval = ptep_get_and_clear(mm, address, pvmw.pte);

			set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
		} else {
			pteval = ptep_clear_flush(vma, address, pvmw.pte);
		}

		/* Move the dirty bit to the page. Now the pte is gone. */
		if (pte_dirty(pteval))
			set_page_dirty(page);

		/* Update high watermark before we lower rss */
		update_hiwater_rss(mm);

		if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
			if (PageHuge(page)) {
				int nr = 1 << compound_order(page);
				hugetlb_count_sub(nr, mm);
			} else {
				dec_mm_counter(mm, mm_counter(page));
			}

			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
			set_pte_at(mm, address, pvmw.pte, pteval);
		} else if (pte_unused(pteval)) {
			/*
			 * The guest indicated that the page content is of no
			 * interest anymore. Simply discard the pte, vmscan
			 * will take care of the rest.
			 */
			dec_mm_counter(mm, mm_counter(page));
		} else if (IS_ENABLED(CONFIG_MIGRATION) &&
				(flags & TTU_MIGRATION)) {
			swp_entry_t entry;
			pte_t swp_pte;
			/*
			 * Store the pfn of the page in a special migration
			 * pte. do_swap_page() will wait until the migration
			 * pte is removed and then restart fault handling.
			 */
			entry = make_migration_entry(subpage,
					pte_write(pteval));
			swp_pte = swp_entry_to_pte(entry);
			if (pte_soft_dirty(pteval))
				swp_pte = pte_swp_mksoft_dirty(swp_pte);
			set_pte_at(mm, address, pvmw.pte, swp_pte);
		} else if (PageAnon(page)) {
			swp_entry_t entry = { .val = page_private(subpage) };
			pte_t swp_pte;
			/*
			 * Store the swap location in the pte.
			 * See handle_pte_fault() ...
			 */
			VM_BUG_ON_PAGE(!PageSwapCache(page), page);

			if (!PageDirty(page) && (flags & TTU_LZFREE)) {
				/* It's a freeable page by MADV_FREE */
				dec_mm_counter(mm, MM_ANONPAGES);
				rp->lazyfreed++;
				goto discard;
			}

			if (swap_duplicate(entry) < 0) {
				set_pte_at(mm, address, pvmw.pte, pteval);
				ret = SWAP_FAIL;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
			if (list_empty(&mm->mmlist)) {
				spin_lock(&mmlist_lock);
				if (list_empty(&mm->mmlist))
					list_add(&mm->mmlist, &init_mm.mmlist);
				spin_unlock(&mmlist_lock);
			}
			dec_mm_counter(mm, MM_ANONPAGES);
			inc_mm_counter(mm, MM_SWAPENTS);
			swp_pte = swp_entry_to_pte(entry);
			if (pte_soft_dirty(pteval))
				swp_pte = pte_swp_mksoft_dirty(swp_pte);
			set_pte_at(mm, address, pvmw.pte, swp_pte);
		} else
			dec_mm_counter(mm, mm_counter_file(page));
discard:
		page_remove_rmap(subpage, PageHuge(page));
		put_page(page);
		mmu_notifier_invalidate_page(mm, address);
	}
	return ret;
}

bool is_vma_temporary_stack(struct vm_area_struct *vma)
{
	int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);

	if (!maybe_stack)
		return false;

	if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
						VM_STACK_INCOMPLETE_SETUP)
		return true;

	return false;
}

static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
{
	return is_vma_temporary_stack(vma);
}

static int page_mapcount_is_zero(struct page *page)
{
	return !total_mapcount(page);
}

/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 * @flags: action and flags
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path. Caller must hold the page lock.
 * Return values are:
 *
 * SWAP_SUCCESS	- we succeeded in removing all mappings
 * SWAP_AGAIN	- we missed a mapping, try again later
 * SWAP_FAIL	- the page is unswappable
 * SWAP_MLOCK	- page is mlocked.
 */
int try_to_unmap(struct page *page, enum ttu_flags flags)
{
	int ret;
	struct rmap_private rp = {
		.flags = flags,
		.lazyfreed = 0,
	};

	struct rmap_walk_control rwc = {
		.rmap_one = try_to_unmap_one,
		.arg = &rp,
		.done = page_mapcount_is_zero,
		.anon_lock = page_lock_anon_vma_read,
	};

	/*
	 * During exec, a temporary VMA is set up and later moved.
	 * The VMA is moved under the anon_vma lock but not the
	 * page tables leading to a race where migration cannot
	 * find the migration ptes. Rather than increasing the
	 * locking requirements of exec(), migration skips
	 * temporary VMAs until after exec() completes.
	 */
	if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page))
		rwc.invalid_vma = invalid_migration_vma;

	if (flags & TTU_RMAP_LOCKED)
		ret = rmap_walk_locked(page, &rwc);
	else
		ret = rmap_walk(page, &rwc);

	if (ret != SWAP_MLOCK && !page_mapcount(page)) {
		ret = SWAP_SUCCESS;
		if (rp.lazyfreed && !PageDirty(page))
			ret = SWAP_LZFREE;
	}
	return ret;
}

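/*
 * rmap_walk_control::done callback used by try_to_munlock(): stop the walk
 * as soon as the page has no mappings left.
 */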
static int page_not_mapped(struct page *page)
{
	return !page_mapped(page);
}

/**
 * try_to_munlock - try to munlock a page
 * @page: the page to be munlocked
 *
 * Called from munlock code. Checks all of the VMAs mapping the page
 * to make sure nobody else has this page mlocked. The page will be
 * returned with PG_mlocked cleared if no other vmas have it mlocked.
 *
 * Return values are:
 *
 * SWAP_AGAIN	- no vma is holding page mlocked, or,
 * SWAP_AGAIN	- page mapped in mlocked vma -- couldn't acquire mmap sem
 * SWAP_FAIL	- page cannot be located at present
 * SWAP_MLOCK	- page is now mlocked.
 */
int try_to_munlock(struct page *page)
{
	int ret;
	struct rmap_private rp = {
		.flags = TTU_MUNLOCK,
		.lazyfreed = 0,
	};

	struct rmap_walk_control rwc = {
		.rmap_one = try_to_unmap_one,
		.arg = &rp,
		.done = page_not_mapped,
		.anon_lock = page_lock_anon_vma_read,
	};

	VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);

	ret = rmap_walk(page, &rwc);
	return ret;
}

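/*
 * Final teardown once the refcount has dropped to zero (called via
 * put_anon_vma()): free the anon_vma and release its pin on the root,
 * freeing the root as well when this was the last reference to it.
 */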
void __put_anon_vma(struct anon_vma *anon_vma)
{
	struct anon_vma *root = anon_vma->root;

	anon_vma_free(anon_vma);
	if (root != anon_vma && atomic_dec_and_test(&root->refcount))
		anon_vma_free(root);
}

static struct anon_vma *rmap_walk_anon_lock(struct page *page,
					struct rmap_walk_control *rwc)
{
	struct anon_vma *anon_vma;

	if (rwc->anon_lock)
		return rwc->anon_lock(page);

	/*
	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
	 * because that depends on page_mapped(); but not all its usages
	 * are holding mmap_sem. Users without mmap_sem are required to
	 * take a reference count to prevent the anon_vma disappearing.
	 */
	anon_vma = page_anon_vma(page);
	if (!anon_vma)
		return NULL;

	anon_vma_lock_read(anon_vma);
	return anon_vma;
}

/*
 * rmap_walk_anon - do something to anonymous page using the object-based
 * rmap method
 * @page: the page to be handled
 * @rwc: control variable according to each walk type
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the anon_vma struct it points to.
 *
 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
 * where the page was found will be held for write. So, we won't recheck
 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
 * VM_LOCKED.
 */
static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
		bool locked)
{
	struct anon_vma *anon_vma;
	pgoff_t pgoff_start, pgoff_end;
	struct anon_vma_chain *avc;
	int ret = SWAP_AGAIN;

	if (locked) {
		anon_vma = page_anon_vma(page);
		/* anon_vma disappeared under us? */
		VM_BUG_ON_PAGE(!anon_vma, page);
	} else {
		anon_vma = rmap_walk_anon_lock(page, rwc);
	}
	if (!anon_vma)
		return ret;

	pgoff_start = page_to_pgoff(page);
	pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
			pgoff_start, pgoff_end) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long address = vma_address(page, vma);

		cond_resched();

		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
			continue;

		ret = rwc->rmap_one(page, vma, address, rwc->arg);
		if (ret != SWAP_AGAIN)
			break;
		if (rwc->done && rwc->done(page))
			break;
	}

	if (!locked)
		anon_vma_unlock_read(anon_vma);
	return ret;
}

/*
 * rmap_walk_file - do something to file page using the object-based rmap method
 * @page: the page to be handled
 * @rwc: control variable according to each walk type
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the address_space struct it points to.
 *
 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
 * where the page was found will be held for write. So, we won't recheck
 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
 * VM_LOCKED.
 */
static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
		bool locked)
{
	struct address_space *mapping = page_mapping(page);
	pgoff_t pgoff_start, pgoff_end;
	struct vm_area_struct *vma;
	int ret = SWAP_AGAIN;

	/*
	 * The page lock not only makes sure that page->mapping cannot
	 * suddenly be NULLified by truncation, it makes sure that the
	 * structure at mapping cannot be freed and reused yet,
	 * so we can safely take mapping->i_mmap_rwsem.
	 */
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (!mapping)
		return ret;

	pgoff_start = page_to_pgoff(page);
	pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
	if (!locked)
		i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap,
			pgoff_start, pgoff_end) {
		unsigned long address = vma_address(page, vma);

		cond_resched();

		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
			continue;

		ret = rwc->rmap_one(page, vma, address, rwc->arg);
		if (ret != SWAP_AGAIN)
			goto done;
		if (rwc->done && rwc->done(page))
			goto done;
	}

done:
	if (!locked)
		i_mmap_unlock_read(mapping);
	return ret;
}

int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
{
	if (unlikely(PageKsm(page)))
		return rmap_walk_ksm(page, rwc);
	else if (PageAnon(page))
		return rmap_walk_anon(page, rwc, false);
	else
		return rmap_walk_file(page, rwc, false);
}

/* Like rmap_walk, but caller holds relevant rmap lock */
int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
{
	/* no ksm support for now */
	VM_BUG_ON_PAGE(PageKsm(page), page);
	if (PageAnon(page))
		return rmap_walk_anon(page, rwc, true);
	else
		return rmap_walk_file(page, rwc, true);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * The following three functions are for anonymous (private mapped) hugepages.
 * Unlike common anonymous pages, anonymous hugepages have no accounting code
 * and no lru code, because we handle hugepages differently from common pages.
 */
static void __hugepage_set_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int exclusive)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	BUG_ON(!anon_vma);

	if (PageAnon(page))
		return;
	if (!exclusive)
		anon_vma = anon_vma->root;

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;
	page->index = linear_page_index(vma, address);
}

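/*
 * hugepage_add_anon_rmap() adds a pmd-level mapping of an anonymous hugetlb
 * page; only the first mapping sets up page->mapping and page->index.
 * hugepage_add_new_anon_rmap() is the counterpart for a freshly allocated
 * (exclusive) hugetlb page.
 */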
void hugepage_add_anon_rmap(struct page *page,
			    struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = vma->anon_vma;
	int first;

	BUG_ON(!PageLocked(page));
	BUG_ON(!anon_vma);
	/* address might be in next vma when migration races vma_adjust */
	first = atomic_inc_and_test(compound_mapcount_ptr(page));
	if (first)
		__hugepage_set_anon_rmap(page, vma, address, 0);
}

void hugepage_add_new_anon_rmap(struct page *page,
			struct vm_area_struct *vma, unsigned long address)
{
	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
	atomic_set(compound_mapcount_ptr(page), 0);
	__hugepage_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */