/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins 2003, 2004
 */

/*
 * Lock ordering in mm:
 *
 * inode->i_mutex	(while writing or truncating, not reading or faulting)
 *   mm->mmap_sem
 *     page->flags PG_locked (lock_page)
 *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
 *         mapping->i_mmap_rwsem
 *           anon_vma->rwsem
 *             mm->page_table_lock or pte_lock
 *               zone_lru_lock (in mark_page_accessed, isolate_lru_page)
 *               swap_lock (in swap_duplicate, swap_info_get)
 *                 mmlist_lock (in mmput, drain_mmlist and others)
 *                 mapping->private_lock (in __set_page_dirty_buffers)
 *                   mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
 *                     mapping->tree_lock (widely used)
 *                 inode->i_lock (in set_page_dirty's __mark_inode_dirty)
 *                 bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
 *                   sb_lock (within inode_lock in fs/fs-writeback.c)
 *                   mapping->tree_lock (widely used, in set_page_dirty,
 *                             in arch-dependent flush_dcache_mmap_lock,
 *                             within bdi.wb->list_lock in __sync_single_inode)
 *
 * anon_vma->rwsem,mapping->i_mutex      (memory_failure, collect_procs_anon)
 *   ->tasklist_lock
 *     pte map lock
 */

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/backing-dev.h>
#include <linux/page_idle.h>
#include <linux/memremap.h>

#include <asm/tlbflush.h>

#include <trace/events/tlb.h>

#include "internal.h"

static struct kmem_cache *anon_vma_cachep;
static struct kmem_cache *anon_vma_chain_cachep;

static inline struct anon_vma *anon_vma_alloc(void)
{
	struct anon_vma *anon_vma;

	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
	if (anon_vma) {
		atomic_set(&anon_vma->refcount, 1);
		anon_vma->degree = 1;	/* Reference for first vma */
		anon_vma->parent = anon_vma;
		/*
		 * Initialise the anon_vma root to point to itself. If called
		 * from fork, the root will be reset to the parent's anon_vma.
		 */
		anon_vma->root = anon_vma;
	}

	return anon_vma;
}

static inline void anon_vma_free(struct anon_vma *anon_vma)
{
	VM_BUG_ON(atomic_read(&anon_vma->refcount));

	/*
	 * Synchronize against page_lock_anon_vma_read() such that
	 * we can safely hold the lock without the anon_vma getting
	 * freed.
	 *
	 * Relies on the full mb implied by the atomic_dec_and_test() from
	 * put_anon_vma() against the acquire barrier implied by
	 * down_read_trylock() from page_lock_anon_vma_read().
	 * This orders:
	 *
	 * page_lock_anon_vma_read()	VS	put_anon_vma()
	 *   down_read_trylock()		  atomic_dec_and_test()
	 *   LOCK				  MB
	 *   atomic_read()			  rwsem_is_locked()
	 *
	 * LOCK should suffice since the actual taking of the lock must
	 * happen _before_ what follows.
	 */
	might_sleep();
	if (rwsem_is_locked(&anon_vma->root->rwsem)) {
		anon_vma_lock_write(anon_vma);
		anon_vma_unlock_write(anon_vma);
	}

	kmem_cache_free(anon_vma_cachep, anon_vma);
}

static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
{
	return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
}

static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
{
	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}

static void anon_vma_chain_link(struct vm_area_struct *vma,
				struct anon_vma_chain *avc,
				struct anon_vma *anon_vma)
{
	avc->vma = vma;
	avc->anon_vma = anon_vma;
	list_add(&avc->same_vma, &vma->anon_vma_chain);
	anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}

/**
 * __anon_vma_prepare - attach an anon_vma to a memory region
 * @vma: the memory region in question
 *
 * This makes sure the memory mapping described by 'vma' has
 * an 'anon_vma' attached to it, so that we can associate the
 * anonymous pages mapped into it with that anon_vma.
 *
 * The common case will be that we already have one, which
 * is handled inline by anon_vma_prepare(). But if
 * not we either need to find an adjacent mapping that we
 * can re-use the anon_vma from (very common when the only
 * reason for splitting a vma has been mprotect()), or we
 * allocate a new one.
 *
 * Anon-vma allocations are very subtle, because we may have
 * optimistically looked up an anon_vma in page_lock_anon_vma_read()
 * and that may actually touch the spinlock even in the newly
 * allocated vma (it depends on RCU to make sure that the
 * anon_vma isn't actually destroyed).
 *
 * As a result, we need to do proper anon_vma locking even
 * for the new allocation. At the same time, we do not want
 * to do any locking for the common case of already having
 * an anon_vma.
 *
 * This must be called with the mmap_sem held for reading.
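 *
 * As an illustrative example of the common entry point (not an exhaustive
 * list of callers): the first fault that installs an anonymous page in a
 * vma goes through anon_vma_prepare(), which only drops into this slow
 * path while vma->anon_vma is still NULL.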
 */
int __anon_vma_prepare(struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;
	struct anon_vma *anon_vma, *allocated;
	struct anon_vma_chain *avc;

	might_sleep();

	avc = anon_vma_chain_alloc(GFP_KERNEL);
	if (!avc)
		goto out_enomem;

	anon_vma = find_mergeable_anon_vma(vma);
	allocated = NULL;
	if (!anon_vma) {
		anon_vma = anon_vma_alloc();
		if (unlikely(!anon_vma))
			goto out_enomem_free_avc;
		allocated = anon_vma;
	}

	anon_vma_lock_write(anon_vma);
	/* page_table_lock to protect against threads */
	spin_lock(&mm->page_table_lock);
	if (likely(!vma->anon_vma)) {
		vma->anon_vma = anon_vma;
		anon_vma_chain_link(vma, avc, anon_vma);
		/* vma reference or self-parent link for new root */
		anon_vma->degree++;
		allocated = NULL;
		avc = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	anon_vma_unlock_write(anon_vma);

	if (unlikely(allocated))
		put_anon_vma(allocated);
	if (unlikely(avc))
		anon_vma_chain_free(avc);

	return 0;

 out_enomem_free_avc:
	anon_vma_chain_free(avc);
 out_enomem:
	return -ENOMEM;
}

/*
 * This is a useful helper function for locking the anon_vma root as
 * we traverse the vma->anon_vma_chain, looping over anon_vma's that
 * have the same vma.
 *
 * Such anon_vma's should have the same root, so you'd expect to see
 * just a single mutex_lock for the whole traversal.
 */
static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
{
	struct anon_vma *new_root = anon_vma->root;
	if (new_root != root) {
		if (WARN_ON_ONCE(root))
			up_write(&root->rwsem);
		root = new_root;
		down_write(&root->rwsem);
	}
	return root;
}

static inline void unlock_anon_vma_root(struct anon_vma *root)
{
	if (root)
		up_write(&root->rwsem);
}

/*
 * Attach the anon_vmas from src to dst.
 * Returns 0 on success, -ENOMEM on failure.
 *
 * If dst->anon_vma is NULL this function tries to find and reuse an existing
 * anon_vma which has no vmas and only one child anon_vma. This prevents
 * degradation of the anon_vma hierarchy to an endless linear chain in case
 * of a constantly forking task. On the other hand, an anon_vma with more
 * than one child isn't reused even if there was no alive vma, thus the rmap
 * walker has a good chance of avoiding scanning the whole hierarchy when it
 * searches where the page is mapped.
 */
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
	struct anon_vma_chain *avc, *pavc;
	struct anon_vma *root = NULL;

	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma;

		avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
		if (unlikely(!avc)) {
			unlock_anon_vma_root(root);
			root = NULL;
			avc = anon_vma_chain_alloc(GFP_KERNEL);
			if (!avc)
				goto enomem_failure;
		}
		anon_vma = pavc->anon_vma;
		root = lock_anon_vma_root(root, anon_vma);
		anon_vma_chain_link(dst, avc, anon_vma);

		/*
		 * Reuse existing anon_vma if its degree is lower than two,
		 * which means it has no vma and only one anon_vma child.
		 *
		 * Do not choose the parent anon_vma, otherwise the first
		 * child will always reuse it. Root anon_vma is never reused:
		 * it has self-parent reference and at least one child.
		 */
		if (!dst->anon_vma && anon_vma != src->anon_vma &&
				anon_vma->degree < 2)
			dst->anon_vma = anon_vma;
	}
	if (dst->anon_vma)
		dst->anon_vma->degree++;
	unlock_anon_vma_root(root);
	return 0;

 enomem_failure:
	/*
	 * dst->anon_vma is dropped here otherwise its degree can be incorrectly
	 * decremented in unlink_anon_vmas().
	 * We can safely do this because callers of anon_vma_clone() don't care
	 * about dst->anon_vma if anon_vma_clone() failed.
	 */
	dst->anon_vma = NULL;
	unlink_anon_vmas(dst);
	return -ENOMEM;
}

/*
 * Attach vma to its own anon_vma, as well as to the anon_vmas that
 * the corresponding VMA in the parent process is attached to.
 * Returns 0 on success, non-zero on failure.
 */
int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
	struct anon_vma_chain *avc;
	struct anon_vma *anon_vma;
	int error;

	/* Don't bother if the parent process has no anon_vma here. */
	if (!pvma->anon_vma)
		return 0;

	/* Drop inherited anon_vma, we'll reuse existing or allocate new. */
	vma->anon_vma = NULL;

	/*
	 * First, attach the new VMA to the parent VMA's anon_vmas,
	 * so rmap can find non-COWed pages in child processes.
	 */
	error = anon_vma_clone(vma, pvma);
	if (error)
		return error;

	/* An existing anon_vma has been reused, all done then. */
	if (vma->anon_vma)
		return 0;

	/* Then add our own anon_vma. */
	anon_vma = anon_vma_alloc();
	if (!anon_vma)
		goto out_error;
	avc = anon_vma_chain_alloc(GFP_KERNEL);
	if (!avc)
		goto out_error_free_anon_vma;

	/*
	 * The root anon_vma's spinlock is the lock actually used when we
	 * lock any of the anon_vmas in this anon_vma tree.
	 */
	anon_vma->root = pvma->anon_vma->root;
	anon_vma->parent = pvma->anon_vma;
	/*
	 * With refcounts, an anon_vma can stay around longer than the
	 * process it belongs to. The root anon_vma needs to be pinned until
	 * this anon_vma is freed, because the lock lives in the root.
	 */
	get_anon_vma(anon_vma->root);
	/* Mark this anon_vma as the one where our new (COWed) pages go. */
	vma->anon_vma = anon_vma;
	anon_vma_lock_write(anon_vma);
	anon_vma_chain_link(vma, avc, anon_vma);
	anon_vma->parent->degree++;
	anon_vma_unlock_write(anon_vma);

	return 0;

 out_error_free_anon_vma:
	put_anon_vma(anon_vma);
 out_error:
	unlink_anon_vmas(vma);
	return -ENOMEM;
}
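/*
 * Sketch of the result of a successful anon_vma_fork() above, assuming no
 * anon_vma was reused inside anon_vma_clone(): the child vma ends up
 * chained, via anon_vma_chains, to every anon_vma of the parent vma, plus
 * one newly allocated anon_vma of its own whose ->parent is the parent's
 * anon_vma and whose ->root points into the same hierarchy.
 */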
void unlink_anon_vmas(struct vm_area_struct *vma)
{
	struct anon_vma_chain *avc, *next;
	struct anon_vma *root = NULL;

	/*
	 * Unlink each anon_vma chained to the VMA. This list is ordered
	 * from newest to oldest, ensuring the root anon_vma gets freed last.
	 */
	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma = avc->anon_vma;

		root = lock_anon_vma_root(root, anon_vma);
		anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);

		/*
		 * Leave empty anon_vmas on the list - we'll need
		 * to free them outside the lock.
		 */
		if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
			anon_vma->parent->degree--;
			continue;
		}

		list_del(&avc->same_vma);
		anon_vma_chain_free(avc);
	}
	if (vma->anon_vma)
		vma->anon_vma->degree--;
	unlock_anon_vma_root(root);

	/*
	 * Iterate the list once more, it now only contains empty and unlinked
	 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
	 * needing to write-acquire the anon_vma->root->rwsem.
	 */
	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma = avc->anon_vma;

		VM_WARN_ON(anon_vma->degree);
		put_anon_vma(anon_vma);

		list_del(&avc->same_vma);
		anon_vma_chain_free(avc);
	}
}

static void anon_vma_ctor(void *data)
{
	struct anon_vma *anon_vma = data;

	init_rwsem(&anon_vma->rwsem);
	atomic_set(&anon_vma->refcount, 0);
	anon_vma->rb_root = RB_ROOT_CACHED;
}

void __init anon_vma_init(void)
{
	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
			0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
			anon_vma_ctor);
	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
			SLAB_PANIC|SLAB_ACCOUNT);
}
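/*
 * A note on SLAB_TYPESAFE_BY_RCU (used above), summarised here as a
 * reading aid for the lookups below: it does not delay the freeing of an
 * individual anon_vma until an RCU grace period has elapsed, it only
 * guarantees that the memory keeps holding an object of the same type
 * while RCU readers may still hold a stale pointer to it.  The anon_vma a
 * reader reaches may therefore already have been recycled for another
 * anon_vma, which is why the functions below re-check page_mapped() and
 * take references with atomic_inc_not_zero().
 */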
/*
 * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
 *
 * Since there is no serialization whatsoever against page_remove_rmap()
 * the best this function can do is return a locked anon_vma that might
 * have been relevant to this page.
 *
 * The page might have been remapped to a different anon_vma or the anon_vma
 * returned may already be freed (and even reused).
 *
 * In case it was remapped to a different anon_vma, the new anon_vma will be a
 * child of the old anon_vma, and the anon_vma lifetime rules will therefore
 * ensure that any anon_vma obtained from the page will still be valid for as
 * long as we observe page_mapped() [ hence all those page_mapped() tests ].
 *
 * All users of this function must be very careful when walking the anon_vma
 * chain and verify that the page in question is indeed mapped in it
 * [ something equivalent to page_mapped_in_vma() ].
 *
 * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
 * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
 * if there is a mapcount, we can dereference the anon_vma after observing
 * those.
 */
struct anon_vma *page_get_anon_vma(struct page *page)
{
	struct anon_vma *anon_vma = NULL;
	unsigned long anon_mapping;

	rcu_read_lock();
	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		goto out;
	if (!page_mapped(page))
		goto out;

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
		anon_vma = NULL;
		goto out;
	}

	/*
	 * If this page is still mapped, then its anon_vma cannot have been
	 * freed. But if it has been unmapped, we have no security against the
	 * anon_vma structure being freed and reused (for another anon_vma:
	 * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
	 * above cannot corrupt).
	 */
	if (!page_mapped(page)) {
		rcu_read_unlock();
		put_anon_vma(anon_vma);
		return NULL;
	}
out:
	rcu_read_unlock();

	return anon_vma;
}

/*
 * Similar to page_get_anon_vma() except it locks the anon_vma.
 *
 * It's a little more complex as it tries to keep the fast path to a single
 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
 * reference like with page_get_anon_vma() and then block on the mutex.
 */
struct anon_vma *page_lock_anon_vma_read(struct page *page)
{
	struct anon_vma *anon_vma = NULL;
	struct anon_vma *root_anon_vma;
	unsigned long anon_mapping;

	rcu_read_lock();
	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		goto out;
	if (!page_mapped(page))
		goto out;

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	root_anon_vma = READ_ONCE(anon_vma->root);
	if (down_read_trylock(&root_anon_vma->rwsem)) {
		/*
		 * If the page is still mapped, then this anon_vma is still
		 * its anon_vma, and holding the mutex ensures that it will
		 * not go away, see anon_vma_free().
		 */
		if (!page_mapped(page)) {
			up_read(&root_anon_vma->rwsem);
			anon_vma = NULL;
		}
		goto out;
	}

	/* trylock failed, we have to sleep */
	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
		anon_vma = NULL;
		goto out;
	}

	if (!page_mapped(page)) {
		rcu_read_unlock();
		put_anon_vma(anon_vma);
		return NULL;
	}

	/* we pinned the anon_vma, it's safe to sleep */
	rcu_read_unlock();
	anon_vma_lock_read(anon_vma);

	if (atomic_dec_and_test(&anon_vma->refcount)) {
		/*
		 * Oops, we held the last refcount, release the lock
		 * and bail -- can't simply use put_anon_vma() because
		 * we'll deadlock on the anon_vma_lock_write() recursion.
		 */
		anon_vma_unlock_read(anon_vma);
		__put_anon_vma(anon_vma);
		anon_vma = NULL;
	}

	return anon_vma;

out:
	rcu_read_unlock();
	return anon_vma;
}

void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
{
	anon_vma_unlock_read(anon_vma);
}

#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
 * Flush TLB entries for recently unmapped pages from remote CPUs. If a PTE
 * was dirty when it was unmapped, it is important that it is flushed before
 * any IO is initiated on the page, to prevent lost writes. Similarly, it
 * must be flushed before freeing to prevent data leakage.
 */
void try_to_unmap_flush(void)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

	if (!tlb_ubc->flush_required)
		return;

	arch_tlbbatch_flush(&tlb_ubc->arch);
	tlb_ubc->flush_required = false;
	tlb_ubc->writable = false;
}

/* Flush iff there are potentially writable TLB entries that can race with IO */
void try_to_unmap_flush_dirty(void)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

	if (tlb_ubc->writable)
		try_to_unmap_flush();
}

static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

	arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
	tlb_ubc->flush_required = true;

	/*
	 * Ensure compiler does not re-order the setting of tlb_flush_batched
	 * before the PTE is cleared.
	 */
	barrier();
	mm->tlb_flush_batched = true;

	/*
	 * If the PTE was dirty then it's best to assume it's writable. The
	 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
	 * before the page is queued for IO.
	 */
	if (writable)
		tlb_ubc->writable = true;
}

/*
 * Returns true if the TLB flush should be deferred to the end of a batch of
 * unmap operations to reduce IPIs.
 */
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
	bool should_defer = false;

	if (!(flags & TTU_BATCH_FLUSH))
		return false;

	/* If remote CPUs need to be flushed then defer the flush by batching it */
	if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
		should_defer = true;
	put_cpu();

	return should_defer;
}

/*
 * Reclaim unmaps pages under the PTL but does not flush the TLB prior to
 * releasing the PTL if TLB flushes are batched. It's possible for a parallel
 * operation such as mprotect or munmap to race between reclaim unmapping
 * the page and flushing the page. If this race occurs, it potentially allows
 * access to data via a stale TLB entry. Tracking all mm's that have TLB
 * batching in flight would be expensive during reclaim so instead track
 * whether TLB batching occurred in the past and if so then do a flush here
 * if required. This will cost one additional flush per reclaim cycle paid
 * by the first operation at risk such as mprotect and munmap.
 *
 * This must be called under the PTL so that an access to tlb_flush_batched
 * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
 * via the PTL.
 */
void flush_tlb_batched_pending(struct mm_struct *mm)
{
	if (mm->tlb_flush_batched) {
		flush_tlb_mm(mm);

		/*
		 * Do not allow the compiler to re-order the clearing of
		 * tlb_flush_batched before the tlb is flushed.
		 */
		barrier();
		mm->tlb_flush_batched = false;
	}
}
#else
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
}

static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
	return false;
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
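/*
 * Rough sketch of how the batching pieces above fit together (assuming
 * CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH is enabled): try_to_unmap_one()
 * asks should_defer_flush() whether it may skip the immediate per-PTE
 * flush, records the deferred work with set_tlb_ubc_flush_pending(), and
 * the reclaim path later calls try_to_unmap_flush_dirty() or
 * try_to_unmap_flush() before starting IO on, or freeing, the batch of
 * pages.
 */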
/*
 * At what user virtual address is page expected in vma?
 * Caller should check the page is actually part of the vma.
 */
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
	unsigned long address;
	if (PageAnon(page)) {
		struct anon_vma *page__anon_vma = page_anon_vma(page);
		/*
		 * Note: swapoff's unuse_vma() is more efficient with this
		 * check, and needs it to match anon_vma when KSM is active.
		 */
		if (!vma->anon_vma || !page__anon_vma ||
		    vma->anon_vma->root != page__anon_vma->root)
			return -EFAULT;
	} else if (page->mapping) {
		if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
			return -EFAULT;
	} else
		return -EFAULT;
	address = __vma_address(page, vma);
	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
		return -EFAULT;
	return address;
}

pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd = NULL;
	pmd_t pmde;

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out;

	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		goto out;

	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	/*
	 * Some THP functions use the sequence pmdp_huge_clear_flush(),
	 * set_pmd_at() without holding anon_vma lock for write. So when
	 * looking for a genuine pmde (in which to find pte), test present
	 * and !THP together.
	 */
	pmde = *pmd;
	barrier();
	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
		pmd = NULL;
out:
	return pmd;
}

struct page_referenced_arg {
	int mapcount;
	int referenced;
	unsigned long vm_flags;
	struct mem_cgroup *memcg;
};
/*
 * arg: page_referenced_arg will be passed
 */
static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
			unsigned long address, void *arg)
{
	struct page_referenced_arg *pra = arg;
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
		.address = address,
	};
	int referenced = 0;

	while (page_vma_mapped_walk(&pvmw)) {
		address = pvmw.address;

		if (vma->vm_flags & VM_LOCKED) {
			page_vma_mapped_walk_done(&pvmw);
			pra->vm_flags |= VM_LOCKED;
			return false; /* To break the loop */
		}

		if (pvmw.pte) {
			if (ptep_clear_flush_young_notify(vma, address,
						pvmw.pte)) {
				/*
				 * Don't treat a reference through
				 * a sequentially read mapping as such.
				 * If the page has been used in another mapping,
				 * we will catch it; if this other mapping is
				 * already gone, the unmap path will have set
				 * PG_referenced or activated the page.
				 */
				if (likely(!(vma->vm_flags & VM_SEQ_READ)))
					referenced++;
			}
		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
			if (pmdp_clear_flush_young_notify(vma, address,
						pvmw.pmd))
				referenced++;
		} else {
			/* unexpected pmd-mapped page? */
			WARN_ON_ONCE(1);
		}

		pra->mapcount--;
	}

	if (referenced)
		clear_page_idle(page);
	if (test_and_clear_page_young(page))
		referenced++;

	if (referenced) {
		pra->referenced++;
		pra->vm_flags |= vma->vm_flags;
	}

	if (!pra->mapcount)
		return false; /* To break the loop */

	return true;
}

static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
{
	struct page_referenced_arg *pra = arg;
	struct mem_cgroup *memcg = pra->memcg;

	if (!mm_match_cgroup(vma->vm_mm, memcg))
		return true;

	return false;
}

/**
 * page_referenced - test if the page was referenced
 * @page: the page to test
 * @is_locked: caller holds lock on the page
 * @memcg: target memory cgroup
 * @vm_flags: collects the vm_flags of the vmas which actually referenced the page
 *
 * Quick test_and_clear_referenced for all mappings to a page,
 * returns the number of ptes which referenced the page.
 */
int page_referenced(struct page *page,
		    int is_locked,
		    struct mem_cgroup *memcg,
		    unsigned long *vm_flags)
{
	int we_locked = 0;
	struct page_referenced_arg pra = {
		.mapcount = total_mapcount(page),
		.memcg = memcg,
	};
	struct rmap_walk_control rwc = {
		.rmap_one = page_referenced_one,
		.arg = (void *)&pra,
		.anon_lock = page_lock_anon_vma_read,
	};

	*vm_flags = 0;
	if (!page_mapped(page))
		return 0;

	if (!page_rmapping(page))
		return 0;

	if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
		we_locked = trylock_page(page);
		if (!we_locked)
			return 1;
	}

	/*
	 * If we are reclaiming on behalf of a cgroup, skip
	 * counting on behalf of references from different
	 * cgroups.
	 */
	if (memcg) {
		rwc.invalid_vma = invalid_page_referenced_vma;
	}

	rmap_walk(page, &rwc);
	*vm_flags = pra.vm_flags;

	if (we_locked)
		unlock_page(page);

	return pra.referenced;
}
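/*
 * Illustrative caller (not the only one): page reclaim uses
 * page_referenced() on a locked page to decide whether it was recently
 * used; the returned count and the collected vm_flags (e.g. VM_EXEC)
 * feed its activation heuristics.
 */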
static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
			    unsigned long address, void *arg)
{
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
		.address = address,
		.flags = PVMW_SYNC,
	};
	unsigned long start = address, end;
	int *cleaned = arg;

	/*
	 * We have to assume the worst case, i.e. pmd, for invalidation. Note
	 * that the page cannot be freed from this function.
	 */
	end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
	mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);

	while (page_vma_mapped_walk(&pvmw)) {
		unsigned long cstart, cend;
		int ret = 0;

		cstart = address = pvmw.address;
		if (pvmw.pte) {
			pte_t entry;
			pte_t *pte = pvmw.pte;

			if (!pte_dirty(*pte) && !pte_write(*pte))
				continue;

			flush_cache_page(vma, address, pte_pfn(*pte));
			entry = ptep_clear_flush(vma, address, pte);
			entry = pte_wrprotect(entry);
			entry = pte_mkclean(entry);
			set_pte_at(vma->vm_mm, address, pte, entry);
			cend = cstart + PAGE_SIZE;
			ret = 1;
		} else {
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
			pmd_t *pmd = pvmw.pmd;
			pmd_t entry;

			if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
				continue;

			flush_cache_page(vma, address, page_to_pfn(page));
			entry = pmdp_huge_clear_flush(vma, address, pmd);
			entry = pmd_wrprotect(entry);
			entry = pmd_mkclean(entry);
			set_pmd_at(vma->vm_mm, address, pmd, entry);
			cstart &= PMD_MASK;
			cend = cstart + PMD_SIZE;
			ret = 1;
#else
			/* unexpected pmd-mapped page? */
			WARN_ON_ONCE(1);
#endif
		}

		if (ret) {
			mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend);
			(*cleaned)++;
		}
	}

	mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);

	return true;
}

static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
{
	if (vma->vm_flags & VM_SHARED)
		return false;

	return true;
}

int page_mkclean(struct page *page)
{
	int cleaned = 0;
	struct address_space *mapping;
	struct rmap_walk_control rwc = {
		.arg = (void *)&cleaned,
		.rmap_one = page_mkclean_one,
		.invalid_vma = invalid_mkclean_vma,
	};

	BUG_ON(!PageLocked(page));

	if (!page_mapped(page))
		return 0;

	mapping = page_mapping(page);
	if (!mapping)
		return 0;

	rmap_walk(page, &rwc);

	return cleaned;
}
EXPORT_SYMBOL_GPL(page_mkclean);

/**
 * page_move_anon_rmap - move a page to our anon_vma
 * @page: the page to move to our anon_vma
 * @vma: the vma the page belongs to
 *
 * When a page belongs exclusively to one process after a COW event,
 * that page can be moved into the anon_vma that belongs to just that
 * process, so the rmap code will not search the parent or sibling
 * processes.
 */
void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	page = compound_head(page);

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_VMA(!anon_vma, vma);

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	/*
	 * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
	 * simultaneously, so a concurrent reader (e.g. page_referenced()'s
	 * PageAnon()) will not see one without the other.
	 */
	WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
}

/**
 * __page_set_anon_rmap - set up new anonymous rmap
 * @page: Page to add to rmap
 * @vma: VM area to add page to.
 * @address: User virtual address of the mapping
 * @exclusive: the page is exclusively owned by the current process
 */
static void __page_set_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int exclusive)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	BUG_ON(!anon_vma);

	if (PageAnon(page))
		return;

	/*
	 * If the page isn't exclusively mapped into this vma,
	 * we must use the _oldest_ possible anon_vma for the
	 * page mapping!
	 */
	if (!exclusive)
		anon_vma = anon_vma->root;

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;
	page->index = linear_page_index(vma, address);
}

/**
 * __page_check_anon_rmap - sanity check anonymous rmap addition
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 */
static void __page_check_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
#ifdef CONFIG_DEBUG_VM
	/*
	 * The page's anon-rmap details (mapping and index) are guaranteed to
	 * be set up correctly at this point.
	 *
	 * We have exclusion against page_add_anon_rmap because the caller
	 * always holds the page locked, except if called from page_dup_rmap,
	 * in which case the page is already known to be setup.
	 *
	 * We have exclusion against page_add_new_anon_rmap because those pages
	 * are initially only visible via the pagetables, and the pte is locked
	 * over the call to page_add_new_anon_rmap.
	 */
	BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
	BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
#endif
}

/**
 * page_add_anon_rmap - add pte mapping to an anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 * @compound: charge the page as compound or small page
 *
 * The caller needs to hold the pte lock, and the page must be locked in
 * the anon_vma case: to serialize mapping,index checking after setting,
 * and to ensure that PageAnon is not being upgraded racily to PageKsm
 * (but PageKsm is never downgraded to PageAnon).
 */
void page_add_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, bool compound)
{
	do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
}

/*
 * Special version of the above for do_swap_page, which often runs
 * into pages that are exclusively owned by the current process.
 * Everybody else should continue to use page_add_anon_rmap above.
 */
void do_page_add_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int flags)
{
	bool compound = flags & RMAP_COMPOUND;
	bool first;

	if (compound) {
		atomic_t *mapcount;
		VM_BUG_ON_PAGE(!PageLocked(page), page);
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
		mapcount = compound_mapcount_ptr(page);
		first = atomic_inc_and_test(mapcount);
	} else {
		first = atomic_inc_and_test(&page->_mapcount);
	}

	if (first) {
		int nr = compound ? hpage_nr_pages(page) : 1;
		/*
		 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
		 * these counters are not modified in interrupt context, and
		 * pte lock (a spinlock) is held, which implies preemption
		 * disabled.
		 */
		if (compound)
			__inc_node_page_state(page, NR_ANON_THPS);
		__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
	}
	if (unlikely(PageKsm(page)))
		return;

	VM_BUG_ON_PAGE(!PageLocked(page), page);

	/* address might be in next vma when migration races vma_adjust */
	if (first)
		__page_set_anon_rmap(page, vma, address,
				flags & RMAP_EXCLUSIVE);
	else
		__page_check_anon_rmap(page, vma, address);
}

/**
 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 * @compound: charge the page as compound or small page
 *
 * Same as page_add_anon_rmap but must only be called on *new* pages.
 * This means the inc-and-test can be bypassed.
 * Page does not have to be locked.
 */
void page_add_new_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, bool compound)
{
	int nr = compound ? hpage_nr_pages(page) : 1;

	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
	__SetPageSwapBacked(page);
	if (compound) {
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
		/* increment count (starts at -1) */
		atomic_set(compound_mapcount_ptr(page), 0);
		__inc_node_page_state(page, NR_ANON_THPS);
	} else {
		/* Anon THP always mapped first with PMD */
		VM_BUG_ON_PAGE(PageTransCompound(page), page);
		/* increment count (starts at -1) */
		atomic_set(&page->_mapcount, 0);
	}
	__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
	__page_set_anon_rmap(page, vma, address, 1);
}

/**
 * page_add_file_rmap - add pte mapping to a file page
 * @page: the page to add the mapping to
 * @compound: charge the page as compound or small page
 *
 * The caller needs to hold the pte lock.
 */
void page_add_file_rmap(struct page *page, bool compound)
{
	int i, nr = 1;

	VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
	lock_page_memcg(page);
	if (compound && PageTransHuge(page)) {
		for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
			if (atomic_inc_and_test(&page[i]._mapcount))
				nr++;
		}
		if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
			goto out;
		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
		__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
	} else {
		if (PageTransCompound(page) && page_mapping(page)) {
			VM_WARN_ON_ONCE(!PageLocked(page));

			SetPageDoubleMap(compound_head(page));
			if (PageMlocked(page))
				clear_page_mlock(compound_head(page));
		}
		if (!atomic_inc_and_test(&page->_mapcount))
			goto out;
	}
	__mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
out:
	unlock_page_memcg(page);
}

static void page_remove_file_rmap(struct page *page, bool compound)
{
	int i, nr = 1;

	VM_BUG_ON_PAGE(compound && !PageHead(page), page);
	lock_page_memcg(page);

	/* Hugepages are not counted in NR_FILE_MAPPED for now. */
	if (unlikely(PageHuge(page))) {
		/* hugetlb pages are always mapped with pmds */
		atomic_dec(compound_mapcount_ptr(page));
		goto out;
	}

	/* page still mapped by someone else? */
	if (compound && PageTransHuge(page)) {
		for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
			if (atomic_add_negative(-1, &page[i]._mapcount))
				nr++;
		}
		if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
			goto out;
		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
		__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
	} else {
		if (!atomic_add_negative(-1, &page->_mapcount))
			goto out;
	}

	/*
	 * We use the irq-unsafe __{inc|mod}_lruvec_page_state because
	 * these counters are not modified in interrupt context, and
	 * pte lock (a spinlock) is held, which implies preemption disabled.
	 */
	__mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);

	if (unlikely(PageMlocked(page)))
		clear_page_mlock(page);
out:
	unlock_page_memcg(page);
}

static void page_remove_anon_compound_rmap(struct page *page)
{
	int i, nr;

	if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
		return;

	/* Hugepages are not counted in NR_ANON_PAGES for now. */
	if (unlikely(PageHuge(page)))
		return;

	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
		return;

	__dec_node_page_state(page, NR_ANON_THPS);

	if (TestClearPageDoubleMap(page)) {
		/*
		 * Subpages can be mapped with PTEs too. Check how many of
		 * them are still mapped.
		 */
		for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
			if (atomic_add_negative(-1, &page[i]._mapcount))
				nr++;
		}
	} else {
		nr = HPAGE_PMD_NR;
	}

	if (unlikely(PageMlocked(page)))
		clear_page_mlock(page);

	if (nr) {
		__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
		deferred_split_huge_page(page);
	}
}

/**
 * page_remove_rmap - take down pte mapping from a page
 * @page: page to remove mapping from
 * @compound: uncharge the page as compound or small page
 *
 * The caller needs to hold the pte lock.
 */
void page_remove_rmap(struct page *page, bool compound)
{
	if (!PageAnon(page))
		return page_remove_file_rmap(page, compound);

	if (compound)
		return page_remove_anon_compound_rmap(page);

	/* page still mapped by someone else? */
	if (!atomic_add_negative(-1, &page->_mapcount))
		return;

	/*
	 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
	 * these counters are not modified in interrupt context, and
	 * pte lock (a spinlock) is held, which implies preemption disabled.
	 */
	__dec_node_page_state(page, NR_ANON_MAPPED);

	if (unlikely(PageMlocked(page)))
		clear_page_mlock(page);

	if (PageTransCompound(page))
		deferred_split_huge_page(compound_head(page));

	/*
	 * It would be tidy to reset the PageAnon mapping here,
	 * but that might overwrite a racing page_add_anon_rmap
	 * which increments mapcount after us but sets mapping
	 * before us: so leave the reset to free_hot_cold_page,
	 * and remember that it's only reliable while mapped.
	 * Leaving it set also helps swapoff to reinstate ptes
	 * faster for those pages still in swapcache.
	 */
}
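/*
 * A note on the mapcount convention used above: _mapcount and
 * compound_mapcount start at -1, so atomic_inc_and_test() fires on the
 * first mapping of a page and atomic_add_negative(-1, ...) fires on the
 * last unmapping; those are the transitions that drive the
 * NR_ANON_MAPPED/NR_FILE_MAPPED updates.
 */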
/*
 * @arg: enum ttu_flags will be passed to this argument
 */
static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
		     unsigned long address, void *arg)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
		.address = address,
	};
	pte_t pteval;
	struct page *subpage;
	bool ret = true;
	unsigned long start = address, end;
	enum ttu_flags flags = (enum ttu_flags)arg;

	/* munlock has nothing to gain from examining un-locked vmas */
	if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
		return true;

	if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
	    is_zone_device_page(page) && !is_device_private_page(page))
		return true;

	if (flags & TTU_SPLIT_HUGE_PMD) {
		split_huge_pmd_address(vma, address,
				flags & TTU_SPLIT_FREEZE, page);
	}

	/*
	 * We have to assume the worst case, i.e. pmd, for invalidation. Note
	 * that the page cannot be freed in this function, as the caller of
	 * try_to_unmap() must hold a reference on the page.
	 */
	end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
	mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);

	while (page_vma_mapped_walk(&pvmw)) {
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
		/* PMD-mapped THP migration entry */
		if (!pvmw.pte && (flags & TTU_MIGRATION)) {
			VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);

			if (!PageAnon(page))
				continue;

			set_pmd_migration_entry(&pvmw, page);
			continue;
		}
#endif

		/*
		 * If the page is mlock()d, we cannot swap it out.
		 * If it's recently referenced (perhaps page_referenced
		 * skipped over this mm) then we should reactivate it.
		 */
		if (!(flags & TTU_IGNORE_MLOCK)) {
			if (vma->vm_flags & VM_LOCKED) {
				/* PTE-mapped THP are never mlocked */
				if (!PageTransCompound(page)) {
					/*
					 * Holding pte lock, we do *not* need
					 * mmap_sem here
					 */
					mlock_vma_page(page);
				}
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
			if (flags & TTU_MUNLOCK)
				continue;
		}

		/* Unexpected PMD-mapped THP? */
		VM_BUG_ON_PAGE(!pvmw.pte, page);

		subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
		address = pvmw.address;

		if (IS_ENABLED(CONFIG_MIGRATION) &&
		    (flags & TTU_MIGRATION) &&
		    is_zone_device_page(page)) {
			swp_entry_t entry;
			pte_t swp_pte;

			pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);

			/*
			 * Store the pfn of the page in a special migration
			 * pte. do_swap_page() will wait until the migration
			 * pte is removed and then restart fault handling.
			 */
			entry = make_migration_entry(page, 0);
			swp_pte = swp_entry_to_pte(entry);
			if (pte_soft_dirty(pteval))
				swp_pte = pte_swp_mksoft_dirty(swp_pte);
			set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
			goto discard;
		}

		if (!(flags & TTU_IGNORE_ACCESS)) {
			if (ptep_clear_flush_young_notify(vma, address,
						pvmw.pte)) {
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
		}

		/* Nuke the page table entry. */
		flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
		if (should_defer_flush(mm, flags)) {
			/*
			 * We clear the PTE but do not flush so potentially
			 * a remote CPU could still be writing to the page.
			 * If the entry was previously clean then the
			 * architecture must guarantee that a clear->dirty
			 * transition on a cached TLB entry is written through
			 * and traps if the PTE is unmapped.
			 */
			pteval = ptep_get_and_clear(mm, address, pvmw.pte);

			set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
		} else {
			pteval = ptep_clear_flush(vma, address, pvmw.pte);
		}

		/* Move the dirty bit to the page. Now the pte is gone. */
		if (pte_dirty(pteval))
			set_page_dirty(page);

		/* Update high watermark before we lower rss */
		update_hiwater_rss(mm);

		if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
			if (PageHuge(page)) {
				int nr = 1 << compound_order(page);
				hugetlb_count_sub(nr, mm);
				set_huge_swap_pte_at(mm, address,
						     pvmw.pte, pteval,
						     vma_mmu_pagesize(vma));
			} else {
				dec_mm_counter(mm, mm_counter(page));
				set_pte_at(mm, address, pvmw.pte, pteval);
			}

		} else if (pte_unused(pteval)) {
			/*
			 * The guest indicated that the page content is of no
			 * interest anymore. Simply discard the pte, vmscan
			 * will take care of the rest.
			 */
			dec_mm_counter(mm, mm_counter(page));
		} else if (IS_ENABLED(CONFIG_MIGRATION) &&
				(flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
			swp_entry_t entry;
			pte_t swp_pte;
			/*
			 * Store the pfn of the page in a special migration
			 * pte. do_swap_page() will wait until the migration
			 * pte is removed and then restart fault handling.
			 */
			entry = make_migration_entry(subpage,
					pte_write(pteval));
			swp_pte = swp_entry_to_pte(entry);
			if (pte_soft_dirty(pteval))
				swp_pte = pte_swp_mksoft_dirty(swp_pte);
			set_pte_at(mm, address, pvmw.pte, swp_pte);
		} else if (PageAnon(page)) {
			swp_entry_t entry = { .val = page_private(subpage) };
			pte_t swp_pte;
			/*
			 * Store the swap location in the pte.
			 * See handle_pte_fault() ...
			 */
			if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
				WARN_ON_ONCE(1);
				ret = false;
				/* We have to invalidate as we cleared the pte */
				page_vma_mapped_walk_done(&pvmw);
				break;
			}

			/* MADV_FREE page check */
			if (!PageSwapBacked(page)) {
				if (!PageDirty(page)) {
					dec_mm_counter(mm, MM_ANONPAGES);
					goto discard;
				}

				/*
				 * If the page was redirtied, it cannot be
				 * discarded. Remap the page to the page table.
				 */
				set_pte_at(mm, address, pvmw.pte, pteval);
				SetPageSwapBacked(page);
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}

			if (swap_duplicate(entry) < 0) {
				set_pte_at(mm, address, pvmw.pte, pteval);
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
			if (list_empty(&mm->mmlist)) {
				spin_lock(&mmlist_lock);
				if (list_empty(&mm->mmlist))
					list_add(&mm->mmlist, &init_mm.mmlist);
				spin_unlock(&mmlist_lock);
			}
			dec_mm_counter(mm, MM_ANONPAGES);
			inc_mm_counter(mm, MM_SWAPENTS);
			swp_pte = swp_entry_to_pte(entry);
			if (pte_soft_dirty(pteval))
				swp_pte = pte_swp_mksoft_dirty(swp_pte);
			set_pte_at(mm, address, pvmw.pte, swp_pte);
		} else
			dec_mm_counter(mm, mm_counter_file(page));
discard:
		page_remove_rmap(subpage, PageHuge(page));
		put_page(page);
		mmu_notifier_invalidate_range(mm, address,
					      address + PAGE_SIZE);
	}

	mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);

	return ret;
}

bool is_vma_temporary_stack(struct vm_area_struct *vma)
{
	int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);

	if (!maybe_stack)
		return false;

	if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
						VM_STACK_INCOMPLETE_SETUP)
		return true;

	return false;
}

static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
{
	return is_vma_temporary_stack(vma);
}

static int page_mapcount_is_zero(struct page *page)
{
	return !total_mapcount(page);
}

/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 * @flags: action and flags
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path. Caller must hold the page lock.
 *
 * If unmap is successful, return true. Otherwise, false.
 */
bool try_to_unmap(struct page *page, enum ttu_flags flags)
{
	struct rmap_walk_control rwc = {
		.rmap_one = try_to_unmap_one,
		.arg = (void *)flags,
		.done = page_mapcount_is_zero,
		.anon_lock = page_lock_anon_vma_read,
	};

	/*
	 * During exec, a temporary VMA is set up and later moved.
	 * The VMA is moved under the anon_vma lock but not the
	 * page tables leading to a race where migration cannot
	 * find the migration ptes. Rather than increasing the
	 * locking requirements of exec(), migration skips
	 * temporary VMAs until after exec() completes.
	 */
	if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))
	    && !PageKsm(page) && PageAnon(page))
		rwc.invalid_vma = invalid_migration_vma;

	if (flags & TTU_RMAP_LOCKED)
		rmap_walk_locked(page, &rwc);
	else
		rmap_walk(page, &rwc);

	return !page_mapcount(page) ? true : false;
}

static int page_not_mapped(struct page *page)
{
	return !page_mapped(page);
};
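/*
 * Note on the return value convention, as implemented above: try_to_unmap()
 * reports success only once page_mapcount() has dropped to zero, so callers
 * such as reclaim or migration treat a false return as "the page is still
 * mapped somewhere, back off".
 */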
/**
 * try_to_munlock - try to munlock a page
 * @page: the page to be munlocked
 *
 * Called from munlock code. Checks all of the VMAs mapping the page
 * to make sure nobody else has this page mlocked. The page will be
 * returned with PG_mlocked cleared if no other vmas have it mlocked.
 */

void try_to_munlock(struct page *page)
{
	struct rmap_walk_control rwc = {
		.rmap_one = try_to_unmap_one,
		.arg = (void *)TTU_MUNLOCK,
		.done = page_not_mapped,
		.anon_lock = page_lock_anon_vma_read,

	};

	VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
	VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);

	rmap_walk(page, &rwc);
}

void __put_anon_vma(struct anon_vma *anon_vma)
{
	struct anon_vma *root = anon_vma->root;

	anon_vma_free(anon_vma);
	if (root != anon_vma && atomic_dec_and_test(&root->refcount))
		anon_vma_free(root);
}
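/*
 * The walkers below are driven by struct rmap_walk_control (summarised
 * here as a reading aid; see include/linux/rmap.h for the authoritative
 * definition):
 *   - rmap_one() is called for each vma that may map the page, and
 *     returns false to stop the walk early;
 *   - done(), when set, ends the walk once the caller's goal is reached
 *     (e.g. the page is fully unmapped);
 *   - invalid_vma(), when set, skips vmas the caller does not care about;
 *   - anon_lock(), when set, overrides how the anon_vma lock is taken.
 */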
static struct anon_vma *rmap_walk_anon_lock(struct page *page,
					struct rmap_walk_control *rwc)
{
	struct anon_vma *anon_vma;

	if (rwc->anon_lock)
		return rwc->anon_lock(page);

	/*
	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
	 * because that depends on page_mapped(); but not all its usages
	 * are holding mmap_sem. Users without mmap_sem are required to
	 * take a reference count to prevent the anon_vma disappearing.
	 */
	anon_vma = page_anon_vma(page);
	if (!anon_vma)
		return NULL;

	anon_vma_lock_read(anon_vma);
	return anon_vma;
}

/*
 * rmap_walk_anon - do something to anonymous page using the object-based
 * rmap method
 * @page: the page to be handled
 * @rwc: control variable according to each walk type
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the anon_vma struct it points to.
 *
 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
 * where the page was found will be held for write. So, we won't recheck
 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
 * VM_LOCKED.
 */
static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
		bool locked)
{
	struct anon_vma *anon_vma;
	pgoff_t pgoff_start, pgoff_end;
	struct anon_vma_chain *avc;

	if (locked) {
		anon_vma = page_anon_vma(page);
		/* anon_vma disappeared under us? */
		VM_BUG_ON_PAGE(!anon_vma, page);
	} else {
		anon_vma = rmap_walk_anon_lock(page, rwc);
	}
	if (!anon_vma)
		return;

	pgoff_start = page_to_pgoff(page);
	pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
			pgoff_start, pgoff_end) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long address = vma_address(page, vma);

		cond_resched();

		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
			continue;

		if (!rwc->rmap_one(page, vma, address, rwc->arg))
			break;
		if (rwc->done && rwc->done(page))
			break;
	}

	if (!locked)
		anon_vma_unlock_read(anon_vma);
}

/*
 * rmap_walk_file - do something to file page using the object-based rmap method
 * @page: the page to be handled
 * @rwc: control variable according to each walk type
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the address_space struct it points to.
 *
 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
 * where the page was found will be held for write. So, we won't recheck
 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
 * VM_LOCKED.
 */
static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
		bool locked)
{
	struct address_space *mapping = page_mapping(page);
	pgoff_t pgoff_start, pgoff_end;
	struct vm_area_struct *vma;

	/*
	 * The page lock not only makes sure that page->mapping cannot
	 * suddenly be NULLified by truncation, it makes sure that the
	 * structure at mapping cannot be freed and reused yet,
	 * so we can safely take mapping->i_mmap_rwsem.
	 */
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (!mapping)
		return;

	pgoff_start = page_to_pgoff(page);
	pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
	if (!locked)
		i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap,
			pgoff_start, pgoff_end) {
		unsigned long address = vma_address(page, vma);

		cond_resched();

		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
			continue;

		if (!rwc->rmap_one(page, vma, address, rwc->arg))
			goto done;
		if (rwc->done && rwc->done(page))
			goto done;
	}

done:
	if (!locked)
		i_mmap_unlock_read(mapping);
}

void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
{
	if (unlikely(PageKsm(page)))
		rmap_walk_ksm(page, rwc);
	else if (PageAnon(page))
		rmap_walk_anon(page, rwc, false);
	else
		rmap_walk_file(page, rwc, false);
}

/* Like rmap_walk, but caller holds relevant rmap lock */
void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
{
	/* no ksm support for now */
	VM_BUG_ON_PAGE(PageKsm(page), page);
	if (PageAnon(page))
		rmap_walk_anon(page, rwc, true);
	else
		rmap_walk_file(page, rwc, true);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * The following three functions are for anonymous (private mapped) hugepages.
 * Unlike common anonymous pages, anonymous hugepages have no accounting code
 * and no lru code, because we handle hugepages differently from common pages.
 */
static void __hugepage_set_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int exclusive)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	BUG_ON(!anon_vma);

	if (PageAnon(page))
		return;
	if (!exclusive)
		anon_vma = anon_vma->root;

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;
	page->index = linear_page_index(vma, address);
}

void hugepage_add_anon_rmap(struct page *page,
			    struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = vma->anon_vma;
	int first;

	BUG_ON(!PageLocked(page));
	BUG_ON(!anon_vma);
	/* address might be in next vma when migration races vma_adjust */
	first = atomic_inc_and_test(compound_mapcount_ptr(page));
	if (first)
		__hugepage_set_anon_rmap(page, vma, address, 0);
}

void hugepage_add_new_anon_rmap(struct page *page,
			struct vm_area_struct *vma, unsigned long address)
{
	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
	atomic_set(compound_mapcount_ptr(page), 0);
	__hugepage_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */