/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins 2003, 2004
 */

/*
 * Lock ordering in mm:
 *
 * inode->i_mutex	(while writing or truncating, not reading or faulting)
 *   mm->mmap_sem
 *     page->flags PG_locked (lock_page)
 *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
 *         mapping->i_mmap_rwsem
 *           anon_vma->rwsem
 *             mm->page_table_lock or pte_lock
 *               zone_lru_lock (in mark_page_accessed, isolate_lru_page)
 *               swap_lock (in swap_duplicate, swap_info_get)
 *                 mmlist_lock (in mmput, drain_mmlist and others)
 *                 mapping->private_lock (in __set_page_dirty_buffers)
 *                   mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
 *                     i_pages lock (widely used)
 *                 inode->i_lock (in set_page_dirty's __mark_inode_dirty)
 *                 bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
 *                   sb_lock (within inode_lock in fs/fs-writeback.c)
 *                   i_pages lock (widely used, in set_page_dirty,
 *                             in arch-dependent flush_dcache_mmap_lock,
 *                             within bdi.wb->list_lock in __sync_single_inode)
 *
 * anon_vma->rwsem,mapping->i_mutex      (memory_failure, collect_procs_anon)
 *   ->tasklist_lock
 *     pte map lock
 */

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/backing-dev.h>
#include <linux/page_idle.h>
#include <linux/memremap.h>

#include <asm/tlbflush.h>

#include <trace/events/tlb.h>

#include "internal.h"

static struct kmem_cache *anon_vma_cachep;
static struct kmem_cache *anon_vma_chain_cachep;

static inline struct anon_vma *anon_vma_alloc(void)
{
	struct anon_vma *anon_vma;

	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
	if (anon_vma) {
		atomic_set(&anon_vma->refcount, 1);
		anon_vma->degree = 1;	/* Reference for first vma */
		anon_vma->parent = anon_vma;
		/*
		 * Initialise the anon_vma root to point to itself. If called
		 * from fork, the root will be reset to the parent's anon_vma.
		 */
		anon_vma->root = anon_vma;
	}

	return anon_vma;
}

static inline void anon_vma_free(struct anon_vma *anon_vma)
{
	VM_BUG_ON(atomic_read(&anon_vma->refcount));

	/*
	 * Synchronize against page_lock_anon_vma_read() such that
	 * we can safely hold the lock without the anon_vma getting
	 * freed.
	 *
	 * Relies on the full mb implied by the atomic_dec_and_test() from
	 * put_anon_vma() against the acquire barrier implied by
	 * down_read_trylock() from page_lock_anon_vma_read().
	 * This orders:
	 *
	 * page_lock_anon_vma_read()	VS	put_anon_vma()
	 *   down_read_trylock()		  atomic_dec_and_test()
	 *   LOCK				  MB
	 *   atomic_read()			  rwsem_is_locked()
	 *
	 * LOCK should suffice since the actual taking of the lock must
	 * happen _before_ what follows.
	 */
	might_sleep();
	if (rwsem_is_locked(&anon_vma->root->rwsem)) {
		anon_vma_lock_write(anon_vma);
		anon_vma_unlock_write(anon_vma);
	}

	kmem_cache_free(anon_vma_cachep, anon_vma);
}

static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
{
	return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
}

static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
{
	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}

static void anon_vma_chain_link(struct vm_area_struct *vma,
				struct anon_vma_chain *avc,
				struct anon_vma *anon_vma)
{
	avc->vma = vma;
	avc->anon_vma = anon_vma;
	list_add(&avc->same_vma, &vma->anon_vma_chain);
	anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}

/**
 * __anon_vma_prepare - attach an anon_vma to a memory region
 * @vma: the memory region in question
 *
 * This makes sure the memory mapping described by 'vma' has
 * an 'anon_vma' attached to it, so that we can associate the
 * anonymous pages mapped into it with that anon_vma.
 *
 * The common case will be that we already have one, which
 * is handled inline by anon_vma_prepare(). But if not, we
 * either need to find an adjacent mapping that we can re-use
 * the anon_vma from (very common when the only reason for
 * splitting a vma has been mprotect()), or we allocate a new one.
 *
 * Anon-vma allocations are very subtle, because we may have
 * optimistically looked up an anon_vma in page_lock_anon_vma_read()
 * and that may actually touch the rwsem even in the newly
 * allocated vma (it depends on RCU to make sure that the
 * anon_vma isn't actually destroyed).
 *
 * As a result, we need to do proper anon_vma locking even
 * for the new allocation. At the same time, we do not want
 * to do any locking for the common case of already having
 * an anon_vma.
 *
 * This must be called with the mmap_sem held for reading.
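 *
 * The common case is handled by the inline wrapper anon_vma_prepare()
 * (include/linux/rmap.h), which is roughly:
 *
 *	if (likely(vma->anon_vma))
 *		return 0;
 *	return __anon_vma_prepare(vma);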
 */
int __anon_vma_prepare(struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;
	struct anon_vma *anon_vma, *allocated;
	struct anon_vma_chain *avc;

	might_sleep();

	avc = anon_vma_chain_alloc(GFP_KERNEL);
	if (!avc)
		goto out_enomem;

	anon_vma = find_mergeable_anon_vma(vma);
	allocated = NULL;
	if (!anon_vma) {
		anon_vma = anon_vma_alloc();
		if (unlikely(!anon_vma))
			goto out_enomem_free_avc;
		allocated = anon_vma;
	}

	anon_vma_lock_write(anon_vma);
	/* page_table_lock to protect against threads */
	spin_lock(&mm->page_table_lock);
	if (likely(!vma->anon_vma)) {
		vma->anon_vma = anon_vma;
		anon_vma_chain_link(vma, avc, anon_vma);
		/* vma reference or self-parent link for new root */
		anon_vma->degree++;
		allocated = NULL;
		avc = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	anon_vma_unlock_write(anon_vma);

	if (unlikely(allocated))
		put_anon_vma(allocated);
	if (unlikely(avc))
		anon_vma_chain_free(avc);

	return 0;

 out_enomem_free_avc:
	anon_vma_chain_free(avc);
 out_enomem:
	return -ENOMEM;
}

/*
 * This is a useful helper function for locking the anon_vma root as
 * we traverse the vma->anon_vma_chain, looping over anon_vma's that
 * have the same vma.
 *
 * Such anon_vma's should have the same root, so you'd expect to see
 * just a single down_write of the root rwsem for the whole traversal.
 */
static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
{
	struct anon_vma *new_root = anon_vma->root;
	if (new_root != root) {
		if (WARN_ON_ONCE(root))
			up_write(&root->rwsem);
		root = new_root;
		down_write(&root->rwsem);
	}
	return root;
}

static inline void unlock_anon_vma_root(struct anon_vma *root)
{
	if (root)
		up_write(&root->rwsem);
}

/*
 * Attach the anon_vmas from src to dst.
 * Returns 0 on success, -ENOMEM on failure.
 *
 * If dst->anon_vma is NULL this function tries to find and reuse an existing
 * anon_vma which has no vmas and only one child anon_vma. This prevents the
 * anon_vma hierarchy from degrading into an endless linear chain when a task
 * forks constantly. On the other hand, an anon_vma with more than one child
 * isn't reused even if it has no live vma, so the rmap walker has a good
 * chance of avoiding a scan of the whole hierarchy when it searches for where
 * a page is mapped.
 */
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
	struct anon_vma_chain *avc, *pavc;
	struct anon_vma *root = NULL;

	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma;

		avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
		if (unlikely(!avc)) {
			unlock_anon_vma_root(root);
			root = NULL;
			avc = anon_vma_chain_alloc(GFP_KERNEL);
			if (!avc)
				goto enomem_failure;
		}
		anon_vma = pavc->anon_vma;
		root = lock_anon_vma_root(root, anon_vma);
		anon_vma_chain_link(dst, avc, anon_vma);

		/*
		 * Reuse the existing anon_vma if its degree is lower than
		 * two, which means it has no vma and only one anon_vma child.
		 *
		 * Do not choose the parent anon_vma, otherwise the first
		 * child will always reuse it. The root anon_vma is never
		 * reused: it has a self-parent reference and at least one
		 * child.
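		 *
		 * (->degree counts the child anon_vmas and the VMAs which
		 * point at this anon_vma; see struct anon_vma in
		 * include/linux/rmap.h.)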
		 */
		if (!dst->anon_vma && anon_vma != src->anon_vma &&
				anon_vma->degree < 2)
			dst->anon_vma = anon_vma;
	}
	if (dst->anon_vma)
		dst->anon_vma->degree++;
	unlock_anon_vma_root(root);
	return 0;

 enomem_failure:
	/*
	 * dst->anon_vma is dropped here; otherwise its degree can be
	 * incorrectly decremented in unlink_anon_vmas().
	 * We can safely do this because callers of anon_vma_clone() don't care
	 * about dst->anon_vma if anon_vma_clone() failed.
	 */
	dst->anon_vma = NULL;
	unlink_anon_vmas(dst);
	return -ENOMEM;
}

/*
 * Attach vma to its own anon_vma, as well as to the anon_vmas that
 * the corresponding VMA in the parent process is attached to.
 * Returns 0 on success, non-zero on failure.
 */
int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
	struct anon_vma_chain *avc;
	struct anon_vma *anon_vma;
	int error;

	/* Don't bother if the parent process has no anon_vma here. */
	if (!pvma->anon_vma)
		return 0;

	/* Drop inherited anon_vma, we'll reuse existing or allocate new. */
	vma->anon_vma = NULL;

	/*
	 * First, attach the new VMA to the parent VMA's anon_vmas,
	 * so rmap can find non-COWed pages in child processes.
	 */
	error = anon_vma_clone(vma, pvma);
	if (error)
		return error;

	/* An existing anon_vma has been reused, all done then. */
	if (vma->anon_vma)
		return 0;

	/* Then add our own anon_vma. */
	anon_vma = anon_vma_alloc();
	if (!anon_vma)
		goto out_error;
	avc = anon_vma_chain_alloc(GFP_KERNEL);
	if (!avc)
		goto out_error_free_anon_vma;

	/*
	 * The root anon_vma's rwsem is the lock actually used when we
	 * lock any of the anon_vmas in this anon_vma tree.
	 */
	anon_vma->root = pvma->anon_vma->root;
	anon_vma->parent = pvma->anon_vma;
	/*
	 * With refcounts, an anon_vma can stay around longer than the
	 * process it belongs to. The root anon_vma needs to be pinned until
	 * this anon_vma is freed, because the lock lives in the root.
	 */
	get_anon_vma(anon_vma->root);
	/* Mark this anon_vma as the one where our new (COWed) pages go. */
	vma->anon_vma = anon_vma;
	anon_vma_lock_write(anon_vma);
	anon_vma_chain_link(vma, avc, anon_vma);
	anon_vma->parent->degree++;
	anon_vma_unlock_write(anon_vma);

	return 0;

 out_error_free_anon_vma:
	put_anon_vma(anon_vma);
 out_error:
	unlink_anon_vmas(vma);
	return -ENOMEM;
}

void unlink_anon_vmas(struct vm_area_struct *vma)
{
	struct anon_vma_chain *avc, *next;
	struct anon_vma *root = NULL;

	/*
	 * Unlink each anon_vma chained to the VMA. This list is ordered
	 * from newest to oldest, ensuring the root anon_vma gets freed last.
	 */
	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma = avc->anon_vma;

		root = lock_anon_vma_root(root, anon_vma);
		anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);

		/*
		 * Leave empty anon_vmas on the list - we'll need
		 * to free them outside the lock.
		 */
		if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
			anon_vma->parent->degree--;
			continue;
		}

		list_del(&avc->same_vma);
		anon_vma_chain_free(avc);
	}
	if (vma->anon_vma)
		vma->anon_vma->degree--;
	unlock_anon_vma_root(root);

	/*
	 * Iterate the list once more; it now only contains empty and unlinked
	 * anon_vmas, so destroy them. Could not do this before because
	 * __put_anon_vma() needs to write-acquire the anon_vma->root->rwsem.
	 */
	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma = avc->anon_vma;

		VM_WARN_ON(anon_vma->degree);
		put_anon_vma(anon_vma);

		list_del(&avc->same_vma);
		anon_vma_chain_free(avc);
	}
}

static void anon_vma_ctor(void *data)
{
	struct anon_vma *anon_vma = data;

	init_rwsem(&anon_vma->rwsem);
	atomic_set(&anon_vma->refcount, 0);
	anon_vma->rb_root = RB_ROOT_CACHED;
}

void __init anon_vma_init(void)
{
	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
			0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
			anon_vma_ctor);
	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
			SLAB_PANIC|SLAB_ACCOUNT);
}

/*
 * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
 *
 * Since there is no serialization whatsoever against page_remove_rmap(),
 * the best this function can do is return a locked anon_vma that might
 * have been relevant to this page.
 *
 * The page might have been remapped to a different anon_vma or the anon_vma
 * returned may already be freed (and even reused).
 *
 * In case it was remapped to a different anon_vma, the new anon_vma will be a
 * child of the old anon_vma, and the anon_vma lifetime rules will therefore
 * ensure that any anon_vma obtained from the page will still be valid for as
 * long as we observe page_mapped() [ hence all those page_mapped() tests ].
 *
 * All users of this function must be very careful when walking the anon_vma
 * chain and verify that the page in question is indeed mapped in it
 * [ something equivalent to page_mapped_in_vma() ].
 *
 * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
 * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
 * if there is a mapcount, we can dereference the anon_vma after observing
 * those.
 */
struct anon_vma *page_get_anon_vma(struct page *page)
{
	struct anon_vma *anon_vma = NULL;
	unsigned long anon_mapping;

	rcu_read_lock();
	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		goto out;
	if (!page_mapped(page))
		goto out;

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
		anon_vma = NULL;
		goto out;
	}

	/*
	 * If this page is still mapped, then its anon_vma cannot have been
	 * freed. But if it has been unmapped, we have no security against the
	 * anon_vma structure being freed and reused (for another anon_vma:
	 * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
	 * above cannot corrupt).
	 */
	if (!page_mapped(page)) {
		rcu_read_unlock();
		put_anon_vma(anon_vma);
		return NULL;
	}
out:
	rcu_read_unlock();

	return anon_vma;
}

/*
 * Similar to page_get_anon_vma() except it locks the anon_vma.
 *
 * It's a little more complex as it tries to keep the fast path to a single
 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
 * reference like with page_get_anon_vma() and then block on the rwsem.
 */
struct anon_vma *page_lock_anon_vma_read(struct page *page)
{
	struct anon_vma *anon_vma = NULL;
	struct anon_vma *root_anon_vma;
	unsigned long anon_mapping;

	rcu_read_lock();
	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		goto out;
	if (!page_mapped(page))
		goto out;

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	root_anon_vma = READ_ONCE(anon_vma->root);
	if (down_read_trylock(&root_anon_vma->rwsem)) {
		/*
		 * If the page is still mapped, then this anon_vma is still
		 * its anon_vma, and holding the rwsem ensures that it will
		 * not go away, see anon_vma_free().
		 */
		if (!page_mapped(page)) {
			up_read(&root_anon_vma->rwsem);
			anon_vma = NULL;
		}
		goto out;
	}

	/* trylock failed, we have to sleep */
	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
		anon_vma = NULL;
		goto out;
	}

	if (!page_mapped(page)) {
		rcu_read_unlock();
		put_anon_vma(anon_vma);
		return NULL;
	}

	/* we pinned the anon_vma, it's safe to sleep */
	rcu_read_unlock();
	anon_vma_lock_read(anon_vma);

	if (atomic_dec_and_test(&anon_vma->refcount)) {
		/*
		 * Oops, we held the last refcount, release the lock
		 * and bail -- can't simply use put_anon_vma() because
		 * we'll deadlock on the anon_vma_lock_write() recursion.
		 */
		anon_vma_unlock_read(anon_vma);
		__put_anon_vma(anon_vma);
		anon_vma = NULL;
	}

	return anon_vma;

out:
	rcu_read_unlock();
	return anon_vma;
}

void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
{
	anon_vma_unlock_read(anon_vma);
}

#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
 * Flush TLB entries for recently unmapped pages from remote CPUs. It is
 * important, if a PTE was dirty when it was unmapped, that it's flushed
 * before any IO is initiated on the page to prevent lost writes. Similarly,
 * it must be flushed before freeing to prevent data leakage.
 */
void try_to_unmap_flush(void)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

	if (!tlb_ubc->flush_required)
		return;

	arch_tlbbatch_flush(&tlb_ubc->arch);
	tlb_ubc->flush_required = false;
	tlb_ubc->writable = false;
}

/* Flush iff there are potentially writable TLB entries that can race with IO */
void try_to_unmap_flush_dirty(void)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

	if (tlb_ubc->writable)
		try_to_unmap_flush();
}

static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

	arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
	tlb_ubc->flush_required = true;

	/*
	 * Ensure the compiler does not re-order the setting of
	 * tlb_flush_batched before the PTE is cleared.
	 */
	barrier();
	mm->tlb_flush_batched = true;

	/*
	 * If the PTE was dirty then it's best to assume it's writable. The
	 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
	 * before the page is queued for IO.
	 */
	if (writable)
		tlb_ubc->writable = true;
}

/*
 * Returns true if the TLB flush should be deferred to the end of a batch of
 * unmap operations to reduce IPIs.
 */
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
	bool should_defer = false;

	if (!(flags & TTU_BATCH_FLUSH))
		return false;

	/* If remote CPUs need to be flushed then defer the flush to batch it */
	if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
		should_defer = true;
	put_cpu();

	return should_defer;
}

/*
 * Reclaim unmaps pages under the PTL but does not flush the TLB prior to
 * releasing the PTL if TLB flushes are batched. It's possible for a parallel
 * operation such as mprotect or munmap to race between reclaim unmapping
 * the page and flushing the page. If this race occurs, it potentially allows
 * access to data via a stale TLB entry. Tracking all mm's that have TLB
 * batching in flight would be expensive during reclaim, so instead track
 * whether TLB batching occurred in the past and if so then do a flush here
 * if required. This will cost one additional flush per reclaim cycle, paid
 * by the first operation at risk such as mprotect and munmap.
 *
 * This must be called under the PTL so that an access to tlb_flush_batched
 * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
 * via the PTL.
 */
void flush_tlb_batched_pending(struct mm_struct *mm)
{
	if (mm->tlb_flush_batched) {
		flush_tlb_mm(mm);

		/*
		 * Do not allow the compiler to re-order the clearing of
		 * tlb_flush_batched before the tlb is flushed.
		 */
		barrier();
		mm->tlb_flush_batched = false;
	}
}
#else
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
}

static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
	return false;
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */

/*
 * At what user virtual address is page expected in vma?
 * Caller should check the page is actually part of the vma.
 */
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
	unsigned long address;
	if (PageAnon(page)) {
		struct anon_vma *page__anon_vma = page_anon_vma(page);
		/*
		 * Note: swapoff's unuse_vma() is more efficient with this
		 * check, and needs it to match anon_vma when KSM is active.
		 */
		if (!vma->anon_vma || !page__anon_vma ||
		    vma->anon_vma->root != page__anon_vma->root)
			return -EFAULT;
	} else if (page->mapping) {
		if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
			return -EFAULT;
	} else
		return -EFAULT;
	address = __vma_address(page, vma);
	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
		return -EFAULT;
	return address;
}

pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd = NULL;
	pmd_t pmde;

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out;

	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		goto out;

	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	/*
	 * Some THP functions use the sequence pmdp_huge_clear_flush(),
	 * set_pmd_at() without holding the anon_vma lock for write. So when
	 * looking for a genuine pmde (in which to find a pte), test present
	 * and !THP together.
	 */
	pmde = *pmd;
	barrier();
	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
		pmd = NULL;
out:
	return pmd;
}

struct page_referenced_arg {
	int mapcount;
	int referenced;
	unsigned long vm_flags;
	struct mem_cgroup *memcg;
};
/*
 * arg: page_referenced_arg will be passed
 */
static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
			unsigned long address, void *arg)
{
	struct page_referenced_arg *pra = arg;
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
		.address = address,
	};
	int referenced = 0;

	while (page_vma_mapped_walk(&pvmw)) {
		address = pvmw.address;

		if (vma->vm_flags & VM_LOCKED) {
			page_vma_mapped_walk_done(&pvmw);
			pra->vm_flags |= VM_LOCKED;
			return false; /* To break the loop */
		}

		if (pvmw.pte) {
			if (ptep_clear_flush_young_notify(vma, address,
						pvmw.pte)) {
				/*
				 * Don't treat a reference through
				 * a sequentially read mapping as such.
				 * If the page has been used in another mapping,
				 * we will catch it; if this other mapping is
				 * already gone, the unmap path will have set
				 * PG_referenced or activated the page.
				 */
				if (likely(!(vma->vm_flags & VM_SEQ_READ)))
					referenced++;
			}
		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
			if (pmdp_clear_flush_young_notify(vma, address,
						pvmw.pmd))
				referenced++;
		} else {
			/* unexpected pmd-mapped page?
			 */
			WARN_ON_ONCE(1);
		}

		pra->mapcount--;
	}

	if (referenced)
		clear_page_idle(page);
	if (test_and_clear_page_young(page))
		referenced++;

	if (referenced) {
		pra->referenced++;
		pra->vm_flags |= vma->vm_flags;
	}

	if (!pra->mapcount)
		return false; /* To break the loop */

	return true;
}

static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
{
	struct page_referenced_arg *pra = arg;
	struct mem_cgroup *memcg = pra->memcg;

	if (!mm_match_cgroup(vma->vm_mm, memcg))
		return true;

	return false;
}

/**
 * page_referenced - test if the page was referenced
 * @page: the page to test
 * @is_locked: caller holds lock on the page
 * @memcg: target memory cgroup
 * @vm_flags: collect the vm_flags of the vmas which actually referenced the page
 *
 * Quick test_and_clear_referenced for all mappings to a page,
 * returns the number of ptes which referenced the page.
 */
int page_referenced(struct page *page,
		    int is_locked,
		    struct mem_cgroup *memcg,
		    unsigned long *vm_flags)
{
	int we_locked = 0;
	struct page_referenced_arg pra = {
		.mapcount = total_mapcount(page),
		.memcg = memcg,
	};
	struct rmap_walk_control rwc = {
		.rmap_one = page_referenced_one,
		.arg = (void *)&pra,
		.anon_lock = page_lock_anon_vma_read,
	};

	*vm_flags = 0;
	if (!page_mapped(page))
		return 0;

	if (!page_rmapping(page))
		return 0;

	if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
		we_locked = trylock_page(page);
		if (!we_locked)
			return 1;
	}

	/*
	 * If we are reclaiming on behalf of a cgroup, skip
	 * counting references from vmas that belong to other cgroups.
	 */
	if (memcg) {
		rwc.invalid_vma = invalid_page_referenced_vma;
	}

	rmap_walk(page, &rwc);
	*vm_flags = pra.vm_flags;

	if (we_locked)
		unlock_page(page);

	return pra.referenced;
}

static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
			    unsigned long address, void *arg)
{
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
		.address = address,
		.flags = PVMW_SYNC,
	};
	unsigned long start = address, end;
	int *cleaned = arg;

	/*
	 * We have to assume the worst case, i.e. pmd, for invalidation.
	 * Note that the page cannot be freed from this function.
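	 *
	 * The invalidation range below therefore covers the whole compound
	 * page, clamped to the end of the vma.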
	 */
	end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
	mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);

	while (page_vma_mapped_walk(&pvmw)) {
		unsigned long cstart;
		int ret = 0;

		cstart = address = pvmw.address;
		if (pvmw.pte) {
			pte_t entry;
			pte_t *pte = pvmw.pte;

			if (!pte_dirty(*pte) && !pte_write(*pte))
				continue;

			flush_cache_page(vma, address, pte_pfn(*pte));
			entry = ptep_clear_flush(vma, address, pte);
			entry = pte_wrprotect(entry);
			entry = pte_mkclean(entry);
			set_pte_at(vma->vm_mm, address, pte, entry);
			ret = 1;
		} else {
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
			pmd_t *pmd = pvmw.pmd;
			pmd_t entry;

			if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
				continue;

			flush_cache_page(vma, address, page_to_pfn(page));
			entry = pmdp_huge_clear_flush(vma, address, pmd);
			entry = pmd_wrprotect(entry);
			entry = pmd_mkclean(entry);
			set_pmd_at(vma->vm_mm, address, pmd, entry);
			cstart &= PMD_MASK;
			ret = 1;
#else
			/* unexpected pmd-mapped page? */
			WARN_ON_ONCE(1);
#endif
		}

		/*
		 * No need to call mmu_notifier_invalidate_range() as we are
		 * downgrading the page table protection, not changing it to
		 * point to a new page.
		 *
		 * See Documentation/vm/mmu_notifier.txt
		 */
		if (ret)
			(*cleaned)++;
	}

	mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);

	return true;
}

static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
{
	if (vma->vm_flags & VM_SHARED)
		return false;

	return true;
}

int page_mkclean(struct page *page)
{
	int cleaned = 0;
	struct address_space *mapping;
	struct rmap_walk_control rwc = {
		.arg = (void *)&cleaned,
		.rmap_one = page_mkclean_one,
		.invalid_vma = invalid_mkclean_vma,
	};

	BUG_ON(!PageLocked(page));

	if (!page_mapped(page))
		return 0;

	mapping = page_mapping(page);
	if (!mapping)
		return 0;

	rmap_walk(page, &rwc);

	return cleaned;
}
EXPORT_SYMBOL_GPL(page_mkclean);

/**
 * page_move_anon_rmap - move a page to our anon_vma
 * @page:	the page to move to our anon_vma
 * @vma:	the vma the page belongs to
 *
 * When a page belongs exclusively to one process after a COW event,
 * that page can be moved into the anon_vma that belongs to just that
 * process, so the rmap code will not search the parent or sibling
 * processes.
 */
void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	page = compound_head(page);

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_VMA(!anon_vma, vma);

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	/*
	 * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
	 * simultaneously, so a concurrent reader (eg page_referenced()'s
	 * PageAnon()) will not see one without the other.
	 */
	WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
}

/**
 * __page_set_anon_rmap - set up new anonymous rmap
 * @page:	Page to add to rmap
 * @vma:	VM area to add page to.
 * @address:	User virtual address of the mapping
 * @exclusive:	the page is exclusively owned by the current process
 */
static void __page_set_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int exclusive)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	BUG_ON(!anon_vma);

	if (PageAnon(page))
		return;

	/*
	 * If the page isn't exclusively mapped into this vma,
	 * we must use the _oldest_ possible anon_vma for the
	 * page mapping!
	 */
	if (!exclusive)
		anon_vma = anon_vma->root;

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;
	page->index = linear_page_index(vma, address);
}

/**
 * __page_check_anon_rmap - sanity check anonymous rmap addition
 * @page:	the page to add the mapping to
 * @vma:	the vm area in which the mapping is added
 * @address:	the user virtual address mapped
 */
static void __page_check_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
#ifdef CONFIG_DEBUG_VM
	/*
	 * The page's anon-rmap details (mapping and index) are guaranteed to
	 * be set up correctly at this point.
	 *
	 * We have exclusion against page_add_anon_rmap because the caller
	 * always holds the page locked, except if called from page_dup_rmap,
	 * in which case the page is already known to be set up.
	 *
	 * We have exclusion against page_add_new_anon_rmap because those pages
	 * are initially only visible via the pagetables, and the pte is locked
	 * over the call to page_add_new_anon_rmap.
	 */
	BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
	BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
#endif
}

/**
 * page_add_anon_rmap - add pte mapping to an anonymous page
 * @page:	the page to add the mapping to
 * @vma:	the vm area in which the mapping is added
 * @address:	the user virtual address mapped
 * @compound:	charge the page as compound or small page
 *
 * The caller needs to hold the pte lock, and the page must be locked in
 * the anon_vma case: to serialize mapping,index checking after setting,
 * and to ensure that PageAnon is not being upgraded racily to PageKsm
 * (but PageKsm is never downgraded to PageAnon).
 */
void page_add_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, bool compound)
{
	do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
}

/*
 * Special version of the above for do_swap_page, which often runs
 * into pages that are exclusively owned by the current process.
 * Everybody else should continue to use page_add_anon_rmap above.
 */
void do_page_add_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int flags)
{
	bool compound = flags & RMAP_COMPOUND;
	bool first;

	if (compound) {
		atomic_t *mapcount;
		VM_BUG_ON_PAGE(!PageLocked(page), page);
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
		mapcount = compound_mapcount_ptr(page);
		first = atomic_inc_and_test(mapcount);
	} else {
		first = atomic_inc_and_test(&page->_mapcount);
	}

	if (first) {
		int nr = compound ? hpage_nr_pages(page) : 1;
		/*
		 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
		 * these counters are not modified in interrupt context, and
		 * pte lock (a spinlock) is held, which implies preemption
		 * disabled.
		 */
		if (compound)
			__inc_node_page_state(page, NR_ANON_THPS);
		__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
	}
	if (unlikely(PageKsm(page)))
		return;

	VM_BUG_ON_PAGE(!PageLocked(page), page);

	/* address might be in next vma when migration races vma_adjust */
	if (first)
		__page_set_anon_rmap(page, vma, address,
				flags & RMAP_EXCLUSIVE);
	else
		__page_check_anon_rmap(page, vma, address);
}

/**
 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
 * @page:	the page to add the mapping to
 * @vma:	the vm area in which the mapping is added
 * @address:	the user virtual address mapped
 * @compound:	charge the page as compound or small page
 *
 * Same as page_add_anon_rmap but must only be called on *new* pages.
 * This means the inc-and-test can be bypassed.
 * Page does not have to be locked.
 */
void page_add_new_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, bool compound)
{
	int nr = compound ? hpage_nr_pages(page) : 1;

	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
	__SetPageSwapBacked(page);
	if (compound) {
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
		/* increment count (starts at -1) */
		atomic_set(compound_mapcount_ptr(page), 0);
		__inc_node_page_state(page, NR_ANON_THPS);
	} else {
		/* Anon THP always mapped first with PMD */
		VM_BUG_ON_PAGE(PageTransCompound(page), page);
		/* increment count (starts at -1) */
		atomic_set(&page->_mapcount, 0);
	}
	__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
	__page_set_anon_rmap(page, vma, address, 1);
}

/**
 * page_add_file_rmap - add pte mapping to a file page
 * @page:	the page to add the mapping to
 * @compound:	charge the page as compound or small page
 *
 * The caller needs to hold the pte lock.
 */
void page_add_file_rmap(struct page *page, bool compound)
{
	int i, nr = 1;

	VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
	lock_page_memcg(page);
	if (compound && PageTransHuge(page)) {
		for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
			if (atomic_inc_and_test(&page[i]._mapcount))
				nr++;
		}
		if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
			goto out;
		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
		__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
	} else {
		if (PageTransCompound(page) && page_mapping(page)) {
			VM_WARN_ON_ONCE(!PageLocked(page));

			SetPageDoubleMap(compound_head(page));
			if (PageMlocked(page))
				clear_page_mlock(compound_head(page));
		}
		if (!atomic_inc_and_test(&page->_mapcount))
			goto out;
	}
	__mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
out:
	unlock_page_memcg(page);
}

static void page_remove_file_rmap(struct page *page, bool compound)
{
	int i, nr = 1;

	VM_BUG_ON_PAGE(compound && !PageHead(page), page);
	lock_page_memcg(page);

	/* Hugepages are not counted in NR_FILE_MAPPED for now.
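	 * Only the compound mapcount is adjusted for them below.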
	 */
	if (unlikely(PageHuge(page))) {
		/* hugetlb pages are always mapped with pmds */
		atomic_dec(compound_mapcount_ptr(page));
		goto out;
	}

	/* page still mapped by someone else? */
	if (compound && PageTransHuge(page)) {
		for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
			if (atomic_add_negative(-1, &page[i]._mapcount))
				nr++;
		}
		if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
			goto out;
		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
		__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
	} else {
		if (!atomic_add_negative(-1, &page->_mapcount))
			goto out;
	}

	/*
	 * We use the irq-unsafe __{inc|mod}_lruvec_page_state because
	 * these counters are not modified in interrupt context, and
	 * pte lock (a spinlock) is held, which implies preemption disabled.
	 */
	__mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);

	if (unlikely(PageMlocked(page)))
		clear_page_mlock(page);
out:
	unlock_page_memcg(page);
}

static void page_remove_anon_compound_rmap(struct page *page)
{
	int i, nr;

	if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
		return;

	/* Hugepages are not counted in NR_ANON_PAGES for now. */
	if (unlikely(PageHuge(page)))
		return;

	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
		return;

	__dec_node_page_state(page, NR_ANON_THPS);

	if (TestClearPageDoubleMap(page)) {
		/*
		 * Subpages can be mapped with PTEs too. Check how many of
		 * them are still mapped.
		 */
		for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
			if (atomic_add_negative(-1, &page[i]._mapcount))
				nr++;
		}
	} else {
		nr = HPAGE_PMD_NR;
	}

	if (unlikely(PageMlocked(page)))
		clear_page_mlock(page);

	if (nr) {
		__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
		deferred_split_huge_page(page);
	}
}

/**
 * page_remove_rmap - take down pte mapping from a page
 * @page:	page to remove mapping from
 * @compound:	uncharge the page as compound or small page
 *
 * The caller needs to hold the pte lock.
 */
void page_remove_rmap(struct page *page, bool compound)
{
	if (!PageAnon(page))
		return page_remove_file_rmap(page, compound);

	if (compound)
		return page_remove_anon_compound_rmap(page);

	/* page still mapped by someone else? */
	if (!atomic_add_negative(-1, &page->_mapcount))
		return;

	/*
	 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
	 * these counters are not modified in interrupt context, and
	 * pte lock (a spinlock) is held, which implies preemption disabled.
	 */
	__dec_node_page_state(page, NR_ANON_MAPPED);

	if (unlikely(PageMlocked(page)))
		clear_page_mlock(page);

	if (PageTransCompound(page))
		deferred_split_huge_page(compound_head(page));

	/*
	 * It would be tidy to reset the PageAnon mapping here,
	 * but that might overwrite a racing page_add_anon_rmap
	 * which increments mapcount after us but sets mapping
	 * before us: so leave the reset to free_unref_page,
	 * and remember that it's only reliable while mapped.
	 * Leaving it set also helps swapoff to reinstate ptes
	 * faster for those pages still in swapcache.
	 */
}

/*
 * @arg: enum ttu_flags will be passed to this argument
 */
static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
		     unsigned long address, void *arg)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
		.address = address,
	};
	pte_t pteval;
	struct page *subpage;
	bool ret = true;
	unsigned long start = address, end;
	enum ttu_flags flags = (enum ttu_flags)arg;

	/* munlock has nothing to gain from examining un-locked vmas */
	if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
		return true;

	if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
	    is_zone_device_page(page) && !is_device_private_page(page))
		return true;

	if (flags & TTU_SPLIT_HUGE_PMD) {
		split_huge_pmd_address(vma, address,
				flags & TTU_SPLIT_FREEZE, page);
	}

	/*
	 * We have to assume the worst case, i.e. pmd, for invalidation.
	 * Note that the page cannot be freed in this function, as the
	 * caller of try_to_unmap() must hold a reference on the page.
	 */
	end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
	mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);

	while (page_vma_mapped_walk(&pvmw)) {
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
		/* PMD-mapped THP migration entry */
		if (!pvmw.pte && (flags & TTU_MIGRATION)) {
			VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);

			if (!PageAnon(page))
				continue;

			set_pmd_migration_entry(&pvmw, page);
			continue;
		}
#endif

		/*
		 * If the page is mlock()d, we cannot swap it out.
		 * If it's recently referenced (perhaps page_referenced
		 * skipped over this mm) then we should reactivate it.
		 */
		if (!(flags & TTU_IGNORE_MLOCK)) {
			if (vma->vm_flags & VM_LOCKED) {
				/* PTE-mapped THP are never mlocked */
				if (!PageTransCompound(page)) {
					/*
					 * Holding pte lock, we do *not* need
					 * mmap_sem here
					 */
					mlock_vma_page(page);
				}
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
			if (flags & TTU_MUNLOCK)
				continue;
		}

		/* Unexpected PMD-mapped THP? */
		VM_BUG_ON_PAGE(!pvmw.pte, page);

		subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
		address = pvmw.address;

		if (IS_ENABLED(CONFIG_MIGRATION) &&
		    (flags & TTU_MIGRATION) &&
		    is_zone_device_page(page)) {
			swp_entry_t entry;
			pte_t swp_pte;

			pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);

			/*
			 * Store the pfn of the page in a special migration
			 * pte. do_swap_page() will wait until the migration
			 * pte is removed and then restart fault handling.
			 */
			entry = make_migration_entry(page, 0);
			swp_pte = swp_entry_to_pte(entry);
			if (pte_soft_dirty(pteval))
				swp_pte = pte_swp_mksoft_dirty(swp_pte);
			set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
			/*
			 * No need to invalidate here; it will synchronize
			 * against the special swap migration pte.
			 */
			goto discard;
		}

		if (!(flags & TTU_IGNORE_ACCESS)) {
			if (ptep_clear_flush_young_notify(vma, address,
						pvmw.pte)) {
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
		}

		/* Nuke the page table entry.
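		 * If batching was requested (TTU_BATCH_FLUSH) and remote CPUs
		 * would need an IPI, the TLB flush is deferred and performed
		 * later via try_to_unmap_flush(); otherwise ptep_clear_flush()
		 * flushes right here.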
		 */
		flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
		if (should_defer_flush(mm, flags)) {
			/*
			 * We clear the PTE but do not flush so potentially
			 * a remote CPU could still be writing to the page.
			 * If the entry was previously clean then the
			 * architecture must guarantee that a clear->dirty
			 * transition on a cached TLB entry is written through
			 * and traps if the PTE is unmapped.
			 */
			pteval = ptep_get_and_clear(mm, address, pvmw.pte);

			set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
		} else {
			pteval = ptep_clear_flush(vma, address, pvmw.pte);
		}

		/* Move the dirty bit to the page. Now the pte is gone. */
		if (pte_dirty(pteval))
			set_page_dirty(page);

		/* Update high watermark before we lower rss */
		update_hiwater_rss(mm);

		if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
			if (PageHuge(page)) {
				int nr = 1 << compound_order(page);
				hugetlb_count_sub(nr, mm);
				set_huge_swap_pte_at(mm, address,
						     pvmw.pte, pteval,
						     vma_mmu_pagesize(vma));
			} else {
				dec_mm_counter(mm, mm_counter(page));
				set_pte_at(mm, address, pvmw.pte, pteval);
			}

		} else if (pte_unused(pteval)) {
			/*
			 * The guest indicated that the page content is of no
			 * interest anymore. Simply discard the pte, vmscan
			 * will take care of the rest.
			 */
			dec_mm_counter(mm, mm_counter(page));
			/* We have to invalidate as we cleared the pte */
			mmu_notifier_invalidate_range(mm, address,
						      address + PAGE_SIZE);
		} else if (IS_ENABLED(CONFIG_MIGRATION) &&
				(flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
			swp_entry_t entry;
			pte_t swp_pte;

			if (arch_unmap_one(mm, vma, address, pteval) < 0) {
				set_pte_at(mm, address, pvmw.pte, pteval);
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}

			/*
			 * Store the pfn of the page in a special migration
			 * pte. do_swap_page() will wait until the migration
			 * pte is removed and then restart fault handling.
			 */
			entry = make_migration_entry(subpage,
					pte_write(pteval));
			swp_pte = swp_entry_to_pte(entry);
			if (pte_soft_dirty(pteval))
				swp_pte = pte_swp_mksoft_dirty(swp_pte);
			set_pte_at(mm, address, pvmw.pte, swp_pte);
			/*
			 * No need to invalidate here; it will synchronize
			 * against the special swap migration pte.
			 */
		} else if (PageAnon(page)) {
			swp_entry_t entry = { .val = page_private(subpage) };
			pte_t swp_pte;
			/*
			 * Store the swap location in the pte.
			 * See handle_pte_fault() ...
			 */
			if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
				WARN_ON_ONCE(1);
				ret = false;
				/* We have to invalidate as we cleared the pte */
				mmu_notifier_invalidate_range(mm, address,
							address + PAGE_SIZE);
				page_vma_mapped_walk_done(&pvmw);
				break;
			}

			/* MADV_FREE page check */
			if (!PageSwapBacked(page)) {
				if (!PageDirty(page)) {
					/* Invalidate as we cleared the pte */
					mmu_notifier_invalidate_range(mm,
						 address, address + PAGE_SIZE);
					dec_mm_counter(mm, MM_ANONPAGES);
					goto discard;
				}

				/*
				 * If the page was redirtied, it cannot be
				 * discarded. Remap the page to the page table.
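				 * (i.e. restore the original pte below and set
				 * PG_swapbacked again, so the page is no longer
				 * treated as lazily freeable.)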
				 */
				set_pte_at(mm, address, pvmw.pte, pteval);
				SetPageSwapBacked(page);
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}

			if (swap_duplicate(entry) < 0) {
				set_pte_at(mm, address, pvmw.pte, pteval);
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
			if (arch_unmap_one(mm, vma, address, pteval) < 0) {
				set_pte_at(mm, address, pvmw.pte, pteval);
				ret = false;
				page_vma_mapped_walk_done(&pvmw);
				break;
			}
			if (list_empty(&mm->mmlist)) {
				spin_lock(&mmlist_lock);
				if (list_empty(&mm->mmlist))
					list_add(&mm->mmlist, &init_mm.mmlist);
				spin_unlock(&mmlist_lock);
			}
			dec_mm_counter(mm, MM_ANONPAGES);
			inc_mm_counter(mm, MM_SWAPENTS);
			swp_pte = swp_entry_to_pte(entry);
			if (pte_soft_dirty(pteval))
				swp_pte = pte_swp_mksoft_dirty(swp_pte);
			set_pte_at(mm, address, pvmw.pte, swp_pte);
			/* Invalidate as we cleared the pte */
			mmu_notifier_invalidate_range(mm, address,
						      address + PAGE_SIZE);
		} else {
			/*
			 * We should not need to notify here, as we reach this
			 * case only from freeze_page(), which itself is only
			 * called from split_huge_page_to_list(), so everything
			 * below must be true:
			 *   - page is not anonymous
			 *   - page is locked
			 *
			 * So, as it is a locked file-backed page, it cannot be
			 * removed from the page cache and replaced by a new
			 * page before mmu_notifier_invalidate_range_end, so no
			 * concurrent thread might update its page table to
			 * point at a new page while a device is still using
			 * this page.
			 *
			 * See Documentation/vm/mmu_notifier.txt
			 */
			dec_mm_counter(mm, mm_counter_file(page));
		}
discard:
		/*
		 * No need to call mmu_notifier_invalidate_range() as it has
		 * been done above for all cases requiring it to happen under
		 * the page table lock, before
		 * mmu_notifier_invalidate_range_end().
		 *
		 * See Documentation/vm/mmu_notifier.txt
		 */
		page_remove_rmap(subpage, PageHuge(page));
		put_page(page);
	}

	mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);

	return ret;
}

bool is_vma_temporary_stack(struct vm_area_struct *vma)
{
	int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);

	if (!maybe_stack)
		return false;

	if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
					VM_STACK_INCOMPLETE_SETUP)
		return true;

	return false;
}

static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
{
	return is_vma_temporary_stack(vma);
}

static int page_mapcount_is_zero(struct page *page)
{
	return !total_mapcount(page);
}

/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 * @flags: action and flags
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path. Caller must hold the page lock.
 *
 * If unmap is successful, return true. Otherwise, false.
 */
bool try_to_unmap(struct page *page, enum ttu_flags flags)
{
	struct rmap_walk_control rwc = {
		.rmap_one = try_to_unmap_one,
		.arg = (void *)flags,
		.done = page_mapcount_is_zero,
		.anon_lock = page_lock_anon_vma_read,
	};

	/*
	 * During exec, a temporary VMA is set up and later moved.
	 * The VMA is moved under the anon_vma lock but not the
	 * page tables, leading to a race where migration cannot
	 * find the migration ptes. Rather than increasing the
	 * locking requirements of exec(), migration skips
	 * temporary VMAs until after exec() completes.
	 */
	if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))
	    && !PageKsm(page) && PageAnon(page))
		rwc.invalid_vma = invalid_migration_vma;

	if (flags & TTU_RMAP_LOCKED)
		rmap_walk_locked(page, &rwc);
	else
		rmap_walk(page, &rwc);

	return !page_mapcount(page) ? true : false;
}

static int page_not_mapped(struct page *page)
{
	return !page_mapped(page);
}

/**
 * try_to_munlock - try to munlock a page
 * @page: the page to be munlocked
 *
 * Called from munlock code. Checks all of the VMAs mapping the page
 * to make sure nobody else has this page mlocked. The page will be
 * returned with PG_mlocked cleared if no other vmas have it mlocked.
 */
void try_to_munlock(struct page *page)
{
	struct rmap_walk_control rwc = {
		.rmap_one = try_to_unmap_one,
		.arg = (void *)TTU_MUNLOCK,
		.done = page_not_mapped,
		.anon_lock = page_lock_anon_vma_read,
	};

	VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
	VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);

	rmap_walk(page, &rwc);
}

void __put_anon_vma(struct anon_vma *anon_vma)
{
	struct anon_vma *root = anon_vma->root;

	anon_vma_free(anon_vma);
	if (root != anon_vma && atomic_dec_and_test(&root->refcount))
		anon_vma_free(root);
}

static struct anon_vma *rmap_walk_anon_lock(struct page *page,
					struct rmap_walk_control *rwc)
{
	struct anon_vma *anon_vma;

	if (rwc->anon_lock)
		return rwc->anon_lock(page);

	/*
	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
	 * because that depends on page_mapped(); but not all its usages
	 * are holding mmap_sem. Users without mmap_sem are required to
	 * take a reference count to prevent the anon_vma disappearing.
	 */
	anon_vma = page_anon_vma(page);
	if (!anon_vma)
		return NULL;

	anon_vma_lock_read(anon_vma);
	return anon_vma;
}

/*
 * rmap_walk_anon - do something to anonymous page using the object-based
 * rmap method
 * @page: the page to be handled
 * @rwc: control variable according to each walk type
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the anon_vma struct it points to.
 *
 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
 * where the page was found will be held for write. So, we won't recheck
 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
 * LOCKED.
 */
static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
		bool locked)
{
	struct anon_vma *anon_vma;
	pgoff_t pgoff_start, pgoff_end;
	struct anon_vma_chain *avc;

	if (locked) {
		anon_vma = page_anon_vma(page);
		/* anon_vma disappear under us?
		 */
		VM_BUG_ON_PAGE(!anon_vma, page);
	} else {
		anon_vma = rmap_walk_anon_lock(page, rwc);
	}
	if (!anon_vma)
		return;

	pgoff_start = page_to_pgoff(page);
	pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
			pgoff_start, pgoff_end) {
		struct vm_area_struct *vma = avc->vma;
		unsigned long address = vma_address(page, vma);

		cond_resched();

		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
			continue;

		if (!rwc->rmap_one(page, vma, address, rwc->arg))
			break;
		if (rwc->done && rwc->done(page))
			break;
	}

	if (!locked)
		anon_vma_unlock_read(anon_vma);
}

/*
 * rmap_walk_file - do something to file page using the object-based rmap method
 * @page: the page to be handled
 * @rwc: control variable according to each walk type
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the address_space struct it points to.
 *
 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
 * where the page was found will be held for write. So, we won't recheck
 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
 * LOCKED.
 */
static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
		bool locked)
{
	struct address_space *mapping = page_mapping(page);
	pgoff_t pgoff_start, pgoff_end;
	struct vm_area_struct *vma;

	/*
	 * The page lock not only makes sure that page->mapping cannot
	 * suddenly be NULLified by truncation, it makes sure that the
	 * structure at mapping cannot be freed and reused yet,
	 * so we can safely take mapping->i_mmap_rwsem.
	 */
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	if (!mapping)
		return;

	pgoff_start = page_to_pgoff(page);
	pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
	if (!locked)
		i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap,
			pgoff_start, pgoff_end) {
		unsigned long address = vma_address(page, vma);

		cond_resched();

		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
			continue;

		if (!rwc->rmap_one(page, vma, address, rwc->arg))
			goto done;
		if (rwc->done && rwc->done(page))
			goto done;
	}

done:
	if (!locked)
		i_mmap_unlock_read(mapping);
}

void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
{
	if (unlikely(PageKsm(page)))
		rmap_walk_ksm(page, rwc);
	else if (PageAnon(page))
		rmap_walk_anon(page, rwc, false);
	else
		rmap_walk_file(page, rwc, false);
}

/* Like rmap_walk, but caller holds relevant rmap lock */
void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
{
	/* no ksm support for now */
	VM_BUG_ON_PAGE(PageKsm(page), page);
	if (PageAnon(page))
		rmap_walk_anon(page, rwc, true);
	else
		rmap_walk_file(page, rwc, true);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * The following three functions are for anonymous (private mapped) hugepages.
 * Unlike common anonymous pages, anonymous hugepages have no accounting code
 * and no lru code, because we handle hugepages differently from common pages.
 */
static void __hugepage_set_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address, int exclusive)
{
	struct anon_vma *anon_vma = vma->anon_vma;

	BUG_ON(!anon_vma);

	if (PageAnon(page))
		return;
	if (!exclusive)
		anon_vma = anon_vma->root;

	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
	page->mapping = (struct address_space *) anon_vma;
	page->index = linear_page_index(vma, address);
}

void hugepage_add_anon_rmap(struct page *page,
			    struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = vma->anon_vma;
	int first;

	BUG_ON(!PageLocked(page));
	BUG_ON(!anon_vma);
	/* address might be in next vma when migration races vma_adjust */
	first = atomic_inc_and_test(compound_mapcount_ptr(page));
	if (first)
		__hugepage_set_anon_rmap(page, vma, address, 0);
}

void hugepage_add_new_anon_rmap(struct page *page,
			struct vm_area_struct *vma, unsigned long address)
{
	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
	atomic_set(compound_mapcount_ptr(page), 0);
	__hugepage_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */