// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 2009  Red Hat, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/dax.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"

/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

static struct shrinker deferred_split_shrinker;

static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;

bool transparent_hugepage_enabled(struct vm_area_struct *vma)
{
	/* The addr is used to check if the vma size fits */
	unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;

	if (!transhuge_vma_suitable(vma, addr))
		return false;
	if (vma_is_anonymous(vma))
		return __transparent_hugepage_enabled(vma);
	if (vma_is_shmem(vma))
		return shmem_huge_enabled(vma);

	return false;
}

static struct page *get_huge_zero_page(void)
{
	struct page *zero_page;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return READ_ONCE(huge_zero_page);

	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_page) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return NULL;
	}
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	preempt_disable();
	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
		preempt_enable();
		__free_pages(zero_page, compound_order(zero_page));
		goto retry;
	}

	/* We take an additional reference here; it will be put back by the shrinker */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	return READ_ONCE(huge_zero_page);
}

static void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}

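/*
 * Lifecycle of the huge zero page, as implemented above: the first
 * get_huge_zero_page() call allocates it and leaves the refcount at 2 (one
 * for the caller, one "cached" reference). Each mm takes at most one
 * reference, tracked by MMF_HUGE_ZERO_PAGE, and drops it again through
 * mm_put_huge_zero_page() when the mm goes away. Once only the cached
 * reference remains, the shrinker below may free the page under memory
 * pressure.
 */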
struct page *mm_get_huge_zero_page(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return READ_ONCE(huge_zero_page);

	if (!get_huge_zero_page())
		return NULL;

	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();

	return READ_ONCE(huge_zero_page);
}

void mm_put_huge_zero_page(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();
}

static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct page *zero_page = xchg(&huge_zero_page, NULL);
		BUG_ON(zero_page == NULL);
		__free_pages(zero_page, compound_order(zero_page));
		return HPAGE_PMD_NR;
	}

	return 0;
}

static struct shrinker huge_zero_page_shrinker = {
	.count_objects = shrink_huge_zero_page_count,
	.scan_objects = shrink_huge_zero_page_scan,
	.seeks = DEFAULT_SEEKS,
};

#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "[always] madvise never\n");
	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always [madvise] never\n");
	else
		return sprintf(buf, "always madvise [never]\n");
}

static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret = count;

	if (!memcmp("always", buf,
		    min(sizeof("always")-1, count))) {
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
	} else if (!memcmp("madvise", buf,
			   min(sizeof("madvise")-1, count))) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (!memcmp("never", buf,
			   min(sizeof("never")-1, count))) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err = start_stop_khugepaged();
		if (err)
			ret = err;
	}
	return ret;
}
static struct kobj_attribute enabled_attr =
	__ATTR(enabled, 0644, enabled_show, enabled_store);

ssize_t single_hugepage_flag_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf,
				enum transparent_hugepage_flag flag)
{
	return sprintf(buf, "%d\n",
		       !!test_bit(flag, &transparent_hugepage_flags));
}

ssize_t single_hugepage_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}

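/*
 * The "defrag" knob below is exposed as
 * /sys/kernel/mm/transparent_hugepage/defrag and selects how much work a
 * THP allocation may do to obtain a huge page (direct compaction, a
 * kswapd/kcompactd wakeup, or nothing at all), e.g.:
 *
 *	echo defer+madvise > /sys/kernel/mm/transparent_hugepage/defrag
 *
 * The selected mode is turned into a gfp mask by
 * alloc_hugepage_direct_gfpmask() further down in this file.
 */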
static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "[always] defer defer+madvise madvise never\n");
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always [defer] defer+madvise madvise never\n");
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always defer [defer+madvise] madvise never\n");
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always defer defer+madvise [madvise] never\n");
	return sprintf(buf, "always defer defer+madvise madvise [never]\n");
}

static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	if (!memcmp("always", buf,
		    min(sizeof("always")-1, count))) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
	} else if (!memcmp("defer+madvise", buf,
			   min(sizeof("defer+madvise")-1, count))) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
	} else if (!memcmp("defer", buf,
			   min(sizeof("defer")-1, count))) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
	} else if (!memcmp("madvise", buf,
			   min(sizeof("madvise")-1, count))) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (!memcmp("never", buf,
			   min(sizeof("never")-1, count))) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}
static struct kobj_attribute defrag_attr =
	__ATTR(defrag, 0644, defrag_show, defrag_store);

static ssize_t use_zero_page_show(struct kobject *kobj,
		struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr =
	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);

static ssize_t hpage_pmd_size_show(struct kobject *kobj,
		struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE);
}
static struct kobj_attribute hpage_pmd_size_attr =
	__ATTR_RO(hpage_pmd_size);

#ifdef CONFIG_DEBUG_VM
static ssize_t debug_cow_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
}
static ssize_t debug_cow_store(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
}
static struct kobj_attribute debug_cow_attr =
	__ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
#endif /* CONFIG_DEBUG_VM */

static struct attribute *hugepage_attr[] = {
	&enabled_attr.attr,
	&defrag_attr.attr,
	&use_zero_page_attr.attr,
	&hpage_pmd_size_attr.attr,
#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
	&shmem_enabled_attr.attr,
#endif
#ifdef CONFIG_DEBUG_VM
	&debug_cow_attr.attr,
#endif
	NULL,
};

static const struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
};

static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	int err;

	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!*hugepage_kobj)) {
		pr_err("failed to create transparent hugepage kobject\n");
		return -ENOMEM;
	}

	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto delete_obj;
	}

	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto remove_hp_group;
	}

	return 0;

remove_hp_group:
	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
	kobject_put(*hugepage_kobj);
	return err;
}

static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
	kobject_put(hugepage_kobj);
}
#else
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

static int __init hugepage_init(void)
{
	int err;
	struct kobject *hugepage_kobj;

	if (!has_transparent_hugepage()) {
		transparent_hugepage_flags = 0;
		return -EINVAL;
	}

	/*
	 * hugepages can't be allocated by the buddy allocator
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
	/*
	 * we use page->mapping and page->index in second tail page
	 * as list_head: assuming THP order >= 2
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);

	err = hugepage_init_sysfs(&hugepage_kobj);
	if (err)
		goto err_sysfs;

	err = khugepaged_init();
	if (err)
		goto err_slab;

	err = register_shrinker(&huge_zero_page_shrinker);
	if (err)
		goto err_hzp_shrinker;
	err = register_shrinker(&deferred_split_shrinker);
	if (err)
		goto err_split_shrinker;

	/*
	 * By default disable transparent hugepages on smaller systems,
	 * where the extra memory used could hurt more than TLB overhead
	 * is likely to save.  The admin can still enable it through /sys.
	 */
	if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
		transparent_hugepage_flags = 0;
		return 0;
	}

	err = start_stop_khugepaged();
	if (err)
		goto err_khugepaged;

	return 0;
err_khugepaged:
	unregister_shrinker(&deferred_split_shrinker);
err_split_shrinker:
	unregister_shrinker(&huge_zero_page_shrinker);
err_hzp_shrinker:
	khugepaged_destroy();
err_slab:
	hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
	return err;
}
subsys_initcall(hugepage_init);

static int __init setup_transparent_hugepage(char *str)
{
	int ret = 0;
	if (!str)
		goto out;
	if (!strcmp(str, "always")) {
		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
			&transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			&transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("transparent_hugepage= cannot parse, ignored\n");
	return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
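/*
 * Note: the transparent_hugepage= boot parameter handled above accepts the
 * same keywords as the sysfs "enabled" knob, so booting with
 * transparent_hugepage=madvise leaves the system in roughly the same state
 * as writing "madvise" to /sys/kernel/mm/transparent_hugepage/enabled after
 * boot.
 */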

pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd);
	return pmd;
}

static inline struct list_head *page_deferred_list(struct page *page)
{
	/* ->lru in the tail pages is occupied by compound_head. */
	return &page[2].deferred_list;
}

void prep_transhuge_page(struct page *page)
{
	/*
	 * we use page->mapping and page->index in second tail page
	 * as list_head: assuming THP order >= 2
	 */

	INIT_LIST_HEAD(page_deferred_list(page));
	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}

static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
		loff_t off, unsigned long flags, unsigned long size)
{
	unsigned long addr;
	loff_t off_end = off + len;
	loff_t off_align = round_up(off, size);
	unsigned long len_pad;

	if (off_end <= off_align || (off_end - off_align) < size)
		return 0;

	len_pad = len + size;
	if (len_pad < len || (off + len_pad) < off)
		return 0;

	addr = current->mm->get_unmapped_area(filp, 0, len_pad,
					      off >> PAGE_SHIFT, flags);
	if (IS_ERR_VALUE(addr))
		return 0;

	addr += (off - addr) & (size - 1);
	return addr;
}

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;

	if (addr)
		goto out;
	if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
		goto out;

	addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
	if (addr)
		return addr;

out:
	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);

static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
			struct page *page, gfp_t gfp)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mem_cgroup *memcg;
	pgtable_t pgtable;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	vm_fault_t ret = 0;

	VM_BUG_ON_PAGE(!PageCompound(page), page);

	if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
		put_page(page);
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}

	pgtable = pte_alloc_one(vma->vm_mm);
	if (unlikely(!pgtable)) {
		ret = VM_FAULT_OOM;
		goto release;
	}

	clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * clear_huge_page writes become visible before the set_pmd_at()
	 * write.
	 */
	__SetPageUptodate(page);

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd))) {
		goto unlock_release;
	} else {
		pmd_t entry;

		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock_release;

		/* Deliver the page fault to userland */
		if (userfaultfd_missing(vma)) {
			vm_fault_t ret2;

			spin_unlock(vmf->ptl);
			mem_cgroup_cancel_charge(page, memcg, true);
			put_page(page);
			pte_free(vma->vm_mm, pgtable);
			ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
			VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
			return ret2;
		}

		entry = mk_huge_pmd(page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		page_add_new_anon_rmap(page, vma, haddr, true);
		mem_cgroup_commit_charge(page, memcg, false, true);
		lru_cache_add_active_or_unevictable(page, vma);
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm_inc_nr_ptes(vma->vm_mm);
		spin_unlock(vmf->ptl);
		count_vm_event(THP_FAULT_ALLOC);
		count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
	}

	return 0;
unlock_release:
	spin_unlock(vmf->ptl);
release:
	if (pgtable)
		pte_free(vma->vm_mm, pgtable);
	mem_cgroup_cancel_charge(page, memcg, true);
	put_page(page);
	return ret;

}

/*
 * always: directly stall for all thp allocations
 * defer: wake kswapd and fail if not immediately available
 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 *		  fail if not immediately available
 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 *	    available
 * never: never stall for any thp allocation
 */
static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
{
	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);

	/* Always do synchronous compaction */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);

	/* Kick kcompactd and fail quickly */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;

	/* Synchronous compaction if madvised, otherwise kick kcompactd */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM :
					__GFP_KSWAPD_RECLAIM);

	/* Only do synchronous compaction if madvised */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
		       (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);

	return GFP_TRANSHUGE_LIGHT;
}
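/*
 * A vma counts as "madvised" for the checks above once userspace has called
 * madvise(start, len, MADV_HUGEPAGE) on the range, which sets VM_HUGEPAGE
 * in vma->vm_flags via hugepage_madvise().
 */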

/* Caller must hold page table lock. */
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
		struct page *zero_page)
{
	pmd_t entry;
	if (!pmd_none(*pmd))
		return false;
	entry = mk_pmd(zero_page, vma->vm_page_prot);
	entry = pmd_mkhuge(entry);
	if (pgtable)
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, haddr, pmd, entry);
	mm_inc_nr_ptes(mm);
	return true;
}

vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	gfp_t gfp;
	struct page *page;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;

	if (!transhuge_vma_suitable(vma, haddr))
		return VM_FAULT_FALLBACK;
	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;
	if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
		return VM_FAULT_OOM;
	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm) &&
			transparent_hugepage_use_zero_page()) {
		pgtable_t pgtable;
		struct page *zero_page;
		bool set;
		vm_fault_t ret;
		pgtable = pte_alloc_one(vma->vm_mm);
		if (unlikely(!pgtable))
			return VM_FAULT_OOM;
		zero_page = mm_get_huge_zero_page(vma->vm_mm);
		if (unlikely(!zero_page)) {
			pte_free(vma->vm_mm, pgtable);
			count_vm_event(THP_FAULT_FALLBACK);
			return VM_FAULT_FALLBACK;
		}
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		ret = 0;
		set = false;
		if (pmd_none(*vmf->pmd)) {
			ret = check_stable_address_space(vma->vm_mm);
			if (ret) {
				spin_unlock(vmf->ptl);
			} else if (userfaultfd_missing(vma)) {
				spin_unlock(vmf->ptl);
				ret = handle_userfault(vmf, VM_UFFD_MISSING);
				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			} else {
				set_huge_zero_page(pgtable, vma->vm_mm, vma,
						   haddr, vmf->pmd, zero_page);
				spin_unlock(vmf->ptl);
				set = true;
			}
		} else
			spin_unlock(vmf->ptl);
		if (!set)
			pte_free(vma->vm_mm, pgtable);
		return ret;
	}
	gfp = alloc_hugepage_direct_gfpmask(vma);
	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
	if (unlikely(!page)) {
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}
	prep_transhuge_page(page);
	return __do_huge_pmd_anonymous_page(vmf, page, gfp);
}

static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
		pgtable_t pgtable)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t entry;
	spinlock_t *ptl;

	ptl = pmd_lock(mm, pmd);
	if (!pmd_none(*pmd)) {
		if (write) {
			if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
				goto out_unlock;
			}
			entry = pmd_mkyoung(*pmd);
			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
			if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
				update_mmu_cache_pmd(vma, addr, pmd);
		}

		goto out_unlock;
	}

	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pmd_mkdevmap(entry);
	if (write) {
		entry = pmd_mkyoung(pmd_mkdirty(entry));
		entry = maybe_pmd_mkwrite(entry, vma);
	}

	if (pgtable) {
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
		mm_inc_nr_ptes(mm);
		pgtable = NULL;
	}

	set_pmd_at(mm, addr, pmd, entry);
	update_mmu_cache_pmd(vma, addr, pmd);

out_unlock:
	spin_unlock(ptl);
	if (pgtable)
		pte_free(mm, pgtable);
}
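/*
 * vmf_insert_pfn_pmd() below is meant for ->huge_fault handlers of drivers
 * and filesystems that map device memory (DAX, for example), which install
 * a PMD-sized pfn mapping directly instead of faulting in individual
 * struct pages.
 */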
vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
{
	unsigned long addr = vmf->address & PMD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	pgprot_t pgprot = vma->vm_page_prot;
	pgtable_t pgtable = NULL;

	/*
	 * If we had pmd_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
			!pfn_t_devmap(pfn));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
			(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	if (arch_needs_pgtable_deposit()) {
		pgtable = pte_alloc_one(vma->vm_mm);
		if (!pgtable)
			return VM_FAULT_OOM;
	}

	track_pfn_insert(vma, &pgprot, pfn);

	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pud = pud_mkwrite(pud);
	return pud;
}

static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
		pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
{
	struct mm_struct *mm = vma->vm_mm;
	pud_t entry;
	spinlock_t *ptl;

	ptl = pud_lock(mm, pud);
	if (!pud_none(*pud)) {
		if (write) {
			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
				goto out_unlock;
			}
			entry = pud_mkyoung(*pud);
			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
				update_mmu_cache_pud(vma, addr, pud);
		}
		goto out_unlock;
	}

	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pud_mkdevmap(entry);
	if (write) {
		entry = pud_mkyoung(pud_mkdirty(entry));
		entry = maybe_pud_mkwrite(entry, vma);
	}
	set_pud_at(mm, addr, pud, entry);
	update_mmu_cache_pud(vma, addr, pud);

out_unlock:
	spin_unlock(ptl);
}

vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
{
	unsigned long addr = vmf->address & PUD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	pgprot_t pgprot = vma->vm_page_prot;

	/*
	 * If we had pud_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
			!pfn_t_devmap(pfn));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
			(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, pfn);

	insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, int flags)
{
	pmd_t _pmd;

	_pmd = pmd_mkyoung(*pmd);
	if (flags & FOLL_WRITE)
		_pmd = pmd_mkdirty(_pmd);
	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
				pmd, _pmd, flags & FOLL_WRITE))
		update_mmu_cache_pmd(vma, addr, pmd);
}

struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
	unsigned long pfn = pmd_pfn(*pmd);
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	/*
	 * When we COW a devmap PMD entry, we split it into PTEs, so we should
	 * not be in this function with `flags & FOLL_COW` set.
	 */
	WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");

	if (flags & FOLL_WRITE && !pmd_write(*pmd))
		return NULL;

	if (pmd_present(*pmd) && pmd_devmap(*pmd))
		/* pass */;
	else
		return NULL;

	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd, flags);

	/*
	 * device mapped pages can only be returned if the
	 * caller will manage the page reference count.
	 */
	if (!(flags & FOLL_GET))
		return ERR_PTR(-EEXIST);

	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
	*pgmap = get_dev_pagemap(pfn, *pgmap);
	if (!*pgmap)
		return ERR_PTR(-EFAULT);
	page = pfn_to_page(pfn);
	get_page(page);

	return page;
}

int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	struct page *src_page;
	pmd_t pmd;
	pgtable_t pgtable = NULL;
	int ret = -ENOMEM;

	/* Skip if it can be refilled on fault */
	if (!vma_is_anonymous(vma))
		return 0;

	pgtable = pte_alloc_one(dst_mm);
	if (unlikely(!pgtable))
		goto out;

	dst_ptl = pmd_lock(dst_mm, dst_pmd);
	src_ptl = pmd_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pmd = *src_pmd;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
	if (unlikely(is_swap_pmd(pmd))) {
		swp_entry_t entry = pmd_to_swp_entry(pmd);

		VM_BUG_ON(!is_pmd_migration_entry(pmd));
		if (is_write_migration_entry(entry)) {
			make_migration_entry_read(&entry);
			pmd = swp_entry_to_pmd(entry);
			if (pmd_swp_soft_dirty(*src_pmd))
				pmd = pmd_swp_mksoft_dirty(pmd);
			set_pmd_at(src_mm, addr, src_pmd, pmd);
		}
		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm_inc_nr_ptes(dst_mm);
		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
		ret = 0;
		goto out_unlock;
	}
#endif

	if (unlikely(!pmd_trans_huge(pmd))) {
		pte_free(dst_mm, pgtable);
		goto out_unlock;
	}
	/*
	 * When page table lock is held, the huge zero pmd should not be
	 * under splitting since we don't split the page itself, only the
	 * pmd into a page table.
	 */
	if (is_huge_zero_pmd(pmd)) {
		struct page *zero_page;
		/*
		 * get_huge_zero_page() will never allocate a new page here,
		 * since we already have a zero page to copy. It just takes a
		 * reference.
		 */
		zero_page = mm_get_huge_zero_page(dst_mm);
		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
				zero_page);
		ret = 0;
		goto out_unlock;
	}

	src_page = pmd_page(pmd);
	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
	get_page(src_page);
	page_dup_rmap(src_page, true);
	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
	mm_inc_nr_ptes(dst_mm);
	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);

	pmdp_set_wrprotect(src_mm, addr, src_pmd);
	pmd = pmd_mkold(pmd_wrprotect(pmd));
	set_pmd_at(dst_mm, addr, dst_pmd, pmd);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
out:
	return ret;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
		pud_t *pud, int flags)
{
	pud_t _pud;

	_pud = pud_mkyoung(*pud);
	if (flags & FOLL_WRITE)
		_pud = pud_mkdirty(_pud);
	if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
				pud, _pud, flags & FOLL_WRITE))
		update_mmu_cache_pud(vma, addr, pud);
}

struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
		pud_t *pud, int flags, struct dev_pagemap **pgmap)
{
	unsigned long pfn = pud_pfn(*pud);
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;

	assert_spin_locked(pud_lockptr(mm, pud));

	if (flags & FOLL_WRITE && !pud_write(*pud))
		return NULL;

	if (pud_present(*pud) && pud_devmap(*pud))
		/* pass */;
	else
		return NULL;

	if (flags & FOLL_TOUCH)
		touch_pud(vma, addr, pud, flags);

	/*
	 * device mapped pages can only be returned if the
	 * caller will manage the page reference count.
	 */
	if (!(flags & FOLL_GET))
		return ERR_PTR(-EEXIST);

	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
	*pgmap = get_dev_pagemap(pfn, *pgmap);
	if (!*pgmap)
		return ERR_PTR(-EFAULT);
	page = pfn_to_page(pfn);
	get_page(page);

	return page;
}

int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
		  struct vm_area_struct *vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	pud_t pud;
	int ret;

	dst_ptl = pud_lock(dst_mm, dst_pud);
	src_ptl = pud_lockptr(src_mm, src_pud);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pud = *src_pud;
	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
		goto out_unlock;

	/*
	 * When page table lock is held, the huge zero pud should not be
	 * under splitting since we don't split the page itself, only the
	 * pud into a page table.
	 */
	if (is_huge_zero_pud(pud)) {
		/* No huge zero pud yet */
	}

	pudp_set_wrprotect(src_mm, addr, src_pud);
	pud = pud_mkold(pud_wrprotect(pud));
	set_pud_at(dst_mm, addr, dst_pud, pud);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
	return ret;
}

void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
	pud_t entry;
	unsigned long haddr;
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
	if (unlikely(!pud_same(*vmf->pud, orig_pud)))
		goto unlock;

	entry = pud_mkyoung(orig_pud);
	if (write)
		entry = pud_mkdirty(entry);
	haddr = vmf->address & HPAGE_PUD_MASK;
	if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
		update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);

unlock:
	spin_unlock(vmf->ptl);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
{
	pmd_t entry;
	unsigned long haddr;
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
		goto unlock;

	entry = pmd_mkyoung(orig_pmd);
	if (write)
		entry = pmd_mkdirty(entry);
	haddr = vmf->address & HPAGE_PMD_MASK;
	if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
		update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);

unlock:
	spin_unlock(vmf->ptl);
}
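/*
 * Fallback path for write-protect faults on a huge page when a fresh huge
 * page cannot be allocated: the PMD mapping is replaced with a page table
 * of HPAGE_PMD_NR small pages, each populated with a copy of the
 * corresponding subpage, so the fault can still be served with base pages.
 */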
static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
			pmd_t orig_pmd, struct page *page)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	struct mem_cgroup *memcg;
	pgtable_t pgtable;
	pmd_t _pmd;
	int i;
	vm_fault_t ret = 0;
	struct page **pages;
	struct mmu_notifier_range range;

	pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
			      GFP_KERNEL);
	if (unlikely(!pages)) {
		ret |= VM_FAULT_OOM;
		goto out;
	}

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
					       vmf->address, page_to_nid(page));
		if (unlikely(!pages[i] ||
			     mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
				     GFP_KERNEL, &memcg, false))) {
			if (pages[i])
				put_page(pages[i]);
			while (--i >= 0) {
				memcg = (void *)page_private(pages[i]);
				set_page_private(pages[i], 0);
				mem_cgroup_cancel_charge(pages[i], memcg,
						false);
				put_page(pages[i]);
			}
			kfree(pages);
			ret |= VM_FAULT_OOM;
			goto out;
		}
		set_page_private(pages[i], (unsigned long)memcg);
	}

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		copy_user_highpage(pages[i], page + i,
				   haddr + PAGE_SIZE * i, vma);
		__SetPageUptodate(pages[i]);
		cond_resched();
	}

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				haddr, haddr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
		goto out_free_pages;
	VM_BUG_ON_PAGE(!PageHead(page), page);

	/*
	 * Leave pmd empty until pte is filled. Note that we must notify here,
	 * as a concurrent CPU thread might write to the new page before the
	 * call to mmu_notifier_invalidate_range_end() happens, which can lead
	 * to a device seeing memory writes in a different order than the CPU.
	 *
	 * See Documentation/vm/mmu_notifier.rst
	 */
	pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);

	pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
	pmd_populate(vma->vm_mm, &_pmd, pgtable);

	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
		pte_t entry;
		entry = mk_pte(pages[i], vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		memcg = (void *)page_private(pages[i]);
		set_page_private(pages[i], 0);
		page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
		mem_cgroup_commit_charge(pages[i], memcg, false, false);
		lru_cache_add_active_or_unevictable(pages[i], vma);
		vmf->pte = pte_offset_map(&_pmd, haddr);
		VM_BUG_ON(!pte_none(*vmf->pte));
		set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
		pte_unmap(vmf->pte);
	}
	kfree(pages);

	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
	page_remove_rmap(page, true);
	spin_unlock(vmf->ptl);

	/*
	 * No need to double call mmu_notifier->invalidate_range() callback as
	 * the above pmdp_huge_clear_flush_notify() did already call it.
	 */
	mmu_notifier_invalidate_range_only_end(&range);

	ret |= VM_FAULT_WRITE;
	put_page(page);

out:
	return ret;

out_free_pages:
	spin_unlock(vmf->ptl);
	mmu_notifier_invalidate_range_end(&range);
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		memcg = (void *)page_private(pages[i]);
		set_page_private(pages[i], 0);
		mem_cgroup_cancel_charge(pages[i], memcg, false);
		put_page(pages[i]);
	}
	kfree(pages);
	goto out;
}

vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page = NULL, *new_page;
	struct mem_cgroup *memcg;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	struct mmu_notifier_range range;
	gfp_t huge_gfp;			/* for allocation and charge */
	vm_fault_t ret = 0;

	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
	VM_BUG_ON_VMA(!vma->anon_vma, vma);
	if (is_huge_zero_pmd(orig_pmd))
		goto alloc;
	spin_lock(vmf->ptl);
	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
		goto out_unlock;

	page = pmd_page(orig_pmd);
	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
	/*
	 * We can only reuse the page if nobody else maps the huge page or
	 * any part of it.
	 */
	if (!trylock_page(page)) {
		get_page(page);
		spin_unlock(vmf->ptl);
		lock_page(page);
		spin_lock(vmf->ptl);
		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
			unlock_page(page);
			put_page(page);
			goto out_unlock;
		}
		put_page(page);
	}
	if (reuse_swap_page(page, NULL)) {
		pmd_t entry;
		entry = pmd_mkyoung(orig_pmd);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
		ret |= VM_FAULT_WRITE;
		unlock_page(page);
		goto out_unlock;
	}
	unlock_page(page);
	get_page(page);
	spin_unlock(vmf->ptl);
alloc:
	if (__transparent_hugepage_enabled(vma) &&
	    !transparent_hugepage_debug_cow()) {
		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
	} else
		new_page = NULL;

	if (likely(new_page)) {
		prep_transhuge_page(new_page);
	} else {
		if (!page) {
			split_huge_pmd(vma, vmf->pmd, vmf->address);
			ret |= VM_FAULT_FALLBACK;
		} else {
			ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
			if (ret & VM_FAULT_OOM) {
				split_huge_pmd(vma, vmf->pmd, vmf->address);
				ret |= VM_FAULT_FALLBACK;
			}
			put_page(page);
		}
		count_vm_event(THP_FAULT_FALLBACK);
		goto out;
	}

	if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
					huge_gfp, &memcg, true))) {
		put_page(new_page);
		split_huge_pmd(vma, vmf->pmd, vmf->address);
		if (page)
			put_page(page);
		ret |= VM_FAULT_FALLBACK;
		count_vm_event(THP_FAULT_FALLBACK);
		goto out;
	}

	count_vm_event(THP_FAULT_ALLOC);
	count_memcg_events(memcg, THP_FAULT_ALLOC, 1);

	if (!page)
		clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
	else
		copy_user_huge_page(new_page, page, vmf->address,
				    vma, HPAGE_PMD_NR);
	__SetPageUptodate(new_page);

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				haddr, haddr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	spin_lock(vmf->ptl);
	if (page)
		put_page(page);
	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
		spin_unlock(vmf->ptl);
		mem_cgroup_cancel_charge(new_page, memcg, true);
		put_page(new_page);
		goto out_mn;
	} else {
		pmd_t entry;
		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
		page_add_new_anon_rmap(new_page, vma, haddr, true);
		mem_cgroup_commit_charge(new_page, memcg, false, true);
		lru_cache_add_active_or_unevictable(new_page, vma);
		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
		if (!page) {
			add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		} else {
			VM_BUG_ON_PAGE(!PageHead(page), page);
			page_remove_rmap(page, true);
			put_page(page);
		}
		ret |= VM_FAULT_WRITE;
	}
	spin_unlock(vmf->ptl);
out_mn:
	/*
	 * No need to double call mmu_notifier->invalidate_range() callback as
	 * the above pmdp_huge_clear_flush_notify() did already call it.
	 */
	mmu_notifier_invalidate_range_only_end(&range);
out:
	return ret;
out_unlock:
	spin_unlock(vmf->ptl);
	return ret;
}

/*
 * FOLL_FORCE can write to even unwritable pmd's, but only
 * after we've gone through a COW cycle and they are dirty.
 */
static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
{
	return pmd_write(pmd) ||
	       ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
}

struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
				   unsigned long addr,
				   pmd_t *pmd,
				   unsigned int flags)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page = NULL;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
		goto out;

	/* Avoid dumping huge zero page */
	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
		return ERR_PTR(-EFAULT);

	/* Full NUMA hinting faults to serialise migration in fault paths */
	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
		goto out;

	page = pmd_page(*pmd);
	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd, flags);
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/*
		 * We don't mlock() pte-mapped THPs. This way we can avoid
		 * leaking mlocked pages into non-VM_LOCKED VMAs.
		 *
		 * For anon THP:
		 *
		 * In most cases the pmd is the only mapping of the page as we
		 * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
		 * writable private mappings in populate_vma_page_range().
		 *
		 * The only scenario when we have the page shared here is if we
		 * are mlocking a read-only mapping shared over fork(). We skip
		 * mlocking such pages.
		 *
		 * For file THP:
		 *
		 * We can expect PageDoubleMap() to be stable under page lock:
		 * for file pages we set it in page_add_file_rmap(), which
		 * requires page to be locked.
		 */

		if (PageAnon(page) && compound_mapcount(page) != 1)
			goto skip_mlock;
		if (PageDoubleMap(page) || !page->mapping)
			goto skip_mlock;
		if (!trylock_page(page))
			goto skip_mlock;
		lru_add_drain();
		if (page->mapping && !PageDoubleMap(page))
			mlock_vma_page(page);
		unlock_page(page);
	}
skip_mlock:
	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
	if (flags & FOLL_GET)
		get_page(page);

out:
	return page;
}
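/*
 * The NUMA hinting fault handler below either migrates the THP to the node
 * that touched it (via migrate_misplaced_transhuge_page()) or, when
 * migration is not possible or not needed, simply restores the original
 * page protections that the PROT_NONE hinting fault had taken away.
 */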
/* NUMA hinting page fault entry point for trans huge pmds */
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
{
	struct vm_area_struct *vma = vmf->vma;
	struct anon_vma *anon_vma = NULL;
	struct page *page;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
	int target_nid, last_cpupid = -1;
	bool page_locked;
	bool migrated = false;
	bool was_writable;
	int flags = 0;

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(pmd, *vmf->pmd)))
		goto out_unlock;

	/*
	 * If there are potential migrations, wait for completion and retry
	 * without disrupting NUMA hinting information. Do not relock and
	 * check_same as the page may no longer be mapped.
	 */
	if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
		page = pmd_page(*vmf->pmd);
		if (!get_page_unless_zero(page))
			goto out_unlock;
		spin_unlock(vmf->ptl);
		put_and_wait_on_page_locked(page);
		goto out;
	}

	page = pmd_page(pmd);
	BUG_ON(is_huge_zero_page(page));
	page_nid = page_to_nid(page);
	last_cpupid = page_cpupid_last(page);
	count_vm_numa_event(NUMA_HINT_FAULTS);
	if (page_nid == this_nid) {
		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
		flags |= TNF_FAULT_LOCAL;
	}

	/* See similar comment in do_numa_page for explanation */
	if (!pmd_savedwrite(pmd))
		flags |= TNF_NO_GROUP;

	/*
	 * Acquire the page lock to serialise THP migrations but avoid dropping
	 * page_table_lock if at all possible
	 */
	page_locked = trylock_page(page);
	target_nid = mpol_misplaced(page, vma, haddr);
	if (target_nid == NUMA_NO_NODE) {
		/* If the page was locked, there are no parallel migrations */
		if (page_locked)
			goto clear_pmdnuma;
	}

	/* Migration could have started since the pmd_trans_migrating check */
	if (!page_locked) {
		page_nid = NUMA_NO_NODE;
		if (!get_page_unless_zero(page))
			goto out_unlock;
		spin_unlock(vmf->ptl);
		put_and_wait_on_page_locked(page);
		goto out;
	}

	/*
	 * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
	 * to serialise splits.
	 */
	get_page(page);
	spin_unlock(vmf->ptl);
	anon_vma = page_lock_anon_vma_read(page);

	/* Confirm the PMD did not change while page_table_lock was released */
	spin_lock(vmf->ptl);
	if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
		unlock_page(page);
		put_page(page);
		page_nid = NUMA_NO_NODE;
		goto out_unlock;
	}

	/* Bail if we fail to protect against THP splits for any reason */
	if (unlikely(!anon_vma)) {
		put_page(page);
		page_nid = NUMA_NO_NODE;
		goto clear_pmdnuma;
	}

	/*
	 * Since we took the NUMA fault, we must have observed the !accessible
	 * bit. Make sure all other CPUs agree with that, to avoid them
	 * modifying the page we're about to migrate.
	 *
	 * Must be done under PTL such that we'll observe the relevant
	 * inc_tlb_flush_pending().
	 *
	 * We are not sure a pending tlb flush here is for a huge page
	 * mapping or not. Hence use the tlb range variant.
	 */
	if (mm_tlb_flush_pending(vma->vm_mm)) {
		flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
		/*
		 * change_huge_pmd() released the pmd lock before
		 * invalidating the secondary MMUs sharing the primary
		 * MMU pagetables (with ->invalidate_range()). The
		 * mmu_notifier_invalidate_range_end() (which
		 * internally calls ->invalidate_range()) in
		 * change_pmd_range() will run after us, so we can't
		 * rely on it here and we need an explicit invalidate.
		 */
		mmu_notifier_invalidate_range(vma->vm_mm, haddr,
					      haddr + HPAGE_PMD_SIZE);
	}

	/*
	 * Migrate the THP to the requested node, returns with page unlocked
	 * and access rights restored.
	 */
	spin_unlock(vmf->ptl);

	migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
				vmf->pmd, pmd, vmf->address, page, target_nid);
	if (migrated) {
		flags |= TNF_MIGRATED;
		page_nid = target_nid;
	} else
		flags |= TNF_MIGRATE_FAIL;

	goto out;
clear_pmdnuma:
	BUG_ON(!PageLocked(page));
	was_writable = pmd_savedwrite(pmd);
	pmd = pmd_modify(pmd, vma->vm_page_prot);
	pmd = pmd_mkyoung(pmd);
	if (was_writable)
		pmd = pmd_mkwrite(pmd);
	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
	unlock_page(page);
out_unlock:
	spin_unlock(vmf->ptl);

out:
	if (anon_vma)
		page_unlock_anon_vma_read(anon_vma);

	if (page_nid != NUMA_NO_NODE)
		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
				flags);

	return 0;
}

/*
 * Return true if we do MADV_FREE successfully on entire pmd page.
 * Otherwise, return false.
 */
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long addr, unsigned long next)
{
	spinlock_t *ptl;
	pmd_t orig_pmd;
	struct page *page;
	struct mm_struct *mm = tlb->mm;
	bool ret = false;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		goto out_unlocked;

	orig_pmd = *pmd;
	if (is_huge_zero_pmd(orig_pmd))
		goto out;

	if (unlikely(!pmd_present(orig_pmd))) {
		VM_BUG_ON(thp_migration_supported() &&
				  !is_pmd_migration_entry(orig_pmd));
		goto out;
	}

	page = pmd_page(orig_pmd);
	/*
	 * If other processes are mapping this page, we can't discard
	 * the page unless they all do MADV_FREE, so let's skip it.
	 */
	if (page_mapcount(page) != 1)
		goto out;

	if (!trylock_page(page))
		goto out;

	/*
	 * If the user wants to discard only part of the THP, split it so
	 * MADV_FREE will deactivate just those pages.
	 */
	if (next - addr != HPAGE_PMD_SIZE) {
		get_page(page);
		spin_unlock(ptl);
		split_huge_page(page);
		unlock_page(page);
		put_page(page);
		goto out_unlocked;
	}

	if (PageDirty(page))
		ClearPageDirty(page);
	unlock_page(page);

	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
		pmdp_invalidate(vma, addr, pmd);
		orig_pmd = pmd_mkold(orig_pmd);
		orig_pmd = pmd_mkclean(orig_pmd);

		set_pmd_at(mm, addr, pmd, orig_pmd);
		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	}

	mark_page_lazyfree(page);
	ret = true;
out:
	spin_unlock(ptl);
out_unlocked:
	return ret;
}

static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
{
	pgtable_t pgtable;

	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	pte_free(mm, pgtable);
	mm_dec_nr_ptes(mm);
}

int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pmd_t *pmd, unsigned long addr)
{
	pmd_t orig_pmd;
	spinlock_t *ptl;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return 0;
	/*
	 * For architectures like ppc64 we look at deposited pgtable
	 * when calling pmdp_huge_get_and_clear. So do the
	 * pgtable_trans_huge_withdraw after finishing pmdp related
	 * operations.
	 */
	orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
			tlb->fullmm);
	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	if (vma_is_dax(vma)) {
		if (arch_needs_pgtable_deposit())
			zap_deposited_table(tlb->mm, pmd);
		spin_unlock(ptl);
		if (is_huge_zero_pmd(orig_pmd))
			tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
	} else if (is_huge_zero_pmd(orig_pmd)) {
		zap_deposited_table(tlb->mm, pmd);
		spin_unlock(ptl);
		tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
	} else {
		struct page *page = NULL;
		int flush_needed = 1;

		if (pmd_present(orig_pmd)) {
			page = pmd_page(orig_pmd);
			page_remove_rmap(page, true);
			VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
			VM_BUG_ON_PAGE(!PageHead(page), page);
		} else if (thp_migration_supported()) {
			swp_entry_t entry;

			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
			entry = pmd_to_swp_entry(orig_pmd);
			page = pfn_to_page(swp_offset(entry));
			flush_needed = 0;
		} else
			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");

		if (PageAnon(page)) {
			zap_deposited_table(tlb->mm, pmd);
			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
		} else {
			if (arch_needs_pgtable_deposit())
				zap_deposited_table(tlb->mm, pmd);
			add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
		}

		spin_unlock(ptl);
		if (flush_needed)
			tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
	}
	return 1;
}

#ifndef pmd_move_must_withdraw
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
					 spinlock_t *old_pmd_ptl,
					 struct vm_area_struct *vma)
{
	/*
	 * With split pmd lock we also need to move preallocated
	 * PTE page table if new_pmd is on different PMD page table.
	 *
	 * We also don't deposit and withdraw tables for file pages.
	 */
	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
}
#endif

static pmd_t move_soft_dirty_pmd(pmd_t pmd)
{
#ifdef CONFIG_MEM_SOFT_DIRTY
	if (unlikely(is_pmd_migration_entry(pmd)))
		pmd = pmd_swp_mksoft_dirty(pmd);
	else if (pmd_present(pmd))
		pmd = pmd_mksoft_dirty(pmd);
#endif
	return pmd;
}

bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
		  unsigned long new_addr, unsigned long old_end,
		  pmd_t *old_pmd, pmd_t *new_pmd)
{
	spinlock_t *old_ptl, *new_ptl;
	pmd_t pmd;
	struct mm_struct *mm = vma->vm_mm;
	bool force_flush = false;

	if ((old_addr & ~HPAGE_PMD_MASK) ||
	    (new_addr & ~HPAGE_PMD_MASK) ||
	    old_end - old_addr < HPAGE_PMD_SIZE)
		return false;

	/*
	 * The destination pmd shouldn't be established; free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON(!pmd_none(*new_pmd))) {
		VM_BUG_ON(pmd_trans_huge(*new_pmd));
		return false;
	}

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_sem prevents deadlock.
1874 */ 1875 old_ptl = __pmd_trans_huge_lock(old_pmd, vma); 1876 if (old_ptl) { 1877 new_ptl = pmd_lockptr(mm, new_pmd); 1878 if (new_ptl != old_ptl) 1879 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 1880 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); 1881 if (pmd_present(pmd)) 1882 force_flush = true; 1883 VM_BUG_ON(!pmd_none(*new_pmd)); 1884 1885 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) { 1886 pgtable_t pgtable; 1887 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); 1888 pgtable_trans_huge_deposit(mm, new_pmd, pgtable); 1889 } 1890 pmd = move_soft_dirty_pmd(pmd); 1891 set_pmd_at(mm, new_addr, new_pmd, pmd); 1892 if (force_flush) 1893 flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); 1894 if (new_ptl != old_ptl) 1895 spin_unlock(new_ptl); 1896 spin_unlock(old_ptl); 1897 return true; 1898 } 1899 return false; 1900 } 1901 1902 /* 1903 * Returns 1904 * - 0 if PMD could not be locked 1905 * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary 1906 * - HPAGE_PMD_NR if protections changed and TLB flush necessary 1907 */ 1908 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 1909 unsigned long addr, pgprot_t newprot, int prot_numa) 1910 { 1911 struct mm_struct *mm = vma->vm_mm; 1912 spinlock_t *ptl; 1913 pmd_t entry; 1914 bool preserve_write; 1915 int ret; 1916 1917 ptl = __pmd_trans_huge_lock(pmd, vma); 1918 if (!ptl) 1919 return 0; 1920 1921 preserve_write = prot_numa && pmd_write(*pmd); 1922 ret = 1; 1923 1924 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 1925 if (is_swap_pmd(*pmd)) { 1926 swp_entry_t entry = pmd_to_swp_entry(*pmd); 1927 1928 VM_BUG_ON(!is_pmd_migration_entry(*pmd)); 1929 if (is_write_migration_entry(entry)) { 1930 pmd_t newpmd; 1931 /* 1932 * A protection check is difficult so 1933 * just be safe and disable write 1934 */ 1935 make_migration_entry_read(&entry); 1936 newpmd = swp_entry_to_pmd(entry); 1937 if (pmd_swp_soft_dirty(*pmd)) 1938 newpmd = pmd_swp_mksoft_dirty(newpmd); 1939 set_pmd_at(mm, addr, pmd, newpmd); 1940 } 1941 goto unlock; 1942 } 1943 #endif 1944 1945 /* 1946 * Avoid trapping faults against the zero page. The read-only 1947 * data is likely to be read-cached on the local CPU and 1948 * local/remote hits to the zero page are not interesting. 1949 */ 1950 if (prot_numa && is_huge_zero_pmd(*pmd)) 1951 goto unlock; 1952 1953 if (prot_numa && pmd_protnone(*pmd)) 1954 goto unlock; 1955 1956 /* 1957 * In case prot_numa, we are under down_read(mmap_sem). It's critical 1958 * to not clear pmd intermittently to avoid race with MADV_DONTNEED 1959 * which is also under down_read(mmap_sem): 1960 * 1961 * CPU0: CPU1: 1962 * change_huge_pmd(prot_numa=1) 1963 * pmdp_huge_get_and_clear_notify() 1964 * madvise_dontneed() 1965 * zap_pmd_range() 1966 * pmd_trans_huge(*pmd) == 0 (without ptl) 1967 * // skip the pmd 1968 * set_pmd_at(); 1969 * // pmd is re-established 1970 * 1971 * The race makes MADV_DONTNEED miss the huge pmd and not clear it, 1972 * which may break userspace. 1973 * 1974 * pmdp_invalidate() is required to make sure we don't miss 1975 * dirty/young flags set by hardware.
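 * For that reason the code below never clears the pmd; it only modifies
 * the value returned by pmdp_invalidate(), roughly:
 *
 *	entry = pmdp_invalidate(vma, addr, pmd);
 *	entry = pmd_modify(entry, newprot);
 *	set_pmd_at(mm, addr, pmd, entry);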
1976 */ 1977 entry = pmdp_invalidate(vma, addr, pmd); 1978 1979 entry = pmd_modify(entry, newprot); 1980 if (preserve_write) 1981 entry = pmd_mk_savedwrite(entry); 1982 ret = HPAGE_PMD_NR; 1983 set_pmd_at(mm, addr, pmd, entry); 1984 BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); 1985 unlock: 1986 spin_unlock(ptl); 1987 return ret; 1988 } 1989 1990 /* 1991 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise. 1992 * 1993 * Note that if it returns page table lock pointer, this routine returns without 1994 * unlocking page table lock. So callers must unlock it. 1995 */ 1996 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) 1997 { 1998 spinlock_t *ptl; 1999 ptl = pmd_lock(vma->vm_mm, pmd); 2000 if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || 2001 pmd_devmap(*pmd))) 2002 return ptl; 2003 spin_unlock(ptl); 2004 return NULL; 2005 } 2006 2007 /* 2008 * Returns true if a given pud maps a thp, false otherwise. 2009 * 2010 * Note that if it returns true, this routine returns without unlocking page 2011 * table lock. So callers must unlock it. 2012 */ 2013 spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) 2014 { 2015 spinlock_t *ptl; 2016 2017 ptl = pud_lock(vma->vm_mm, pud); 2018 if (likely(pud_trans_huge(*pud) || pud_devmap(*pud))) 2019 return ptl; 2020 spin_unlock(ptl); 2021 return NULL; 2022 } 2023 2024 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 2025 int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, 2026 pud_t *pud, unsigned long addr) 2027 { 2028 spinlock_t *ptl; 2029 2030 ptl = __pud_trans_huge_lock(pud, vma); 2031 if (!ptl) 2032 return 0; 2033 /* 2034 * For architectures like ppc64 we look at deposited pgtable 2035 * when calling pudp_huge_get_and_clear. So do the 2036 * pgtable_trans_huge_withdraw after finishing pudp related 2037 * operations. 2038 */ 2039 pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm); 2040 tlb_remove_pud_tlb_entry(tlb, pud, addr); 2041 if (vma_is_dax(vma)) { 2042 spin_unlock(ptl); 2043 /* No zero page support yet */ 2044 } else { 2045 /* No support for anonymous PUD pages yet */ 2046 BUG(); 2047 } 2048 return 1; 2049 } 2050 2051 static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, 2052 unsigned long haddr) 2053 { 2054 VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); 2055 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 2056 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); 2057 VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud)); 2058 2059 count_vm_event(THP_SPLIT_PUD); 2060 2061 pudp_huge_clear_flush_notify(vma, haddr, pud); 2062 } 2063 2064 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, 2065 unsigned long address) 2066 { 2067 spinlock_t *ptl; 2068 struct mmu_notifier_range range; 2069 2070 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, 2071 address & HPAGE_PUD_MASK, 2072 (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); 2073 mmu_notifier_invalidate_range_start(&range); 2074 ptl = pud_lock(vma->vm_mm, pud); 2075 if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) 2076 goto out; 2077 __split_huge_pud_locked(vma, pud, range.start); 2078 2079 out: 2080 spin_unlock(ptl); 2081 /* 2082 * No need to double call mmu_notifier->invalidate_range() callback as 2083 * the above pudp_huge_clear_flush_notify() did already call it. 
2084 */ 2085 mmu_notifier_invalidate_range_only_end(&range); 2086 } 2087 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ 2088 2089 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, 2090 unsigned long haddr, pmd_t *pmd) 2091 { 2092 struct mm_struct *mm = vma->vm_mm; 2093 pgtable_t pgtable; 2094 pmd_t _pmd; 2095 int i; 2096 2097 /* 2098 * Leave pmd empty until pte is filled note that it is fine to delay 2099 * notification until mmu_notifier_invalidate_range_end() as we are 2100 * replacing a zero pmd write protected page with a zero pte write 2101 * protected page. 2102 * 2103 * See Documentation/vm/mmu_notifier.rst 2104 */ 2105 pmdp_huge_clear_flush(vma, haddr, pmd); 2106 2107 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2108 pmd_populate(mm, &_pmd, pgtable); 2109 2110 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 2111 pte_t *pte, entry; 2112 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); 2113 entry = pte_mkspecial(entry); 2114 pte = pte_offset_map(&_pmd, haddr); 2115 VM_BUG_ON(!pte_none(*pte)); 2116 set_pte_at(mm, haddr, pte, entry); 2117 pte_unmap(pte); 2118 } 2119 smp_wmb(); /* make pte visible before pmd */ 2120 pmd_populate(mm, pmd, pgtable); 2121 } 2122 2123 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, 2124 unsigned long haddr, bool freeze) 2125 { 2126 struct mm_struct *mm = vma->vm_mm; 2127 struct page *page; 2128 pgtable_t pgtable; 2129 pmd_t old_pmd, _pmd; 2130 bool young, write, soft_dirty, pmd_migration = false; 2131 unsigned long addr; 2132 int i; 2133 2134 VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); 2135 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 2136 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); 2137 VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) 2138 && !pmd_devmap(*pmd)); 2139 2140 count_vm_event(THP_SPLIT_PMD); 2141 2142 if (!vma_is_anonymous(vma)) { 2143 _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); 2144 /* 2145 * We are going to unmap this huge page. So 2146 * just go ahead and zap it 2147 */ 2148 if (arch_needs_pgtable_deposit()) 2149 zap_deposited_table(mm, pmd); 2150 if (vma_is_dax(vma)) 2151 return; 2152 page = pmd_page(_pmd); 2153 if (!PageDirty(page) && pmd_dirty(_pmd)) 2154 set_page_dirty(page); 2155 if (!PageReferenced(page) && pmd_young(_pmd)) 2156 SetPageReferenced(page); 2157 page_remove_rmap(page, true); 2158 put_page(page); 2159 add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); 2160 return; 2161 } else if (is_huge_zero_pmd(*pmd)) { 2162 /* 2163 * FIXME: Do we want to invalidate secondary mmu by calling 2164 * mmu_notifier_invalidate_range() see comments below inside 2165 * __split_huge_pmd() ? 2166 * 2167 * We are going from a zero huge page write protected to zero 2168 * small page also write protected so it does not seems useful 2169 * to invalidate secondary mmu at this time. 2170 */ 2171 return __split_huge_zero_page_pmd(vma, haddr, pmd); 2172 } 2173 2174 /* 2175 * Up to this point the pmd is present and huge and userland has the 2176 * whole access to the hugepage during the split (which happens in 2177 * place). If we overwrite the pmd with the not-huge version pointing 2178 * to the pte here (which of course we could if all CPUs were bug 2179 * free), userland could trigger a small page size TLB miss on the 2180 * small sized TLB while the hugepage TLB entry is still established in 2181 * the huge TLB. Some CPU doesn't like that. 2182 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum 2183 * 383 on page 93. 
Intel should be safe but it also warns that it's 2184 * only safe if the permission and cache attributes of the two entries 2185 * loaded in the two TLBs are identical (which should be the case here). 2186 * But it is generally safer to never allow small and huge TLB entries 2187 * for the same virtual address to be loaded simultaneously. So instead 2188 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the 2189 * current pmd notpresent (atomically because here the pmd_trans_huge 2190 * must remain set at all times on the pmd until the split is complete 2191 * for this pmd), then we flush the SMP TLB and finally we write the 2192 * non-huge version of the pmd entry with pmd_populate. 2193 */ 2194 old_pmd = pmdp_invalidate(vma, haddr, pmd); 2195 2196 pmd_migration = is_pmd_migration_entry(old_pmd); 2197 if (unlikely(pmd_migration)) { 2198 swp_entry_t entry; 2199 2200 entry = pmd_to_swp_entry(old_pmd); 2201 page = pfn_to_page(swp_offset(entry)); 2202 write = is_write_migration_entry(entry); 2203 young = false; 2204 soft_dirty = pmd_swp_soft_dirty(old_pmd); 2205 } else { 2206 page = pmd_page(old_pmd); 2207 if (pmd_dirty(old_pmd)) 2208 SetPageDirty(page); 2209 write = pmd_write(old_pmd); 2210 young = pmd_young(old_pmd); 2211 soft_dirty = pmd_soft_dirty(old_pmd); 2212 } 2213 VM_BUG_ON_PAGE(!page_count(page), page); 2214 page_ref_add(page, HPAGE_PMD_NR - 1); 2215 2216 /* 2217 * Withdraw the table only after we mark the pmd entry invalid. 2218 * This is critical for some architectures (Power). 2219 */ 2220 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2221 pmd_populate(mm, &_pmd, pgtable); 2222 2223 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { 2224 pte_t entry, *pte; 2225 /* 2226 * Note that NUMA hinting access restrictions are not 2227 * transferred to avoid any possibility of altering 2228 * permissions across VMAs. 2229 */ 2230 if (freeze || pmd_migration) { 2231 swp_entry_t swp_entry; 2232 swp_entry = make_migration_entry(page + i, write); 2233 entry = swp_entry_to_pte(swp_entry); 2234 if (soft_dirty) 2235 entry = pte_swp_mksoft_dirty(entry); 2236 } else { 2237 entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); 2238 entry = maybe_mkwrite(entry, vma); 2239 if (!write) 2240 entry = pte_wrprotect(entry); 2241 if (!young) 2242 entry = pte_mkold(entry); 2243 if (soft_dirty) 2244 entry = pte_mksoft_dirty(entry); 2245 } 2246 pte = pte_offset_map(&_pmd, addr); 2247 BUG_ON(!pte_none(*pte)); 2248 set_pte_at(mm, addr, pte, entry); 2249 atomic_inc(&page[i]._mapcount); 2250 pte_unmap(pte); 2251 } 2252 2253 /* 2254 * Set PG_double_map before dropping compound_mapcount to avoid 2255 * false-negative page_mapped(). 2256 */ 2257 if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) { 2258 for (i = 0; i < HPAGE_PMD_NR; i++) 2259 atomic_inc(&page[i]._mapcount); 2260 } 2261 2262 if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { 2263 /* Last compound_mapcount is gone.
*/ 2264 __dec_node_page_state(page, NR_ANON_THPS); 2265 if (TestClearPageDoubleMap(page)) { 2266 /* No need in mapcount reference anymore */ 2267 for (i = 0; i < HPAGE_PMD_NR; i++) 2268 atomic_dec(&page[i]._mapcount); 2269 } 2270 } 2271 2272 smp_wmb(); /* make pte visible before pmd */ 2273 pmd_populate(mm, pmd, pgtable); 2274 2275 if (freeze) { 2276 for (i = 0; i < HPAGE_PMD_NR; i++) { 2277 page_remove_rmap(page + i, false); 2278 put_page(page + i); 2279 } 2280 } 2281 } 2282 2283 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 2284 unsigned long address, bool freeze, struct page *page) 2285 { 2286 spinlock_t *ptl; 2287 struct mmu_notifier_range range; 2288 2289 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, 2290 address & HPAGE_PMD_MASK, 2291 (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); 2292 mmu_notifier_invalidate_range_start(&range); 2293 ptl = pmd_lock(vma->vm_mm, pmd); 2294 2295 /* 2296 * If caller asks to setup a migration entries, we need a page to check 2297 * pmd against. Otherwise we can end up replacing wrong page. 2298 */ 2299 VM_BUG_ON(freeze && !page); 2300 if (page && page != pmd_page(*pmd)) 2301 goto out; 2302 2303 if (pmd_trans_huge(*pmd)) { 2304 page = pmd_page(*pmd); 2305 if (PageMlocked(page)) 2306 clear_page_mlock(page); 2307 } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) 2308 goto out; 2309 __split_huge_pmd_locked(vma, pmd, range.start, freeze); 2310 out: 2311 spin_unlock(ptl); 2312 /* 2313 * No need to double call mmu_notifier->invalidate_range() callback. 2314 * They are 3 cases to consider inside __split_huge_pmd_locked(): 2315 * 1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious 2316 * 2) __split_huge_zero_page_pmd() read only zero page and any write 2317 * fault will trigger a flush_notify before pointing to a new page 2318 * (it is fine if the secondary mmu keeps pointing to the old zero 2319 * page in the meantime) 2320 * 3) Split a huge pmd into pte pointing to the same page. No need 2321 * to invalidate secondary tlb entry they are all still valid. 2322 * any further changes to individual pte will notify. So no need 2323 * to call mmu_notifier->invalidate_range() 2324 */ 2325 mmu_notifier_invalidate_range_only_end(&range); 2326 } 2327 2328 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, 2329 bool freeze, struct page *page) 2330 { 2331 pgd_t *pgd; 2332 p4d_t *p4d; 2333 pud_t *pud; 2334 pmd_t *pmd; 2335 2336 pgd = pgd_offset(vma->vm_mm, address); 2337 if (!pgd_present(*pgd)) 2338 return; 2339 2340 p4d = p4d_offset(pgd, address); 2341 if (!p4d_present(*p4d)) 2342 return; 2343 2344 pud = pud_offset(p4d, address); 2345 if (!pud_present(*pud)) 2346 return; 2347 2348 pmd = pmd_offset(pud, address); 2349 2350 __split_huge_pmd(vma, pmd, address, freeze, page); 2351 } 2352 2353 void vma_adjust_trans_huge(struct vm_area_struct *vma, 2354 unsigned long start, 2355 unsigned long end, 2356 long adjust_next) 2357 { 2358 /* 2359 * If the new start address isn't hpage aligned and it could 2360 * previously contain an hugepage: check if we need to split 2361 * an huge pmd. 2362 */ 2363 if (start & ~HPAGE_PMD_MASK && 2364 (start & HPAGE_PMD_MASK) >= vma->vm_start && 2365 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 2366 split_huge_pmd_address(vma, start, false, NULL); 2367 2368 /* 2369 * If the new end address isn't hpage aligned and it could 2370 * previously contain an hugepage: check if we need to split 2371 * an huge pmd. 
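 * For example, with a 2MB HPAGE_PMD_SIZE an end of 0x20001000 is not
 * hpage aligned; if [0x20000000, 0x20200000) lies inside the vma, that
 * range may still be mapped by a huge pmd, so split_huge_pmd_address()
 * is called for it below.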
2372 */ 2373 if (end & ~HPAGE_PMD_MASK && 2374 (end & HPAGE_PMD_MASK) >= vma->vm_start && 2375 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 2376 split_huge_pmd_address(vma, end, false, NULL); 2377 2378 /* 2379 * If we're also updating the vma->vm_next->vm_start, if the new 2380 * vm_next->vm_start isn't page aligned and it could previously 2381 * contain an hugepage: check if we need to split an huge pmd. 2382 */ 2383 if (adjust_next > 0) { 2384 struct vm_area_struct *next = vma->vm_next; 2385 unsigned long nstart = next->vm_start; 2386 nstart += adjust_next << PAGE_SHIFT; 2387 if (nstart & ~HPAGE_PMD_MASK && 2388 (nstart & HPAGE_PMD_MASK) >= next->vm_start && 2389 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) 2390 split_huge_pmd_address(next, nstart, false, NULL); 2391 } 2392 } 2393 2394 static void unmap_page(struct page *page) 2395 { 2396 enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | 2397 TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; 2398 bool unmap_success; 2399 2400 VM_BUG_ON_PAGE(!PageHead(page), page); 2401 2402 if (PageAnon(page)) 2403 ttu_flags |= TTU_SPLIT_FREEZE; 2404 2405 unmap_success = try_to_unmap(page, ttu_flags); 2406 VM_BUG_ON_PAGE(!unmap_success, page); 2407 } 2408 2409 static void remap_page(struct page *page) 2410 { 2411 int i; 2412 if (PageTransHuge(page)) { 2413 remove_migration_ptes(page, page, true); 2414 } else { 2415 for (i = 0; i < HPAGE_PMD_NR; i++) 2416 remove_migration_ptes(page + i, page + i, true); 2417 } 2418 } 2419 2420 static void __split_huge_page_tail(struct page *head, int tail, 2421 struct lruvec *lruvec, struct list_head *list) 2422 { 2423 struct page *page_tail = head + tail; 2424 2425 VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); 2426 2427 /* 2428 * Clone page flags before unfreezing refcount. 2429 * 2430 * After successful get_page_unless_zero() might follow flags change, 2431 * for exmaple lock_page() which set PG_waiters. 2432 */ 2433 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 2434 page_tail->flags |= (head->flags & 2435 ((1L << PG_referenced) | 2436 (1L << PG_swapbacked) | 2437 (1L << PG_swapcache) | 2438 (1L << PG_mlocked) | 2439 (1L << PG_uptodate) | 2440 (1L << PG_active) | 2441 (1L << PG_workingset) | 2442 (1L << PG_locked) | 2443 (1L << PG_unevictable) | 2444 (1L << PG_dirty))); 2445 2446 /* ->mapping in first tail page is compound_mapcount */ 2447 VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, 2448 page_tail); 2449 page_tail->mapping = head->mapping; 2450 page_tail->index = head->index + tail; 2451 2452 /* Page flags must be visible before we make the page non-compound. */ 2453 smp_wmb(); 2454 2455 /* 2456 * Clear PageTail before unfreezing page refcount. 2457 * 2458 * After successful get_page_unless_zero() might follow put_page() 2459 * which needs correct compound_head(). 2460 */ 2461 clear_compound_head(page_tail); 2462 2463 /* Finally unfreeze refcount. Additional reference from page cache. */ 2464 page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) || 2465 PageSwapCache(head))); 2466 2467 if (page_is_young(head)) 2468 set_page_young(page_tail); 2469 if (page_is_idle(head)) 2470 set_page_idle(page_tail); 2471 2472 page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); 2473 2474 /* 2475 * always add to the tail because some iterators expect new 2476 * pages to show after the currently processed elements - e.g. 
2477 * migrate_pages 2478 */ 2479 lru_add_page_tail(head, page_tail, lruvec, list); 2480 } 2481 2482 static void __split_huge_page(struct page *page, struct list_head *list, 2483 pgoff_t end, unsigned long flags) 2484 { 2485 struct page *head = compound_head(page); 2486 pg_data_t *pgdat = page_pgdat(head); 2487 struct lruvec *lruvec; 2488 int i; 2489 2490 lruvec = mem_cgroup_page_lruvec(head, pgdat); 2491 2492 /* complete memcg works before add pages to LRU */ 2493 mem_cgroup_split_huge_fixup(head); 2494 2495 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { 2496 __split_huge_page_tail(head, i, lruvec, list); 2497 /* Some pages can be beyond i_size: drop them from page cache */ 2498 if (head[i].index >= end) { 2499 ClearPageDirty(head + i); 2500 __delete_from_page_cache(head + i, NULL); 2501 if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) 2502 shmem_uncharge(head->mapping->host, 1); 2503 put_page(head + i); 2504 } 2505 } 2506 2507 ClearPageCompound(head); 2508 /* See comment in __split_huge_page_tail() */ 2509 if (PageAnon(head)) { 2510 /* Additional pin to swap cache */ 2511 if (PageSwapCache(head)) 2512 page_ref_add(head, 2); 2513 else 2514 page_ref_inc(head); 2515 } else { 2516 /* Additional pin to page cache */ 2517 page_ref_add(head, 2); 2518 xa_unlock(&head->mapping->i_pages); 2519 } 2520 2521 spin_unlock_irqrestore(&pgdat->lru_lock, flags); 2522 2523 remap_page(head); 2524 2525 for (i = 0; i < HPAGE_PMD_NR; i++) { 2526 struct page *subpage = head + i; 2527 if (subpage == page) 2528 continue; 2529 unlock_page(subpage); 2530 2531 /* 2532 * Subpages may be freed if there wasn't any mapping 2533 * like if add_to_swap() is running on a lru page that 2534 * had its mapping zapped. And freeing these pages 2535 * requires taking the lru_lock so we do the put_page 2536 * of the tail pages after the split is complete. 2537 */ 2538 put_page(subpage); 2539 } 2540 } 2541 2542 int total_mapcount(struct page *page) 2543 { 2544 int i, compound, ret; 2545 2546 VM_BUG_ON_PAGE(PageTail(page), page); 2547 2548 if (likely(!PageCompound(page))) 2549 return atomic_read(&page->_mapcount) + 1; 2550 2551 compound = compound_mapcount(page); 2552 if (PageHuge(page)) 2553 return compound; 2554 ret = compound; 2555 for (i = 0; i < HPAGE_PMD_NR; i++) 2556 ret += atomic_read(&page[i]._mapcount) + 1; 2557 /* File pages has compound_mapcount included in _mapcount */ 2558 if (!PageAnon(page)) 2559 return ret - compound * HPAGE_PMD_NR; 2560 if (PageDoubleMap(page)) 2561 ret -= HPAGE_PMD_NR; 2562 return ret; 2563 } 2564 2565 /* 2566 * This calculates accurately how many mappings a transparent hugepage 2567 * has (unlike page_mapcount() which isn't fully accurate). This full 2568 * accuracy is primarily needed to know if copy-on-write faults can 2569 * reuse the page and change the mapping to read-write instead of 2570 * copying them. At the same time this returns the total_mapcount too. 2571 * 2572 * The function returns the highest mapcount any one of the subpages 2573 * has. If the return value is one, even if different processes are 2574 * mapping different subpages of the transparent hugepage, they can 2575 * all reuse it, because each process is reusing a different subpage. 2576 * 2577 * The total_mapcount is instead counting all virtual mappings of the 2578 * subpages. 
If the total_mapcount is equal to "one", it tells the 2579 * caller all mappings belong to the same "mm" and in turn the 2580 * anon_vma of the transparent hugepage can become the vma->anon_vma 2581 * local one as no other process may be mapping any of the subpages. 2582 * 2583 * It would be more accurate to replace page_mapcount() with 2584 * page_trans_huge_mapcount(), however we only use 2585 * page_trans_huge_mapcount() in the copy-on-write faults where we 2586 * need full accuracy to avoid breaking page pinning, because 2587 * page_trans_huge_mapcount() is slower than page_mapcount(). 2588 */ 2589 int page_trans_huge_mapcount(struct page *page, int *total_mapcount) 2590 { 2591 int i, ret, _total_mapcount, mapcount; 2592 2593 /* hugetlbfs shouldn't call it */ 2594 VM_BUG_ON_PAGE(PageHuge(page), page); 2595 2596 if (likely(!PageTransCompound(page))) { 2597 mapcount = atomic_read(&page->_mapcount) + 1; 2598 if (total_mapcount) 2599 *total_mapcount = mapcount; 2600 return mapcount; 2601 } 2602 2603 page = compound_head(page); 2604 2605 _total_mapcount = ret = 0; 2606 for (i = 0; i < HPAGE_PMD_NR; i++) { 2607 mapcount = atomic_read(&page[i]._mapcount) + 1; 2608 ret = max(ret, mapcount); 2609 _total_mapcount += mapcount; 2610 } 2611 if (PageDoubleMap(page)) { 2612 ret -= 1; 2613 _total_mapcount -= HPAGE_PMD_NR; 2614 } 2615 mapcount = compound_mapcount(page); 2616 ret += mapcount; 2617 _total_mapcount += mapcount; 2618 if (total_mapcount) 2619 *total_mapcount = _total_mapcount; 2620 return ret; 2621 } 2622 2623 /* Racy check whether the huge page can be split */ 2624 bool can_split_huge_page(struct page *page, int *pextra_pins) 2625 { 2626 int extra_pins; 2627 2628 /* Additional pins from page cache */ 2629 if (PageAnon(page)) 2630 extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; 2631 else 2632 extra_pins = HPAGE_PMD_NR; 2633 if (pextra_pins) 2634 *pextra_pins = extra_pins; 2635 return total_mapcount(page) == page_count(page) - extra_pins - 1; 2636 } 2637 2638 /* 2639 * This function splits huge page into normal pages. @page can point to any 2640 * subpage of huge page to split. Split doesn't change the position of @page. 2641 * 2642 * Only caller must hold pin on the @page, otherwise split fails with -EBUSY. 2643 * The huge page must be locked. 2644 * 2645 * If @list is null, tail pages will be added to LRU list, otherwise, to @list. 2646 * 2647 * Both head page and tail pages will inherit mapping, flags, and so on from 2648 * the hugepage. 2649 * 2650 * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if 2651 * they are not mapped. 2652 * 2653 * Returns 0 if the hugepage is split successfully. 2654 * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under 2655 * us. 
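 * Most callers use the split_huge_page() wrapper, which simply passes a
 * NULL @list; deferred_split_scan() and the debugfs split_huge_pages_set()
 * below are examples: they lock the page, call split_huge_page() and
 * unlock/put it afterwards.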
2656 */ 2657 int split_huge_page_to_list(struct page *page, struct list_head *list) 2658 { 2659 struct page *head = compound_head(page); 2660 struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); 2661 struct anon_vma *anon_vma = NULL; 2662 struct address_space *mapping = NULL; 2663 int count, mapcount, extra_pins, ret; 2664 bool mlocked; 2665 unsigned long flags; 2666 pgoff_t end; 2667 2668 VM_BUG_ON_PAGE(is_huge_zero_page(page), page); 2669 VM_BUG_ON_PAGE(!PageLocked(page), page); 2670 VM_BUG_ON_PAGE(!PageCompound(page), page); 2671 2672 if (PageWriteback(page)) 2673 return -EBUSY; 2674 2675 if (PageAnon(head)) { 2676 /* 2677 * The caller does not necessarily hold an mmap_sem that would 2678 * prevent the anon_vma disappearing so we first we take a 2679 * reference to it and then lock the anon_vma for write. This 2680 * is similar to page_lock_anon_vma_read except the write lock 2681 * is taken to serialise against parallel split or collapse 2682 * operations. 2683 */ 2684 anon_vma = page_get_anon_vma(head); 2685 if (!anon_vma) { 2686 ret = -EBUSY; 2687 goto out; 2688 } 2689 end = -1; 2690 mapping = NULL; 2691 anon_vma_lock_write(anon_vma); 2692 } else { 2693 mapping = head->mapping; 2694 2695 /* Truncated ? */ 2696 if (!mapping) { 2697 ret = -EBUSY; 2698 goto out; 2699 } 2700 2701 anon_vma = NULL; 2702 i_mmap_lock_read(mapping); 2703 2704 /* 2705 *__split_huge_page() may need to trim off pages beyond EOF: 2706 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, 2707 * which cannot be nested inside the page tree lock. So note 2708 * end now: i_size itself may be changed at any moment, but 2709 * head page lock is good enough to serialize the trimming. 2710 */ 2711 end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); 2712 } 2713 2714 /* 2715 * Racy check if we can split the page, before unmap_page() will 2716 * split PMDs 2717 */ 2718 if (!can_split_huge_page(head, &extra_pins)) { 2719 ret = -EBUSY; 2720 goto out_unlock; 2721 } 2722 2723 mlocked = PageMlocked(page); 2724 unmap_page(head); 2725 VM_BUG_ON_PAGE(compound_mapcount(head), head); 2726 2727 /* Make sure the page is not on per-CPU pagevec as it takes pin */ 2728 if (mlocked) 2729 lru_add_drain(); 2730 2731 /* prevent PageLRU to go away from under us, and freeze lru stats */ 2732 spin_lock_irqsave(&pgdata->lru_lock, flags); 2733 2734 if (mapping) { 2735 XA_STATE(xas, &mapping->i_pages, page_index(head)); 2736 2737 /* 2738 * Check if the head page is present in page cache. 2739 * We assume all tail are present too, if head is there. 
2740 */ 2741 xa_lock(&mapping->i_pages); 2742 if (xas_load(&xas) != head) 2743 goto fail; 2744 } 2745 2746 /* Prevent deferred_split_scan() touching ->_refcount */ 2747 spin_lock(&pgdata->split_queue_lock); 2748 count = page_count(head); 2749 mapcount = total_mapcount(head); 2750 if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { 2751 if (!list_empty(page_deferred_list(head))) { 2752 pgdata->split_queue_len--; 2753 list_del(page_deferred_list(head)); 2754 } 2755 if (mapping) 2756 __dec_node_page_state(page, NR_SHMEM_THPS); 2757 spin_unlock(&pgdata->split_queue_lock); 2758 __split_huge_page(page, list, end, flags); 2759 if (PageSwapCache(head)) { 2760 swp_entry_t entry = { .val = page_private(head) }; 2761 2762 ret = split_swap_cluster(entry); 2763 } else 2764 ret = 0; 2765 } else { 2766 if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { 2767 pr_alert("total_mapcount: %u, page_count(): %u\n", 2768 mapcount, count); 2769 if (PageTail(page)) 2770 dump_page(head, NULL); 2771 dump_page(page, "total_mapcount(head) > 0"); 2772 BUG(); 2773 } 2774 spin_unlock(&pgdata->split_queue_lock); 2775 fail: if (mapping) 2776 xa_unlock(&mapping->i_pages); 2777 spin_unlock_irqrestore(&pgdata->lru_lock, flags); 2778 remap_page(head); 2779 ret = -EBUSY; 2780 } 2781 2782 out_unlock: 2783 if (anon_vma) { 2784 anon_vma_unlock_write(anon_vma); 2785 put_anon_vma(anon_vma); 2786 } 2787 if (mapping) 2788 i_mmap_unlock_read(mapping); 2789 out: 2790 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); 2791 return ret; 2792 } 2793 2794 void free_transhuge_page(struct page *page) 2795 { 2796 struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); 2797 unsigned long flags; 2798 2799 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 2800 if (!list_empty(page_deferred_list(page))) { 2801 pgdata->split_queue_len--; 2802 list_del(page_deferred_list(page)); 2803 } 2804 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 2805 free_compound_page(page); 2806 } 2807 2808 void deferred_split_huge_page(struct page *page) 2809 { 2810 struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); 2811 unsigned long flags; 2812 2813 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 2814 2815 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 2816 if (list_empty(page_deferred_list(page))) { 2817 count_vm_event(THP_DEFERRED_SPLIT_PAGE); 2818 list_add_tail(page_deferred_list(page), &pgdata->split_queue); 2819 pgdata->split_queue_len++; 2820 } 2821 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 2822 } 2823 2824 static unsigned long deferred_split_count(struct shrinker *shrink, 2825 struct shrink_control *sc) 2826 { 2827 struct pglist_data *pgdata = NODE_DATA(sc->nid); 2828 return READ_ONCE(pgdata->split_queue_len); 2829 } 2830 2831 static unsigned long deferred_split_scan(struct shrinker *shrink, 2832 struct shrink_control *sc) 2833 { 2834 struct pglist_data *pgdata = NODE_DATA(sc->nid); 2835 unsigned long flags; 2836 LIST_HEAD(list), *pos, *next; 2837 struct page *page; 2838 int split = 0; 2839 2840 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 2841 /* Take pin on all head pages to avoid freeing them under us */ 2842 list_for_each_safe(pos, next, &pgdata->split_queue) { 2843 page = list_entry((void *)pos, struct page, mapping); 2844 page = compound_head(page); 2845 if (get_page_unless_zero(page)) { 2846 list_move(page_deferred_list(page), &list); 2847 } else { 2848 /* We lost race with put_compound_page() */ 2849 list_del_init(page_deferred_list(page)); 2850 pgdata->split_queue_len--; 2851 } 2852 if 
(!--sc->nr_to_scan) 2853 break; 2854 } 2855 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 2856 2857 list_for_each_safe(pos, next, &list) { 2858 page = list_entry((void *)pos, struct page, mapping); 2859 if (!trylock_page(page)) 2860 goto next; 2861 /* split_huge_page() removes page from list on success */ 2862 if (!split_huge_page(page)) 2863 split++; 2864 unlock_page(page); 2865 next: 2866 put_page(page); 2867 } 2868 2869 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 2870 list_splice_tail(&list, &pgdata->split_queue); 2871 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 2872 2873 /* 2874 * Stop shrinker if we didn't split any page, but the queue is empty. 2875 * This can happen if pages were freed under us. 2876 */ 2877 if (!split && list_empty(&pgdata->split_queue)) 2878 return SHRINK_STOP; 2879 return split; 2880 } 2881 2882 static struct shrinker deferred_split_shrinker = { 2883 .count_objects = deferred_split_count, 2884 .scan_objects = deferred_split_scan, 2885 .seeks = DEFAULT_SEEKS, 2886 .flags = SHRINKER_NUMA_AWARE, 2887 }; 2888 2889 #ifdef CONFIG_DEBUG_FS 2890 static int split_huge_pages_set(void *data, u64 val) 2891 { 2892 struct zone *zone; 2893 struct page *page; 2894 unsigned long pfn, max_zone_pfn; 2895 unsigned long total = 0, split = 0; 2896 2897 if (val != 1) 2898 return -EINVAL; 2899 2900 for_each_populated_zone(zone) { 2901 max_zone_pfn = zone_end_pfn(zone); 2902 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { 2903 if (!pfn_valid(pfn)) 2904 continue; 2905 2906 page = pfn_to_page(pfn); 2907 if (!get_page_unless_zero(page)) 2908 continue; 2909 2910 if (zone != page_zone(page)) 2911 goto next; 2912 2913 if (!PageHead(page) || PageHuge(page) || !PageLRU(page)) 2914 goto next; 2915 2916 total++; 2917 lock_page(page); 2918 if (!split_huge_page(page)) 2919 split++; 2920 unlock_page(page); 2921 next: 2922 put_page(page); 2923 } 2924 } 2925 2926 pr_info("%lu of %lu THP split\n", split, total); 2927 2928 return 0; 2929 } 2930 DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, 2931 "%llu\n"); 2932 2933 static int __init split_huge_pages_debugfs(void) 2934 { 2935 debugfs_create_file("split_huge_pages", 0200, NULL, NULL, 2936 &split_huge_pages_fops); 2937 return 0; 2938 } 2939 late_initcall(split_huge_pages_debugfs); 2940 #endif 2941 2942 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 2943 void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, 2944 struct page *page) 2945 { 2946 struct vm_area_struct *vma = pvmw->vma; 2947 struct mm_struct *mm = vma->vm_mm; 2948 unsigned long address = pvmw->address; 2949 pmd_t pmdval; 2950 swp_entry_t entry; 2951 pmd_t pmdswp; 2952 2953 if (!(pvmw->pmd && !pvmw->pte)) 2954 return; 2955 2956 flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); 2957 pmdval = *pvmw->pmd; 2958 pmdp_invalidate(vma, address, pvmw->pmd); 2959 if (pmd_dirty(pmdval)) 2960 set_page_dirty(page); 2961 entry = make_migration_entry(page, pmd_write(pmdval)); 2962 pmdswp = swp_entry_to_pmd(entry); 2963 if (pmd_soft_dirty(pmdval)) 2964 pmdswp = pmd_swp_mksoft_dirty(pmdswp); 2965 set_pmd_at(mm, address, pvmw->pmd, pmdswp); 2966 page_remove_rmap(page, true); 2967 put_page(page); 2968 } 2969 2970 void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) 2971 { 2972 struct vm_area_struct *vma = pvmw->vma; 2973 struct mm_struct *mm = vma->vm_mm; 2974 unsigned long address = pvmw->address; 2975 unsigned long mmun_start = address & HPAGE_PMD_MASK; 2976 pmd_t pmde; 2977 swp_entry_t entry; 2978 
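/*
 * Rebuild a present huge pmd for the new page from the pmd migration
 * entry: restore write access and soft-dirty from the entry, take a
 * reference and the rmap on the new page, install the pmd and update
 * the MMU cache.
 */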
2979 if (!(pvmw->pmd && !pvmw->pte)) 2980 return; 2981 2982 entry = pmd_to_swp_entry(*pvmw->pmd); 2983 get_page(new); 2984 pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot)); 2985 if (pmd_swp_soft_dirty(*pvmw->pmd)) 2986 pmde = pmd_mksoft_dirty(pmde); 2987 if (is_write_migration_entry(entry)) 2988 pmde = maybe_pmd_mkwrite(pmde, vma); 2989 2990 flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE); 2991 if (PageAnon(new)) 2992 page_add_anon_rmap(new, vma, mmun_start, true); 2993 else 2994 page_add_file_rmap(new, true); 2995 set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); 2996 if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new)) 2997 mlock_vma_page(new); 2998 update_mmu_cache_pmd(vma, address, pvmw->pmd); 2999 } 3000 #endif 3001
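/*
 * A minimal userspace sketch of the interfaces implemented above (an
 * illustrative example only, assuming a 2MB HPAGE_PMD_SIZE and a libc
 * that exposes MADV_HUGEPAGE and MADV_FREE):
 *
 *	#include <stdlib.h>
 *	#include <string.h>
 *	#include <sys/mman.h>
 *
 *	int main(void)
 *	{
 *		size_t len = 2UL << 20;
 *		void *buf;
 *
 *		if (posix_memalign(&buf, len, len))
 *			return 1;
 *		madvise(buf, len, MADV_HUGEPAGE);	// hint: back with a THP
 *		memset(buf, 1, len);			// fault the range in
 *		madvise(buf, len, MADV_FREE);		// handled by madvise_free_huge_pmd()
 *						//   above when a THP maps the range
 *		return 0;
 *	}
 *
 * Writing "1" to /sys/kernel/debug/split_huge_pages (created by
 * split_huge_pages_debugfs() above) asks the kernel to split the THPs it
 * can find.
 */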