/*
 * Copyright (C) 2009  Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/dax.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"

enum scan_result {
	SCAN_FAIL,
	SCAN_SUCCEED,
	SCAN_PMD_NULL,
	SCAN_EXCEED_NONE_PTE,
	SCAN_PTE_NON_PRESENT,
	SCAN_PAGE_RO,
	SCAN_NO_REFERENCED_PAGE,
	SCAN_PAGE_NULL,
	SCAN_SCAN_ABORT,
	SCAN_PAGE_COUNT,
	SCAN_PAGE_LRU,
	SCAN_PAGE_LOCK,
	SCAN_PAGE_ANON,
	SCAN_PAGE_COMPOUND,
	SCAN_ANY_PROCESS,
	SCAN_VMA_NULL,
	SCAN_VMA_CHECK,
	SCAN_ADDRESS_RANGE,
	SCAN_SWAP_CACHE_PAGE,
	SCAN_DEL_PAGE_LRU,
	SCAN_ALLOC_HUGE_PAGE_FAIL,
	SCAN_CGROUP_CHARGE_FAIL
};

#define CREATE_TRACE_POINTS
#include <trace/events/huge_memory.h>

/*
 * By default, transparent hugepage support is disabled so that we do not
 * risk increasing the memory footprint of applications without a guaranteed
 * benefit. When transparent hugepage support is enabled it applies to all
 * mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

/* default: scan 8*512 ptes (or vmas) every 30 seconds */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static unsigned long khugepaged_sleep_expire;
static struct task_struct *khugepaged_thread __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
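/*
 * The khugepaged tunables above are exported through sysfs (see the
 * "khugepaged" attribute group defined later in this file), i.e. under
 * /sys/kernel/mm/transparent_hugepage/khugepaged/.  A purely illustrative
 * example of adjusting them from userspace (the values are examples only):
 *
 *	echo 4096  > /sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan
 *	echo 10000 > /sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
 */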
/*
 * By default, collapse a hugepage if at least one pte is mapped, just as
 * would have happened if the vma had been large enough during the page
 * fault.
 */
static unsigned int khugepaged_max_ptes_none __read_mostly;

static int khugepaged(void *none);
static int khugepaged_slab_init(void);
static void khugepaged_slab_exit(void);

#define MM_SLOTS_HASH_BITS 10
static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct kmem_cache *mm_slot_cache __read_mostly;

/**
 * struct mm_slot - hash lookup from mm to mm_slot
 * @hash: hash collision list
 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node hash;
	struct list_head mm_node;
	struct mm_struct *mm;
};

/**
 * struct khugepaged_scan - cursor for scanning
 * @mm_head: the head of the mm list to scan
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 *
 * There is only the one khugepaged_scan instance of this cursor structure.
 */
struct khugepaged_scan {
	struct list_head mm_head;
	struct mm_slot *mm_slot;
	unsigned long address;
};
static struct khugepaged_scan khugepaged_scan = {
	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};

static struct shrinker deferred_split_shrinker;
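/*
 * For illustration of the calculation below (an assumed example, not a
 * statement about any particular machine): with 4 populated zones and
 * pageblock_nr_pages == 512, the base reservation is 512 * 4 * 2 = 4096
 * pages, plus 512 * 4 * 3 * 3 = 18432 pages of fallback headroom
 * (MIGRATE_PCPTYPES == 3), before being capped at 5% of lowmem and
 * converted to kbytes.
 */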
static void set_recommended_min_free_kbytes(void)
{
	struct zone *zone;
	int nr_zones = 0;
	unsigned long recommended_min;

	for_each_populated_zone(zone)
		nr_zones++;

	/* Ensure 2 pageblocks are free to assist fragmentation avoidance */
	recommended_min = pageblock_nr_pages * nr_zones * 2;

	/*
	 * Make sure that on average at least two pageblocks are almost free
	 * of another type, one for a migratetype to fall back to and a
	 * second to avoid subsequent fallbacks of other types. There are 3
	 * MIGRATE_TYPES we care about.
	 */
	recommended_min += pageblock_nr_pages * nr_zones *
			   MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;

	/* don't ever allow to reserve more than 5% of the lowmem */
	recommended_min = min(recommended_min,
			      (unsigned long) nr_free_buffer_pages() / 20);
	recommended_min <<= (PAGE_SHIFT-10);

	if (recommended_min > min_free_kbytes) {
		if (user_min_free_kbytes >= 0)
			pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
				min_free_kbytes, recommended_min);

		min_free_kbytes = recommended_min;
	}
	setup_per_zone_wmarks();
}

static int start_stop_khugepaged(void)
{
	int err = 0;
	if (khugepaged_enabled()) {
		if (!khugepaged_thread)
			khugepaged_thread = kthread_run(khugepaged, NULL,
							"khugepaged");
		if (IS_ERR(khugepaged_thread)) {
			pr_err("khugepaged: kthread_run(khugepaged) failed\n");
			err = PTR_ERR(khugepaged_thread);
			khugepaged_thread = NULL;
			goto fail;
		}

		if (!list_empty(&khugepaged_scan.mm_head))
			wake_up_interruptible(&khugepaged_wait);

		set_recommended_min_free_kbytes();
	} else if (khugepaged_thread) {
		kthread_stop(khugepaged_thread);
		khugepaged_thread = NULL;
	}
fail:
	return err;
}

static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;

struct page *get_huge_zero_page(void)
{
	struct page *zero_page;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return READ_ONCE(huge_zero_page);

	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_page) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return NULL;
	}
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	preempt_disable();
	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
		preempt_enable();
		__free_pages(zero_page, compound_order(zero_page));
		goto retry;
	}

	/* We take additional reference here. It will be put back by shrinker */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	return READ_ONCE(huge_zero_page);
}

void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}
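/*
 * Shrinker interface for the huge zero page: the refcounting above keeps one
 * extra reference owned by the shrinker, so a count of exactly 1 means no
 * pmd maps the zero page any more and it can be freed.  The count/scan
 * callbacks below implement that policy.
 */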
static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct page *zero_page = xchg(&huge_zero_page, NULL);
		BUG_ON(zero_page == NULL);
		__free_pages(zero_page, compound_order(zero_page));
		return HPAGE_PMD_NR;
	}

	return 0;
}

static struct shrinker huge_zero_page_shrinker = {
	.count_objects = shrink_huge_zero_page_count,
	.scan_objects = shrink_huge_zero_page_scan,
	.seeks = DEFAULT_SEEKS,
};

#ifdef CONFIG_SYSFS

static ssize_t triple_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag enabled,
				 enum transparent_hugepage_flag deferred,
				 enum transparent_hugepage_flag req_madv)
{
	if (!memcmp("defer", buf,
		    min(sizeof("defer")-1, count))) {
		if (enabled == deferred)
			return -EINVAL;
		clear_bit(enabled, &transparent_hugepage_flags);
		clear_bit(req_madv, &transparent_hugepage_flags);
		set_bit(deferred, &transparent_hugepage_flags);
	} else if (!memcmp("always", buf,
		    min(sizeof("always")-1, count))) {
		clear_bit(deferred, &transparent_hugepage_flags);
		clear_bit(req_madv, &transparent_hugepage_flags);
		set_bit(enabled, &transparent_hugepage_flags);
	} else if (!memcmp("madvise", buf,
			   min(sizeof("madvise")-1, count))) {
		clear_bit(enabled, &transparent_hugepage_flags);
		clear_bit(deferred, &transparent_hugepage_flags);
		set_bit(req_madv, &transparent_hugepage_flags);
	} else if (!memcmp("never", buf,
			   min(sizeof("never")-1, count))) {
		clear_bit(enabled, &transparent_hugepage_flags);
		clear_bit(req_madv, &transparent_hugepage_flags);
		clear_bit(deferred, &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}

static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "[always] madvise never\n");
	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always [madvise] never\n");
	else
		return sprintf(buf, "always madvise [never]\n");
}

static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret;

	ret = triple_flag_store(kobj, attr, buf, count,
				TRANSPARENT_HUGEPAGE_FLAG,
				TRANSPARENT_HUGEPAGE_FLAG,
				TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);

	if (ret > 0) {
		int err;

		mutex_lock(&khugepaged_mutex);
		err = start_stop_khugepaged();
		mutex_unlock(&khugepaged_mutex);

		if (err)
			ret = err;
	}

	return ret;
}
static struct kobj_attribute enabled_attr =
	__ATTR(enabled, 0644, enabled_show, enabled_store);

static ssize_t single_flag_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf,
				enum transparent_hugepage_flag flag)
{
	return sprintf(buf, "%d\n",
		       !!test_bit(flag, &transparent_hugepage_flags));
}

static ssize_t single_flag_store(struct kobject *kobj,
				 struct kobj_attribute *attr,
				 const char *buf, size_t count,
				 enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}
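/*
 * Illustrative userspace view of the interface backed by the helpers above
 * (the bracketed entry marks the current policy; values shown are only an
 * example):
 *
 *	$ cat /sys/kernel/mm/transparent_hugepage/enabled
 *	always [madvise] never
 *	$ echo never > /sys/kernel/mm/transparent_hugepage/enabled
 */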
/*
 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
 * __GFP_REPEAT is too aggressive; it's never worth swapping tons of
 * memory just to allocate one more hugepage.
 */
static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "[always] defer madvise never\n");
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always [defer] madvise never\n");
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return sprintf(buf, "always defer [madvise] never\n");
	else
		return sprintf(buf, "always defer madvise [never]\n");

}
static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	return triple_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
				 TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
				 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
}
static struct kobj_attribute defrag_attr =
	__ATTR(defrag, 0644, defrag_show, defrag_store);

static ssize_t use_zero_page_show(struct kobject *kobj,
		struct kobj_attribute *attr, char *buf)
{
	return single_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	return single_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr =
	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
#ifdef CONFIG_DEBUG_VM
static ssize_t debug_cow_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	return single_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
}
static ssize_t debug_cow_store(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       const char *buf, size_t count)
{
	return single_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
}
static struct kobj_attribute debug_cow_attr =
	__ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
#endif /* CONFIG_DEBUG_VM */

static struct attribute *hugepage_attr[] = {
	&enabled_attr.attr,
	&defrag_attr.attr,
	&use_zero_page_attr.attr,
#ifdef CONFIG_DEBUG_VM
	&debug_cow_attr.attr,
#endif
	NULL,
};

static struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
};

static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
}

static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	khugepaged_scan_sleep_millisecs = msecs;
	khugepaged_sleep_expire = 0;
	wake_up_interruptible(&khugepaged_wait);

	return count;
}
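/*
 * Note on the store helpers above and below: they reset
 * khugepaged_sleep_expire to 0 and wake khugepaged_wait so that a newly
 * written sleep interval takes effect promptly rather than only after the
 * previously computed expiry has passed.
 */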
static struct kobj_attribute scan_sleep_millisecs_attr =
	__ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
	       scan_sleep_millisecs_store);

static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
}

static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	khugepaged_alloc_sleep_millisecs = msecs;
	khugepaged_sleep_expire = 0;
	wake_up_interruptible(&khugepaged_wait);

	return count;
}
static struct kobj_attribute alloc_sleep_millisecs_attr =
	__ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
	       alloc_sleep_millisecs_store);

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
}
static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long pages;

	err = kstrtoul(buf, 10, &pages);
	if (err || !pages || pages > UINT_MAX)
		return -EINVAL;

	khugepaged_pages_to_scan = pages;

	return count;
}
static struct kobj_attribute pages_to_scan_attr =
	__ATTR(pages_to_scan, 0644, pages_to_scan_show,
	       pages_to_scan_store);

static ssize_t pages_collapsed_show(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
}
static struct kobj_attribute pages_collapsed_attr =
	__ATTR_RO(pages_collapsed);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr,
			       char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_full_scans);
}
static struct kobj_attribute full_scans_attr =
	__ATTR_RO(full_scans);

static ssize_t khugepaged_defrag_show(struct kobject *kobj,
				      struct kobj_attribute *attr, char *buf)
{
	return single_flag_show(kobj, attr, buf,
				TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static ssize_t khugepaged_defrag_store(struct kobject *kobj,
				       struct kobj_attribute *attr,
				       const char *buf, size_t count)
{
	return single_flag_store(kobj, attr, buf, count,
				 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static struct kobj_attribute khugepaged_defrag_attr =
	__ATTR(defrag, 0644, khugepaged_defrag_show,
	       khugepaged_defrag_store);
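/*
 * Worked example for the knob documented below (assuming 4K base pages and
 * 2M PMD hugepages, so HPAGE_PMD_NR == 512): with the default max_ptes_none
 * of HPAGE_PMD_NR - 1 == 511, a single collapse may populate up to 511
 * previously unmapped ptes, i.e. add just under 2MB of RSS for that range;
 * with max_ptes_none == 0 only fully mapped ranges are collapsed and no
 * extra memory is committed.
 */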
/*
 * max_ptes_none controls whether khugepaged should collapse hugepages over
 * any unmapped ptes, in turn potentially increasing the memory
 * footprint of the vmas. When max_ptes_none is 0, khugepaged will not
 * reduce the available free memory in the system as it
 * runs. Increasing max_ptes_none will instead potentially reduce the
 * free memory in the system during the khugepaged scan.
 */
static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
					     struct kobj_attribute *attr,
					     char *buf)
{
	return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
}
static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
					      struct kobj_attribute *attr,
					      const char *buf, size_t count)
{
	int err;
	unsigned long max_ptes_none;

	err = kstrtoul(buf, 10, &max_ptes_none);
	if (err || max_ptes_none > HPAGE_PMD_NR-1)
		return -EINVAL;

	khugepaged_max_ptes_none = max_ptes_none;

	return count;
}
static struct kobj_attribute khugepaged_max_ptes_none_attr =
	__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
	       khugepaged_max_ptes_none_store);

static struct attribute *khugepaged_attr[] = {
	&khugepaged_defrag_attr.attr,
	&khugepaged_max_ptes_none_attr.attr,
	&pages_to_scan_attr.attr,
	&pages_collapsed_attr.attr,
	&full_scans_attr.attr,
	&scan_sleep_millisecs_attr.attr,
	&alloc_sleep_millisecs_attr.attr,
	NULL,
};

static struct attribute_group khugepaged_attr_group = {
	.attrs = khugepaged_attr,
	.name = "khugepaged",
};

static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	int err;

	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!*hugepage_kobj)) {
		pr_err("failed to create transparent hugepage kobject\n");
		return -ENOMEM;
	}

	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto delete_obj;
	}

	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto remove_hp_group;
	}

	return 0;

remove_hp_group:
	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
	kobject_put(*hugepage_kobj);
	return err;
}

static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
	kobject_put(hugepage_kobj);
}
#else
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

static int __init hugepage_init(void)
{
	int err;
	struct kobject *hugepage_kobj;

	if (!has_transparent_hugepage()) {
		transparent_hugepage_flags = 0;
		return -EINVAL;
	}

	khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
	khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
	/*
	 * hugepages can't be allocated by the buddy allocator
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
	/*
	 * we use page->mapping and page->index in second tail page
	 * as list_head: assuming THP order >= 2
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);

	err = hugepage_init_sysfs(&hugepage_kobj);
	if (err)
		goto err_sysfs;

	err = khugepaged_slab_init();
	if (err)
		goto err_slab;

	err = register_shrinker(&huge_zero_page_shrinker);
	if (err)
		goto err_hzp_shrinker;
	err = register_shrinker(&deferred_split_shrinker);
	if (err)
		goto err_split_shrinker;

	/*
	 * By default disable transparent hugepages on smaller systems,
	 * where the extra memory used could hurt more than TLB overhead
	 * is likely to save. The admin can still enable it through /sys.
	 */
	if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
		transparent_hugepage_flags = 0;
		return 0;
	}

	err = start_stop_khugepaged();
	if (err)
		goto err_khugepaged;

	return 0;
err_khugepaged:
	unregister_shrinker(&deferred_split_shrinker);
err_split_shrinker:
	unregister_shrinker(&huge_zero_page_shrinker);
err_hzp_shrinker:
	khugepaged_slab_exit();
err_slab:
	hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
	return err;
}
subsys_initcall(hugepage_init);

static int __init setup_transparent_hugepage(char *str)
{
	int ret = 0;
	if (!str)
		goto out;
	if (!strcmp(str, "always")) {
		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
			&transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			&transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("transparent_hugepage= cannot parse, ignored\n");
	return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);

pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd);
	return pmd;
}

static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
{
	return pmd_mkhuge(mk_pmd(page, prot));
}

static inline struct list_head *page_deferred_list(struct page *page)
{
	/*
	 * ->lru in the tail pages is occupied by compound_head.
	 * Let's use ->mapping + ->index in the second tail page as list_head.
	 */
	return (struct list_head *)&page[2].mapping;
}

void prep_transhuge_page(struct page *page)
{
	/*
	 * we use page->mapping and page->index in second tail page
	 * as list_head: assuming THP order >= 2
	 */

	INIT_LIST_HEAD(page_deferred_list(page));
	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}

static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
					struct vm_area_struct *vma,
					unsigned long address, pmd_t *pmd,
					struct page *page, gfp_t gfp,
					unsigned int flags)
{
	struct mem_cgroup *memcg;
	pgtable_t pgtable;
	spinlock_t *ptl;
	unsigned long haddr = address & HPAGE_PMD_MASK;

	VM_BUG_ON_PAGE(!PageCompound(page), page);

	if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) {
		put_page(page);
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}

	pgtable = pte_alloc_one(mm, haddr);
	if (unlikely(!pgtable)) {
		mem_cgroup_cancel_charge(page, memcg, true);
		put_page(page);
		return VM_FAULT_OOM;
	}

	clear_huge_page(page, haddr, HPAGE_PMD_NR);
	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * clear_huge_page writes become visible before the set_pmd_at()
	 * write.
	 */
	__SetPageUptodate(page);

	ptl = pmd_lock(mm, pmd);
	if (unlikely(!pmd_none(*pmd))) {
		spin_unlock(ptl);
		mem_cgroup_cancel_charge(page, memcg, true);
		put_page(page);
		pte_free(mm, pgtable);
	} else {
		pmd_t entry;

		/* Deliver the page fault to userland */
		if (userfaultfd_missing(vma)) {
			int ret;

			spin_unlock(ptl);
			mem_cgroup_cancel_charge(page, memcg, true);
			put_page(page);
			pte_free(mm, pgtable);
			ret = handle_userfault(vma, address, flags,
					       VM_UFFD_MISSING);
			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			return ret;
		}

		entry = mk_huge_pmd(page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		page_add_new_anon_rmap(page, vma, haddr, true);
		mem_cgroup_commit_charge(page, memcg, false, true);
		lru_cache_add_active_or_unevictable(page, vma);
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
		set_pmd_at(mm, haddr, pmd, entry);
		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
		atomic_long_inc(&mm->nr_ptes);
		spin_unlock(ptl);
		count_vm_event(THP_FAULT_ALLOC);
	}

	return 0;
}

/*
 * If THP is set to always then directly reclaim/compact as necessary
 * If set to defer then do no reclaim and defer to khugepaged
 * If set to madvise and the VMA is flagged then directly reclaim/compact
 */
static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
{
	gfp_t reclaim_flags = 0;

	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags) &&
	    (vma->vm_flags & VM_HUGEPAGE))
		reclaim_flags = __GFP_DIRECT_RECLAIM;
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		reclaim_flags = __GFP_KSWAPD_RECLAIM;
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		reclaim_flags = __GFP_DIRECT_RECLAIM;

	return GFP_TRANSHUGE | reclaim_flags;
}

/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
{
	return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0);
}
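/*
 * Quick reference, derived from the flag tests above: the sysfs "defrag"
 * setting maps to the fault-time gfp mask roughly as
 *	always  -> GFP_TRANSHUGE | __GFP_DIRECT_RECLAIM
 *	defer   -> GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM
 *	madvise -> direct reclaim only for VM_HUGEPAGE vmas
 *	never   -> GFP_TRANSHUGE with no extra reclaim flags
 * while khugepaged's own allocations honour its separate khugepaged/defrag
 * knob via alloc_hugepage_khugepaged_gfpmask().
 */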
/* Caller must hold page table lock. */
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
		struct page *zero_page)
{
	pmd_t entry;
	if (!pmd_none(*pmd))
		return false;
	entry = mk_pmd(zero_page, vma->vm_page_prot);
	entry = pmd_mkhuge(entry);
	if (pgtable)
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, haddr, pmd, entry);
	atomic_long_inc(&mm->nr_ptes);
	return true;
}

int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
			       unsigned long address, pmd_t *pmd,
			       unsigned int flags)
{
	gfp_t gfp;
	struct page *page;
	unsigned long haddr = address & HPAGE_PMD_MASK;

	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
		return VM_FAULT_FALLBACK;
	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;
	if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
		return VM_FAULT_OOM;
	if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) &&
			transparent_hugepage_use_zero_page()) {
		spinlock_t *ptl;
		pgtable_t pgtable;
		struct page *zero_page;
		bool set;
		int ret;
		pgtable = pte_alloc_one(mm, haddr);
		if (unlikely(!pgtable))
			return VM_FAULT_OOM;
		zero_page = get_huge_zero_page();
		if (unlikely(!zero_page)) {
			pte_free(mm, pgtable);
			count_vm_event(THP_FAULT_FALLBACK);
			return VM_FAULT_FALLBACK;
		}
		ptl = pmd_lock(mm, pmd);
		ret = 0;
		set = false;
		if (pmd_none(*pmd)) {
			if (userfaultfd_missing(vma)) {
				spin_unlock(ptl);
				ret = handle_userfault(vma, address, flags,
						       VM_UFFD_MISSING);
				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			} else {
				set_huge_zero_page(pgtable, mm, vma,
						   haddr, pmd,
						   zero_page);
				spin_unlock(ptl);
				set = true;
			}
		} else
			spin_unlock(ptl);
		if (!set) {
			pte_free(mm, pgtable);
			put_huge_zero_page();
		}
		return ret;
	}
	gfp = alloc_hugepage_direct_gfpmask(vma);
	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
	if (unlikely(!page)) {
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}
	prep_transhuge_page(page);
	return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
					    flags);
}

static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t entry;
	spinlock_t *ptl;

	ptl = pmd_lock(mm, pmd);
	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pmd_mkdevmap(entry);
	if (write) {
		entry = pmd_mkyoung(pmd_mkdirty(entry));
		entry = maybe_pmd_mkwrite(entry, vma);
	}
	set_pmd_at(mm, addr, pmd, entry);
	update_mmu_cache_pmd(vma, addr, pmd);
	spin_unlock(ptl);
}

int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
			pmd_t *pmd, pfn_t pfn, bool write)
{
	pgprot_t pgprot = vma->vm_page_prot;
	/*
	 * If we had pmd_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
	BUG_ON(!pfn_t_devmap(pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;
	if (track_pfn_insert(vma, &pgprot, pfn))
		return VM_FAULT_SIGBUS;
	insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);

static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd)
{
	pmd_t _pmd;

	/*
	 * We should set the dirty bit only for FOLL_WRITE but for now
	 * the dirty bit in the pmd is meaningless. And if the dirty
	 * bit will become meaningful and we'll only set it with
	 * FOLL_WRITE, an atomic set_bit will be required on the pmd to
	 * set the young bit, instead of the current set_pmd_at.
	 */
	_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
				pmd, _pmd, 1))
		update_mmu_cache_pmd(vma, addr, pmd);
}

struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, int flags)
{
	unsigned long pfn = pmd_pfn(*pmd);
	struct mm_struct *mm = vma->vm_mm;
	struct dev_pagemap *pgmap;
	struct page *page;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	if (flags & FOLL_WRITE && !pmd_write(*pmd))
		return NULL;

	if (pmd_present(*pmd) && pmd_devmap(*pmd))
		/* pass */;
	else
		return NULL;

	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd);

	/*
	 * device mapped pages can only be returned if the
	 * caller will manage the page reference count.
	 */
	if (!(flags & FOLL_GET))
		return ERR_PTR(-EEXIST);

	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
	pgmap = get_dev_pagemap(pfn, NULL);
	if (!pgmap)
		return ERR_PTR(-EFAULT);
	page = pfn_to_page(pfn);
	get_page(page);
	put_dev_pagemap(pgmap);

	return page;
}

int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	struct page *src_page;
	pmd_t pmd;
	pgtable_t pgtable = NULL;
	int ret;

	if (!vma_is_dax(vma)) {
		ret = -ENOMEM;
		pgtable = pte_alloc_one(dst_mm, addr);
		if (unlikely(!pgtable))
			goto out;
	}

	dst_ptl = pmd_lock(dst_mm, dst_pmd);
	src_ptl = pmd_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pmd = *src_pmd;
	if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) {
		pte_free(dst_mm, pgtable);
		goto out_unlock;
	}
	/*
	 * When page table lock is held, the huge zero pmd should not be
	 * under splitting since we don't split the page itself, only pmd to
	 * a page table.
	 */
	if (is_huge_zero_pmd(pmd)) {
		struct page *zero_page;
		/*
		 * get_huge_zero_page() will never allocate a new page here,
		 * since we already have a zero page to copy. It just takes a
		 * reference.
		 */
		zero_page = get_huge_zero_page();
		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
				zero_page);
		ret = 0;
		goto out_unlock;
	}

	if (!vma_is_dax(vma)) {
		/* thp accounting separate from pmd_devmap accounting */
		src_page = pmd_page(pmd);
		VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
		get_page(src_page);
		page_dup_rmap(src_page, true);
		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		atomic_long_inc(&dst_mm->nr_ptes);
		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
	}

	pmdp_set_wrprotect(src_mm, addr, src_pmd);
	pmd = pmd_mkold(pmd_wrprotect(pmd));
	set_pmd_at(dst_mm, addr, dst_pmd, pmd);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
out:
	return ret;
}

void huge_pmd_set_accessed(struct mm_struct *mm,
			   struct vm_area_struct *vma,
			   unsigned long address,
			   pmd_t *pmd, pmd_t orig_pmd,
			   int dirty)
{
	spinlock_t *ptl;
	pmd_t entry;
	unsigned long haddr;

	ptl = pmd_lock(mm, pmd);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto unlock;

	entry = pmd_mkyoung(orig_pmd);
	haddr = address & HPAGE_PMD_MASK;
	if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
		update_mmu_cache_pmd(vma, address, pmd);

unlock:
	spin_unlock(ptl);
}

static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
					struct vm_area_struct *vma,
					unsigned long address,
					pmd_t *pmd, pmd_t orig_pmd,
					struct page *page,
					unsigned long haddr)
{
	struct mem_cgroup *memcg;
	spinlock_t *ptl;
	pgtable_t pgtable;
	pmd_t _pmd;
	int ret = 0, i;
	struct page **pages;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
			GFP_KERNEL);
	if (unlikely(!pages)) {
		ret |= VM_FAULT_OOM;
		goto out;
	}

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
					       __GFP_OTHER_NODE,
					       vma, address, page_to_nid(page));
		if (unlikely(!pages[i] ||
			     mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
						   &memcg, false))) {
			if (pages[i])
				put_page(pages[i]);
			while (--i >= 0) {
				memcg = (void *)page_private(pages[i]);
				set_page_private(pages[i], 0);
				mem_cgroup_cancel_charge(pages[i], memcg,
						false);
				put_page(pages[i]);
			}
			kfree(pages);
			ret |= VM_FAULT_OOM;
			goto out;
		}
		set_page_private(pages[i], (unsigned long)memcg);
	}

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		copy_user_highpage(pages[i], page + i,
				   haddr + PAGE_SIZE * i, vma);
		__SetPageUptodate(pages[i]);
		cond_resched();
	}

	mmun_start = haddr;
	mmun_end   = haddr + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	ptl = pmd_lock(mm, pmd);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto out_free_pages;
	VM_BUG_ON_PAGE(!PageHead(page), page);

	pmdp_huge_clear_flush_notify(vma, haddr, pmd);
	/* leave pmd empty until pte is filled */

	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	pmd_populate(mm, &_pmd, pgtable);

	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
		pte_t *pte, entry;
		entry = mk_pte(pages[i], vma->vm_page_prot);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		memcg = (void *)page_private(pages[i]);
		set_page_private(pages[i], 0);
		page_add_new_anon_rmap(pages[i], vma, haddr, false);
		mem_cgroup_commit_charge(pages[i], memcg, false, false);
		lru_cache_add_active_or_unevictable(pages[i], vma);
		pte = pte_offset_map(&_pmd, haddr);
		VM_BUG_ON(!pte_none(*pte));
		set_pte_at(mm, haddr, pte, entry);
		pte_unmap(pte);
	}
	kfree(pages);

	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(mm, pmd, pgtable);
	page_remove_rmap(page, true);
	spin_unlock(ptl);

	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

	ret |= VM_FAULT_WRITE;
	put_page(page);

out:
	return ret;

out_free_pages:
	spin_unlock(ptl);
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		memcg = (void *)page_private(pages[i]);
		set_page_private(pages[i], 0);
		mem_cgroup_cancel_charge(pages[i], memcg, false);
		put_page(pages[i]);
	}
	kfree(pages);
	goto out;
}

int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{
	spinlock_t *ptl;
	int ret = 0;
	struct page *page = NULL, *new_page;
	struct mem_cgroup *memcg;
	unsigned long haddr;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */
	gfp_t huge_gfp;			/* for allocation and charge */

	ptl = pmd_lockptr(mm, pmd);
	VM_BUG_ON_VMA(!vma->anon_vma, vma);
	haddr = address & HPAGE_PMD_MASK;
	if (is_huge_zero_pmd(orig_pmd))
		goto alloc;
	spin_lock(ptl);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto out_unlock;

	page = pmd_page(orig_pmd);
	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
	/*
	 * We can only reuse the page if nobody else maps the huge page or
	 * its part.
	 */
	if (page_trans_huge_mapcount(page, NULL) == 1) {
		pmd_t entry;
		entry = pmd_mkyoung(orig_pmd);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
			update_mmu_cache_pmd(vma, address, pmd);
		ret |= VM_FAULT_WRITE;
		goto out_unlock;
	}
	get_page(page);
	spin_unlock(ptl);
alloc:
	if (transparent_hugepage_enabled(vma) &&
	    !transparent_hugepage_debug_cow()) {
		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
	} else
		new_page = NULL;

	if (likely(new_page)) {
		prep_transhuge_page(new_page);
	} else {
		if (!page) {
			split_huge_pmd(vma, pmd, address);
			ret |= VM_FAULT_FALLBACK;
		} else {
			ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
					pmd, orig_pmd, page, haddr);
			if (ret & VM_FAULT_OOM) {
				split_huge_pmd(vma, pmd, address);
				ret |= VM_FAULT_FALLBACK;
			}
			put_page(page);
		}
		count_vm_event(THP_FAULT_FALLBACK);
		goto out;
	}

	if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg,
					   true))) {
		put_page(new_page);
		if (page) {
			split_huge_pmd(vma, pmd, address);
			put_page(page);
		} else
			split_huge_pmd(vma, pmd, address);
		ret |= VM_FAULT_FALLBACK;
		count_vm_event(THP_FAULT_FALLBACK);
		goto out;
	}

	count_vm_event(THP_FAULT_ALLOC);

	if (!page)
		clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
	else
		copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
	__SetPageUptodate(new_page);

	mmun_start = haddr;
	mmun_end   = haddr + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	spin_lock(ptl);
	if (page)
		put_page(page);
	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
		spin_unlock(ptl);
		mem_cgroup_cancel_charge(new_page, memcg, true);
		put_page(new_page);
		goto out_mn;
	} else {
		pmd_t entry;
		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		pmdp_huge_clear_flush_notify(vma, haddr, pmd);
		page_add_new_anon_rmap(new_page, vma, haddr, true);
		mem_cgroup_commit_charge(new_page, memcg, false, true);
		lru_cache_add_active_or_unevictable(new_page, vma);
		set_pmd_at(mm, haddr, pmd, entry);
		update_mmu_cache_pmd(vma, address, pmd);
		if (!page) {
			add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
			put_huge_zero_page();
		} else {
			VM_BUG_ON_PAGE(!PageHead(page), page);
			page_remove_rmap(page, true);
			put_page(page);
		}
		ret |= VM_FAULT_WRITE;
	}
	spin_unlock(ptl);
out_mn:
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
	return ret;
out_unlock:
	spin_unlock(ptl);
	return ret;
}

struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
				   unsigned long addr,
				   pmd_t *pmd,
				   unsigned int flags)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page = NULL;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	if (flags & FOLL_WRITE && !pmd_write(*pmd))
		goto out;

	/* Avoid dumping huge zero page */
	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
		return ERR_PTR(-EFAULT);

	/* Full NUMA hinting faults to serialise migration in fault paths */
	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
		goto out;
	page = pmd_page(*pmd);
	VM_BUG_ON_PAGE(!PageHead(page), page);
	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd);
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/*
		 * We don't mlock() pte-mapped THPs. This way we can avoid
		 * leaking mlocked pages into non-VM_LOCKED VMAs.
		 *
		 * In most cases the pmd is the only mapping of the page as we
		 * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
		 * writable private mappings in populate_vma_page_range().
		 *
		 * The only scenario when we have the page shared here is if we
		 * are mlocking a read-only mapping shared over fork(). We skip
		 * mlocking such pages.
		 */
		if (compound_mapcount(page) == 1 && !PageDoubleMap(page) &&
				page->mapping && trylock_page(page)) {
			lru_add_drain();
			if (page->mapping)
				mlock_vma_page(page);
			unlock_page(page);
		}
	}
	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
	VM_BUG_ON_PAGE(!PageCompound(page), page);
	if (flags & FOLL_GET)
		get_page(page);

out:
	return page;
}

/* NUMA hinting page fault entry point for trans huge pmds */
int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
				unsigned long addr, pmd_t pmd, pmd_t *pmdp)
{
	spinlock_t *ptl;
	struct anon_vma *anon_vma = NULL;
	struct page *page;
	unsigned long haddr = addr & HPAGE_PMD_MASK;
	int page_nid = -1, this_nid = numa_node_id();
	int target_nid, last_cpupid = -1;
	bool page_locked;
	bool migrated = false;
	bool was_writable;
	int flags = 0;

	/* A PROT_NONE fault should not end up here */
	BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));

	ptl = pmd_lock(mm, pmdp);
	if (unlikely(!pmd_same(pmd, *pmdp)))
		goto out_unlock;

	/*
	 * If there are potential migrations, wait for completion and retry
	 * without disrupting NUMA hinting information. Do not relock and
	 * check_same as the page may no longer be mapped.
	 */
	if (unlikely(pmd_trans_migrating(*pmdp))) {
		page = pmd_page(*pmdp);
		spin_unlock(ptl);
		wait_on_page_locked(page);
		goto out;
	}

	page = pmd_page(pmd);
	BUG_ON(is_huge_zero_page(page));
	page_nid = page_to_nid(page);
	last_cpupid = page_cpupid_last(page);
	count_vm_numa_event(NUMA_HINT_FAULTS);
	if (page_nid == this_nid) {
		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
		flags |= TNF_FAULT_LOCAL;
	}

	/* See similar comment in do_numa_page for explanation */
	if (!(vma->vm_flags & VM_WRITE))
		flags |= TNF_NO_GROUP;

	/*
	 * Acquire the page lock to serialise THP migrations but avoid dropping
	 * page_table_lock if at all possible
	 */
	page_locked = trylock_page(page);
	target_nid = mpol_misplaced(page, vma, haddr);
	if (target_nid == -1) {
		/* If the page was locked, there are no parallel migrations */
		if (page_locked)
			goto clear_pmdnuma;
	}

	/* Migration could have started since the pmd_trans_migrating check */
	if (!page_locked) {
		spin_unlock(ptl);
		wait_on_page_locked(page);
		page_nid = -1;
		goto out;
	}

	/*
	 * Page is misplaced. Page lock serialises migrations.
	 * Acquire anon_vma to serialise splits.
	 */
	get_page(page);
	spin_unlock(ptl);
	anon_vma = page_lock_anon_vma_read(page);

	/* Confirm the PMD did not change while page_table_lock was released */
	spin_lock(ptl);
	if (unlikely(!pmd_same(pmd, *pmdp))) {
		unlock_page(page);
		put_page(page);
		page_nid = -1;
		goto out_unlock;
	}

	/* Bail if we fail to protect against THP splits for any reason */
	if (unlikely(!anon_vma)) {
		put_page(page);
		page_nid = -1;
		goto clear_pmdnuma;
	}

	/*
	 * Migrate the THP to the requested node, returns with page unlocked
	 * and access rights restored.
	 */
	spin_unlock(ptl);
	migrated = migrate_misplaced_transhuge_page(mm, vma,
				pmdp, pmd, addr, page, target_nid);
	if (migrated) {
		flags |= TNF_MIGRATED;
		page_nid = target_nid;
	} else
		flags |= TNF_MIGRATE_FAIL;

	goto out;
clear_pmdnuma:
	BUG_ON(!PageLocked(page));
	was_writable = pmd_write(pmd);
	pmd = pmd_modify(pmd, vma->vm_page_prot);
	pmd = pmd_mkyoung(pmd);
	if (was_writable)
		pmd = pmd_mkwrite(pmd);
	set_pmd_at(mm, haddr, pmdp, pmd);
	update_mmu_cache_pmd(vma, addr, pmdp);
	unlock_page(page);
out_unlock:
	spin_unlock(ptl);

out:
	if (anon_vma)
		page_unlock_anon_vma_read(anon_vma);

	if (page_nid != -1)
		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);

	return 0;
}

int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long addr, unsigned long next)
{
	spinlock_t *ptl;
	pmd_t orig_pmd;
	struct page *page;
	struct mm_struct *mm = tlb->mm;
	int ret = 0;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		goto out_unlocked;

	orig_pmd = *pmd;
	if (is_huge_zero_pmd(orig_pmd)) {
		ret = 1;
		goto out;
	}

	page = pmd_page(orig_pmd);
	/*
	 * If other processes are mapping this page, we can't discard
	 * the page unless they all do MADV_FREE, so let's skip the page.
	 */
	if (page_mapcount(page) != 1)
		goto out;

	if (!trylock_page(page))
		goto out;

	/*
	 * If the user wants to discard only part of the THP's pages, split
	 * it so MADV_FREE will deactivate just those pages.
	 */
	if (next - addr != HPAGE_PMD_SIZE) {
		get_page(page);
		spin_unlock(ptl);
		split_huge_page(page);
		put_page(page);
		unlock_page(page);
		goto out_unlocked;
	}

	if (PageDirty(page))
		ClearPageDirty(page);
	unlock_page(page);

	if (PageActive(page))
		deactivate_page(page);

	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
		orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
			tlb->fullmm);
		orig_pmd = pmd_mkold(orig_pmd);
		orig_pmd = pmd_mkclean(orig_pmd);

		set_pmd_at(mm, addr, pmd, orig_pmd);
		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	}
	ret = 1;
out:
	spin_unlock(ptl);
out_unlocked:
	return ret;
}

int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pmd_t *pmd, unsigned long addr)
{
	pmd_t orig_pmd;
	spinlock_t *ptl;

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return 0;
	/*
	 * For architectures like ppc64 we look at deposited pgtable
	 * when calling pmdp_huge_get_and_clear.
	 * So do the pgtable_trans_huge_withdraw after finishing pmdp related
	 * operations.
	 */
	orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
			tlb->fullmm);
	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	if (vma_is_dax(vma)) {
		spin_unlock(ptl);
		if (is_huge_zero_pmd(orig_pmd))
			tlb_remove_page(tlb, pmd_page(orig_pmd));
	} else if (is_huge_zero_pmd(orig_pmd)) {
		pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
		atomic_long_dec(&tlb->mm->nr_ptes);
		spin_unlock(ptl);
		tlb_remove_page(tlb, pmd_page(orig_pmd));
	} else {
		struct page *page = pmd_page(orig_pmd);
		page_remove_rmap(page, true);
		VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
		add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
		VM_BUG_ON_PAGE(!PageHead(page), page);
		pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
		atomic_long_dec(&tlb->mm->nr_ptes);
		spin_unlock(ptl);
		tlb_remove_page(tlb, page);
	}
	return 1;
}

bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
		  unsigned long new_addr, unsigned long old_end,
		  pmd_t *old_pmd, pmd_t *new_pmd)
{
	spinlock_t *old_ptl, *new_ptl;
	pmd_t pmd;
	struct mm_struct *mm = vma->vm_mm;

	if ((old_addr & ~HPAGE_PMD_MASK) ||
	    (new_addr & ~HPAGE_PMD_MASK) ||
	    old_end - old_addr < HPAGE_PMD_SIZE)
		return false;

	/*
	 * The destination pmd shouldn't be established, free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON(!pmd_none(*new_pmd))) {
		VM_BUG_ON(pmd_trans_huge(*new_pmd));
		return false;
	}

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_sem prevents deadlock.
	 */
	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
	if (old_ptl) {
		new_ptl = pmd_lockptr(mm, new_pmd);
		if (new_ptl != old_ptl)
			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
		VM_BUG_ON(!pmd_none(*new_pmd));

		if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
				vma_is_anonymous(vma)) {
			pgtable_t pgtable;
			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
		}
		set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
		if (new_ptl != old_ptl)
			spin_unlock(new_ptl);
		spin_unlock(old_ptl);
		return true;
	}
	return false;
}

/*
 * Returns
 *  - 0 if the PMD could not be locked
 *  - 1 if the PMD was locked but protections are unchanged and no TLB flush
 *    is necessary
 *  - HPAGE_PMD_NR if protections changed and a TLB flush is necessary
 */
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, pgprot_t newprot, int prot_numa)
{
	struct mm_struct *mm = vma->vm_mm;
	spinlock_t *ptl;
	int ret = 0;

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		pmd_t entry;
		bool preserve_write = prot_numa && pmd_write(*pmd);
		ret = 1;

		/*
		 * Avoid trapping faults against the zero page. The read-only
		 * data is likely to be read-cached on the local CPU and
		 * local/remote hits to the zero page are not interesting.
		 */
		if (prot_numa && is_huge_zero_pmd(*pmd)) {
			spin_unlock(ptl);
			return ret;
		}

		if (!prot_numa || !pmd_protnone(*pmd)) {
			entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
			entry = pmd_modify(entry, newprot);
			if (preserve_write)
				entry = pmd_mkwrite(entry);
			ret = HPAGE_PMD_NR;
			set_pmd_at(mm, addr, pmd, entry);
			BUG_ON(!preserve_write && pmd_write(entry));
		}
		spin_unlock(ptl);
	}

	return ret;
}

/*
 * Returns true if a given pmd maps a thp, false otherwise.
 *
 * Note that if it returns true, this routine returns without unlocking page
 * table lock. So callers must unlock it.
 */
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
	spinlock_t *ptl;
	ptl = pmd_lock(vma->vm_mm, pmd);
	if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
		return ptl;
	spin_unlock(ptl);
	return NULL;
}

#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)

int hugepage_madvise(struct vm_area_struct *vma,
		     unsigned long *vm_flags, int advice)
{
	switch (advice) {
	case MADV_HUGEPAGE:
#ifdef CONFIG_S390
		/*
		 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
		 * can't handle this properly after s390_enable_sie, so we simply
		 * ignore the madvise to prevent qemu from causing a SIGSEGV.
		 */
		if (mm_has_pgste(vma->vm_mm))
			return 0;
#endif
		/*
		 * Be somewhat over-protective like KSM for now!
		 */
		if (*vm_flags & VM_NO_THP)
			return -EINVAL;
		*vm_flags &= ~VM_NOHUGEPAGE;
		*vm_flags |= VM_HUGEPAGE;
		/*
		 * If the vma becomes good for khugepaged to scan,
		 * register it here without waiting for a page fault that
		 * may not happen any time soon.
		 */
		if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags)))
			return -ENOMEM;
		break;
	case MADV_NOHUGEPAGE:
		/*
		 * Be somewhat over-protective like KSM for now!
		 */
		if (*vm_flags & VM_NO_THP)
			return -EINVAL;
		*vm_flags &= ~VM_HUGEPAGE;
		*vm_flags |= VM_NOHUGEPAGE;
		/*
		 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
		 * this vma even if we leave the mm registered in khugepaged if
		 * it got registered before VM_NOHUGEPAGE was set.
		 */
		break;
	}

	return 0;
}

static int __init khugepaged_slab_init(void)
{
	mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
					  sizeof(struct mm_slot),
					  __alignof__(struct mm_slot), 0, NULL);
	if (!mm_slot_cache)
		return -ENOMEM;

	return 0;
}

static void __init khugepaged_slab_exit(void)
{
	kmem_cache_destroy(mm_slot_cache);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)	/* initialization failed */
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;

	hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
		if (mm == mm_slot->mm)
			return mm_slot;

	return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	mm_slot->mm = mm;
	hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
}

static inline int khugepaged_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

int __khugepaged_enter(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int wakeup;

	mm_slot = alloc_mm_slot();
	if (!mm_slot)
		return -ENOMEM;

	/* __khugepaged_exit() must not run from under us */
	VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
		free_mm_slot(mm_slot);
		return 0;
	}

	spin_lock(&khugepaged_mm_lock);
	insert_to_mm_slots_hash(mm, mm_slot);
	/*
	 * Insert just behind the scanning cursor, to let the area settle
	 * down a little.
	 */
	wakeup = list_empty(&khugepaged_scan.mm_head);
	list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
	spin_unlock(&khugepaged_mm_lock);

	atomic_inc(&mm->mm_count);
	if (wakeup)
		wake_up_interruptible(&khugepaged_wait);

	return 0;
}

int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
			       unsigned long vm_flags)
{
	unsigned long hstart, hend;
	if (!vma->anon_vma)
		/*
		 * Not yet faulted in so we will register later in the
		 * page fault if needed.
		 */
		return 0;
	if (vma->vm_ops || (vm_flags & VM_NO_THP))
		/* khugepaged not yet working on file or special mappings */
		return 0;
	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	hend = vma->vm_end & HPAGE_PMD_MASK;
	if (hstart < hend)
		return khugepaged_enter(vma, vm_flags);
	return 0;
}

void __khugepaged_exit(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int free = 0;

	spin_lock(&khugepaged_mm_lock);
	mm_slot = get_mm_slot(mm);
	if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
		hash_del(&mm_slot->hash);
		list_del(&mm_slot->mm_node);
		free = 1;
	}
	spin_unlock(&khugepaged_mm_lock);

	if (free) {
		clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
		free_mm_slot(mm_slot);
		mmdrop(mm);
	} else if (mm_slot) {
		/*
		 * This is required to serialize against
		 * khugepaged_test_exit() (which is guaranteed to run
		 * under mmap sem read mode).
Stop here (after we 1983 * return all pagetables will be destroyed) until 1984 * khugepaged has finished working on the pagetables 1985 * under the mmap_sem. 1986 */ 1987 down_write(&mm->mmap_sem); 1988 up_write(&mm->mmap_sem); 1989 } 1990 } 1991 1992 static void release_pte_page(struct page *page) 1993 { 1994 /* 0 stands for page_is_file_cache(page) == false */ 1995 dec_zone_page_state(page, NR_ISOLATED_ANON + 0); 1996 unlock_page(page); 1997 putback_lru_page(page); 1998 } 1999 2000 static void release_pte_pages(pte_t *pte, pte_t *_pte) 2001 { 2002 while (--_pte >= pte) { 2003 pte_t pteval = *_pte; 2004 if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) 2005 release_pte_page(pte_page(pteval)); 2006 } 2007 } 2008 2009 static int __collapse_huge_page_isolate(struct vm_area_struct *vma, 2010 unsigned long address, 2011 pte_t *pte) 2012 { 2013 struct page *page = NULL; 2014 pte_t *_pte; 2015 int none_or_zero = 0, result = 0; 2016 bool referenced = false, writable = false; 2017 2018 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; 2019 _pte++, address += PAGE_SIZE) { 2020 pte_t pteval = *_pte; 2021 if (pte_none(pteval) || (pte_present(pteval) && 2022 is_zero_pfn(pte_pfn(pteval)))) { 2023 if (!userfaultfd_armed(vma) && 2024 ++none_or_zero <= khugepaged_max_ptes_none) { 2025 continue; 2026 } else { 2027 result = SCAN_EXCEED_NONE_PTE; 2028 goto out; 2029 } 2030 } 2031 if (!pte_present(pteval)) { 2032 result = SCAN_PTE_NON_PRESENT; 2033 goto out; 2034 } 2035 page = vm_normal_page(vma, address, pteval); 2036 if (unlikely(!page)) { 2037 result = SCAN_PAGE_NULL; 2038 goto out; 2039 } 2040 2041 VM_BUG_ON_PAGE(PageCompound(page), page); 2042 VM_BUG_ON_PAGE(!PageAnon(page), page); 2043 VM_BUG_ON_PAGE(!PageSwapBacked(page), page); 2044 2045 /* 2046 * We can do it before isolate_lru_page because the 2047 * page can't be freed from under us. NOTE: PG_lock 2048 * is needed to serialize against split_huge_page 2049 * when invoked from the VM. 2050 */ 2051 if (!trylock_page(page)) { 2052 result = SCAN_PAGE_LOCK; 2053 goto out; 2054 } 2055 2056 /* 2057 * cannot use mapcount: can't collapse if there's a gup pin. 2058 * The page must only be referenced by the scanned process 2059 * and page swap cache. 2060 */ 2061 if (page_count(page) != 1 + !!PageSwapCache(page)) { 2062 unlock_page(page); 2063 result = SCAN_PAGE_COUNT; 2064 goto out; 2065 } 2066 if (pte_write(pteval)) { 2067 writable = true; 2068 } else { 2069 if (PageSwapCache(page) && 2070 !reuse_swap_page(page, NULL)) { 2071 unlock_page(page); 2072 result = SCAN_SWAP_CACHE_PAGE; 2073 goto out; 2074 } 2075 /* 2076 * Page is not in the swap cache. It can be collapsed 2077 * into a THP. 2078 */ 2079 } 2080 2081 /* 2082 * Isolate the page to avoid collapsing an hugepage 2083 * currently in use by the VM. 
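 * isolate_lru_page() also grabs a reference and takes the page off the
 * LRU, so reclaim or migration cannot free the page from under us.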
2084 */ 2085 if (isolate_lru_page(page)) { 2086 unlock_page(page); 2087 result = SCAN_DEL_PAGE_LRU; 2088 goto out; 2089 } 2090 /* 0 stands for page_is_file_cache(page) == false */ 2091 inc_zone_page_state(page, NR_ISOLATED_ANON + 0); 2092 VM_BUG_ON_PAGE(!PageLocked(page), page); 2093 VM_BUG_ON_PAGE(PageLRU(page), page); 2094 2095 /* If there is no mapped pte young don't collapse the page */ 2096 if (pte_young(pteval) || 2097 page_is_young(page) || PageReferenced(page) || 2098 mmu_notifier_test_young(vma->vm_mm, address)) 2099 referenced = true; 2100 } 2101 if (likely(writable)) { 2102 if (likely(referenced)) { 2103 result = SCAN_SUCCEED; 2104 trace_mm_collapse_huge_page_isolate(page, none_or_zero, 2105 referenced, writable, result); 2106 return 1; 2107 } 2108 } else { 2109 result = SCAN_PAGE_RO; 2110 } 2111 2112 out: 2113 release_pte_pages(pte, _pte); 2114 trace_mm_collapse_huge_page_isolate(page, none_or_zero, 2115 referenced, writable, result); 2116 return 0; 2117 } 2118 2119 static void __collapse_huge_page_copy(pte_t *pte, struct page *page, 2120 struct vm_area_struct *vma, 2121 unsigned long address, 2122 spinlock_t *ptl) 2123 { 2124 pte_t *_pte; 2125 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { 2126 pte_t pteval = *_pte; 2127 struct page *src_page; 2128 2129 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { 2130 clear_user_highpage(page, address); 2131 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); 2132 if (is_zero_pfn(pte_pfn(pteval))) { 2133 /* 2134 * ptl mostly unnecessary. 2135 */ 2136 spin_lock(ptl); 2137 /* 2138 * paravirt calls inside pte_clear here are 2139 * superfluous. 2140 */ 2141 pte_clear(vma->vm_mm, address, _pte); 2142 spin_unlock(ptl); 2143 } 2144 } else { 2145 src_page = pte_page(pteval); 2146 copy_user_highpage(page, src_page, address, vma); 2147 VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page); 2148 release_pte_page(src_page); 2149 /* 2150 * ptl mostly unnecessary, but preempt has to 2151 * be disabled to update the per-cpu stats 2152 * inside page_remove_rmap(). 2153 */ 2154 spin_lock(ptl); 2155 /* 2156 * paravirt calls inside pte_clear here are 2157 * superfluous. 2158 */ 2159 pte_clear(vma->vm_mm, address, _pte); 2160 page_remove_rmap(src_page, false); 2161 spin_unlock(ptl); 2162 free_page_and_swap_cache(src_page); 2163 } 2164 2165 address += PAGE_SIZE; 2166 page++; 2167 } 2168 } 2169 2170 static void khugepaged_alloc_sleep(void) 2171 { 2172 DEFINE_WAIT(wait); 2173 2174 add_wait_queue(&khugepaged_wait, &wait); 2175 freezable_schedule_timeout_interruptible( 2176 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); 2177 remove_wait_queue(&khugepaged_wait, &wait); 2178 } 2179 2180 static int khugepaged_node_load[MAX_NUMNODES]; 2181 2182 static bool khugepaged_scan_abort(int nid) 2183 { 2184 int i; 2185 2186 /* 2187 * If zone_reclaim_mode is disabled, then no extra effort is made to 2188 * allocate memory locally. 
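 * Otherwise, abort the scan of this pmd if the candidate page's node is
 * further than RECLAIM_DISTANCE from any node already recorded in
 * khugepaged_node_load[], so we do not collapse ranges whose pages are
 * spread across distant nodes.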
2189 */ 2190 if (!zone_reclaim_mode) 2191 return false; 2192 2193 /* If there is a count for this node already, it must be acceptable */ 2194 if (khugepaged_node_load[nid]) 2195 return false; 2196 2197 for (i = 0; i < MAX_NUMNODES; i++) { 2198 if (!khugepaged_node_load[i]) 2199 continue; 2200 if (node_distance(nid, i) > RECLAIM_DISTANCE) 2201 return true; 2202 } 2203 return false; 2204 } 2205 2206 #ifdef CONFIG_NUMA 2207 static int khugepaged_find_target_node(void) 2208 { 2209 static int last_khugepaged_target_node = NUMA_NO_NODE; 2210 int nid, target_node = 0, max_value = 0; 2211 2212 /* find first node with max normal pages hit */ 2213 for (nid = 0; nid < MAX_NUMNODES; nid++) 2214 if (khugepaged_node_load[nid] > max_value) { 2215 max_value = khugepaged_node_load[nid]; 2216 target_node = nid; 2217 } 2218 2219 /* do some balance if several nodes have the same hit record */ 2220 if (target_node <= last_khugepaged_target_node) 2221 for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; 2222 nid++) 2223 if (max_value == khugepaged_node_load[nid]) { 2224 target_node = nid; 2225 break; 2226 } 2227 2228 last_khugepaged_target_node = target_node; 2229 return target_node; 2230 } 2231 2232 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) 2233 { 2234 if (IS_ERR(*hpage)) { 2235 if (!*wait) 2236 return false; 2237 2238 *wait = false; 2239 *hpage = NULL; 2240 khugepaged_alloc_sleep(); 2241 } else if (*hpage) { 2242 put_page(*hpage); 2243 *hpage = NULL; 2244 } 2245 2246 return true; 2247 } 2248 2249 static struct page * 2250 khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, 2251 unsigned long address, int node) 2252 { 2253 VM_BUG_ON_PAGE(*hpage, *hpage); 2254 2255 /* 2256 * Before allocating the hugepage, release the mmap_sem read lock. 2257 * The allocation can take potentially a long time if it involves 2258 * sync compaction, and we do not need to hold the mmap_sem during 2259 * that. We will recheck the vma after taking it again in write mode. 
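 * On failure *hpage is set to ERR_PTR(-ENOMEM) so that
 * khugepaged_prealloc_page() can notice the failure and back off before
 * the next attempt.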
2260 */ 2261 up_read(&mm->mmap_sem); 2262 2263 *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); 2264 if (unlikely(!*hpage)) { 2265 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2266 *hpage = ERR_PTR(-ENOMEM); 2267 return NULL; 2268 } 2269 2270 prep_transhuge_page(*hpage); 2271 count_vm_event(THP_COLLAPSE_ALLOC); 2272 return *hpage; 2273 } 2274 #else 2275 static int khugepaged_find_target_node(void) 2276 { 2277 return 0; 2278 } 2279 2280 static inline struct page *alloc_khugepaged_hugepage(void) 2281 { 2282 struct page *page; 2283 2284 page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), 2285 HPAGE_PMD_ORDER); 2286 if (page) 2287 prep_transhuge_page(page); 2288 return page; 2289 } 2290 2291 static struct page *khugepaged_alloc_hugepage(bool *wait) 2292 { 2293 struct page *hpage; 2294 2295 do { 2296 hpage = alloc_khugepaged_hugepage(); 2297 if (!hpage) { 2298 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2299 if (!*wait) 2300 return NULL; 2301 2302 *wait = false; 2303 khugepaged_alloc_sleep(); 2304 } else 2305 count_vm_event(THP_COLLAPSE_ALLOC); 2306 } while (unlikely(!hpage) && likely(khugepaged_enabled())); 2307 2308 return hpage; 2309 } 2310 2311 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) 2312 { 2313 if (!*hpage) 2314 *hpage = khugepaged_alloc_hugepage(wait); 2315 2316 if (unlikely(!*hpage)) 2317 return false; 2318 2319 return true; 2320 } 2321 2322 static struct page * 2323 khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, 2324 unsigned long address, int node) 2325 { 2326 up_read(&mm->mmap_sem); 2327 VM_BUG_ON(!*hpage); 2328 2329 return *hpage; 2330 } 2331 #endif 2332 2333 static bool hugepage_vma_check(struct vm_area_struct *vma) 2334 { 2335 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || 2336 (vma->vm_flags & VM_NOHUGEPAGE)) 2337 return false; 2338 if (!vma->anon_vma || vma->vm_ops) 2339 return false; 2340 if (is_vma_temporary_stack(vma)) 2341 return false; 2342 return !(vma->vm_flags & VM_NO_THP); 2343 } 2344 2345 static void collapse_huge_page(struct mm_struct *mm, 2346 unsigned long address, 2347 struct page **hpage, 2348 struct vm_area_struct *vma, 2349 int node) 2350 { 2351 pmd_t *pmd, _pmd; 2352 pte_t *pte; 2353 pgtable_t pgtable; 2354 struct page *new_page; 2355 spinlock_t *pmd_ptl, *pte_ptl; 2356 int isolated = 0, result = 0; 2357 unsigned long hstart, hend; 2358 struct mem_cgroup *memcg; 2359 unsigned long mmun_start; /* For mmu_notifiers */ 2360 unsigned long mmun_end; /* For mmu_notifiers */ 2361 gfp_t gfp; 2362 2363 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2364 2365 /* Only allocate from the target node */ 2366 gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE; 2367 2368 /* release the mmap_sem read lock. */ 2369 new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node); 2370 if (!new_page) { 2371 result = SCAN_ALLOC_HUGE_PAGE_FAIL; 2372 goto out_nolock; 2373 } 2374 2375 if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { 2376 result = SCAN_CGROUP_CHARGE_FAIL; 2377 goto out_nolock; 2378 } 2379 2380 /* 2381 * Prevent all access to pagetables with the exception of 2382 * gup_fast later handled by the ptep_clear_flush and the VM 2383 * handled by the anon_vma lock + PG_lock.
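 * Taking mmap_sem in write mode below also keeps page faults and
 * mprotect/munmap away from this mm for the duration of the collapse.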
2384 */ 2385 down_write(&mm->mmap_sem); 2386 if (unlikely(khugepaged_test_exit(mm))) { 2387 result = SCAN_ANY_PROCESS; 2388 goto out; 2389 } 2390 2391 vma = find_vma(mm, address); 2392 if (!vma) { 2393 result = SCAN_VMA_NULL; 2394 goto out; 2395 } 2396 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2397 hend = vma->vm_end & HPAGE_PMD_MASK; 2398 if (address < hstart || address + HPAGE_PMD_SIZE > hend) { 2399 result = SCAN_ADDRESS_RANGE; 2400 goto out; 2401 } 2402 if (!hugepage_vma_check(vma)) { 2403 result = SCAN_VMA_CHECK; 2404 goto out; 2405 } 2406 pmd = mm_find_pmd(mm, address); 2407 if (!pmd) { 2408 result = SCAN_PMD_NULL; 2409 goto out; 2410 } 2411 2412 anon_vma_lock_write(vma->anon_vma); 2413 2414 pte = pte_offset_map(pmd, address); 2415 pte_ptl = pte_lockptr(mm, pmd); 2416 2417 mmun_start = address; 2418 mmun_end = address + HPAGE_PMD_SIZE; 2419 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2420 pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ 2421 /* 2422 * After this gup_fast can't run anymore. This also removes 2423 * any huge TLB entry from the CPU so we won't allow 2424 * huge and small TLB entries for the same virtual address 2425 * to avoid the risk of CPU bugs in that area. 2426 */ 2427 _pmd = pmdp_collapse_flush(vma, address, pmd); 2428 spin_unlock(pmd_ptl); 2429 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2430 2431 spin_lock(pte_ptl); 2432 isolated = __collapse_huge_page_isolate(vma, address, pte); 2433 spin_unlock(pte_ptl); 2434 2435 if (unlikely(!isolated)) { 2436 pte_unmap(pte); 2437 spin_lock(pmd_ptl); 2438 BUG_ON(!pmd_none(*pmd)); 2439 /* 2440 * We can only use set_pmd_at when establishing 2441 * hugepmds and never for establishing regular pmds that 2442 * points to regular pagetables. Use pmd_populate for that 2443 */ 2444 pmd_populate(mm, pmd, pmd_pgtable(_pmd)); 2445 spin_unlock(pmd_ptl); 2446 anon_vma_unlock_write(vma->anon_vma); 2447 result = SCAN_FAIL; 2448 goto out; 2449 } 2450 2451 /* 2452 * All pages are isolated and locked so anon_vma rmap 2453 * can't run anymore. 2454 */ 2455 anon_vma_unlock_write(vma->anon_vma); 2456 2457 __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); 2458 pte_unmap(pte); 2459 __SetPageUptodate(new_page); 2460 pgtable = pmd_pgtable(_pmd); 2461 2462 _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); 2463 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); 2464 2465 /* 2466 * spin_lock() below is not the equivalent of smp_wmb(), so 2467 * this is needed to avoid the copy_huge_page writes to become 2468 * visible after the set_pmd_at() write. 
2469 */ 2470 smp_wmb(); 2471 2472 spin_lock(pmd_ptl); 2473 BUG_ON(!pmd_none(*pmd)); 2474 page_add_new_anon_rmap(new_page, vma, address, true); 2475 mem_cgroup_commit_charge(new_page, memcg, false, true); 2476 lru_cache_add_active_or_unevictable(new_page, vma); 2477 pgtable_trans_huge_deposit(mm, pmd, pgtable); 2478 set_pmd_at(mm, address, pmd, _pmd); 2479 update_mmu_cache_pmd(vma, address, pmd); 2480 spin_unlock(pmd_ptl); 2481 2482 *hpage = NULL; 2483 2484 khugepaged_pages_collapsed++; 2485 result = SCAN_SUCCEED; 2486 out_up_write: 2487 up_write(&mm->mmap_sem); 2488 trace_mm_collapse_huge_page(mm, isolated, result); 2489 return; 2490 2491 out_nolock: 2492 trace_mm_collapse_huge_page(mm, isolated, result); 2493 return; 2494 out: 2495 mem_cgroup_cancel_charge(new_page, memcg, true); 2496 goto out_up_write; 2497 } 2498 2499 static int khugepaged_scan_pmd(struct mm_struct *mm, 2500 struct vm_area_struct *vma, 2501 unsigned long address, 2502 struct page **hpage) 2503 { 2504 pmd_t *pmd; 2505 pte_t *pte, *_pte; 2506 int ret = 0, none_or_zero = 0, result = 0; 2507 struct page *page = NULL; 2508 unsigned long _address; 2509 spinlock_t *ptl; 2510 int node = NUMA_NO_NODE; 2511 bool writable = false, referenced = false; 2512 2513 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2514 2515 pmd = mm_find_pmd(mm, address); 2516 if (!pmd) { 2517 result = SCAN_PMD_NULL; 2518 goto out; 2519 } 2520 2521 memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); 2522 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2523 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; 2524 _pte++, _address += PAGE_SIZE) { 2525 pte_t pteval = *_pte; 2526 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { 2527 if (!userfaultfd_armed(vma) && 2528 ++none_or_zero <= khugepaged_max_ptes_none) { 2529 continue; 2530 } else { 2531 result = SCAN_EXCEED_NONE_PTE; 2532 goto out_unmap; 2533 } 2534 } 2535 if (!pte_present(pteval)) { 2536 result = SCAN_PTE_NON_PRESENT; 2537 goto out_unmap; 2538 } 2539 if (pte_write(pteval)) 2540 writable = true; 2541 2542 page = vm_normal_page(vma, _address, pteval); 2543 if (unlikely(!page)) { 2544 result = SCAN_PAGE_NULL; 2545 goto out_unmap; 2546 } 2547 2548 /* TODO: teach khugepaged to collapse THP mapped with pte */ 2549 if (PageCompound(page)) { 2550 result = SCAN_PAGE_COMPOUND; 2551 goto out_unmap; 2552 } 2553 2554 /* 2555 * Record which node the original page is from and save this 2556 * information to khugepaged_node_load[]. 2557 * Khugepaged will allocate the hugepage from the node that 2558 * has the max hit record. 2559 */ 2560 node = page_to_nid(page); 2561 if (khugepaged_scan_abort(node)) { 2562 result = SCAN_SCAN_ABORT; 2563 goto out_unmap; 2564 } 2565 khugepaged_node_load[node]++; 2566 if (!PageLRU(page)) { 2567 result = SCAN_PAGE_LRU; 2568 goto out_unmap; 2569 } 2570 if (PageLocked(page)) { 2571 result = SCAN_PAGE_LOCK; 2572 goto out_unmap; 2573 } 2574 if (!PageAnon(page)) { 2575 result = SCAN_PAGE_ANON; 2576 goto out_unmap; 2577 } 2578 2579 /* 2580 * cannot use mapcount: can't collapse if there's a gup pin. 2581 * The page must only be referenced by the scanned process 2582 * and page swap cache.
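 * That is, page_count() must be exactly one, plus one more if the page
 * sits in the swap cache; any extra reference (e.g. a gup pin) makes us
 * skip this pmd.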
2583 */ 2584 if (page_count(page) != 1 + !!PageSwapCache(page)) { 2585 result = SCAN_PAGE_COUNT; 2586 goto out_unmap; 2587 } 2588 if (pte_young(pteval) || 2589 page_is_young(page) || PageReferenced(page) || 2590 mmu_notifier_test_young(vma->vm_mm, address)) 2591 referenced = true; 2592 } 2593 if (writable) { 2594 if (referenced) { 2595 result = SCAN_SUCCEED; 2596 ret = 1; 2597 } else { 2598 result = SCAN_NO_REFERENCED_PAGE; 2599 } 2600 } else { 2601 result = SCAN_PAGE_RO; 2602 } 2603 out_unmap: 2604 pte_unmap_unlock(pte, ptl); 2605 if (ret) { 2606 node = khugepaged_find_target_node(); 2607 /* collapse_huge_page will return with the mmap_sem released */ 2608 collapse_huge_page(mm, address, hpage, vma, node); 2609 } 2610 out: 2611 trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, 2612 none_or_zero, result); 2613 return ret; 2614 } 2615 2616 static void collect_mm_slot(struct mm_slot *mm_slot) 2617 { 2618 struct mm_struct *mm = mm_slot->mm; 2619 2620 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); 2621 2622 if (khugepaged_test_exit(mm)) { 2623 /* free mm_slot */ 2624 hash_del(&mm_slot->hash); 2625 list_del(&mm_slot->mm_node); 2626 2627 /* 2628 * Not strictly needed because the mm exited already. 2629 * 2630 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 2631 */ 2632 2633 /* khugepaged_mm_lock actually not necessary for the below */ 2634 free_mm_slot(mm_slot); 2635 mmdrop(mm); 2636 } 2637 } 2638 2639 static unsigned int khugepaged_scan_mm_slot(unsigned int pages, 2640 struct page **hpage) 2641 __releases(&khugepaged_mm_lock) 2642 __acquires(&khugepaged_mm_lock) 2643 { 2644 struct mm_slot *mm_slot; 2645 struct mm_struct *mm; 2646 struct vm_area_struct *vma; 2647 int progress = 0; 2648 2649 VM_BUG_ON(!pages); 2650 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); 2651 2652 if (khugepaged_scan.mm_slot) 2653 mm_slot = khugepaged_scan.mm_slot; 2654 else { 2655 mm_slot = list_entry(khugepaged_scan.mm_head.next, 2656 struct mm_slot, mm_node); 2657 khugepaged_scan.address = 0; 2658 khugepaged_scan.mm_slot = mm_slot; 2659 } 2660 spin_unlock(&khugepaged_mm_lock); 2661 2662 mm = mm_slot->mm; 2663 down_read(&mm->mmap_sem); 2664 if (unlikely(khugepaged_test_exit(mm))) 2665 vma = NULL; 2666 else 2667 vma = find_vma(mm, khugepaged_scan.address); 2668 2669 progress++; 2670 for (; vma; vma = vma->vm_next) { 2671 unsigned long hstart, hend; 2672 2673 cond_resched(); 2674 if (unlikely(khugepaged_test_exit(mm))) { 2675 progress++; 2676 break; 2677 } 2678 if (!hugepage_vma_check(vma)) { 2679 skip: 2680 progress++; 2681 continue; 2682 } 2683 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2684 hend = vma->vm_end & HPAGE_PMD_MASK; 2685 if (hstart >= hend) 2686 goto skip; 2687 if (khugepaged_scan.address > hend) 2688 goto skip; 2689 if (khugepaged_scan.address < hstart) 2690 khugepaged_scan.address = hstart; 2691 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); 2692 2693 while (khugepaged_scan.address < hend) { 2694 int ret; 2695 cond_resched(); 2696 if (unlikely(khugepaged_test_exit(mm))) 2697 goto breakouterloop; 2698 2699 VM_BUG_ON(khugepaged_scan.address < hstart || 2700 khugepaged_scan.address + HPAGE_PMD_SIZE > 2701 hend); 2702 ret = khugepaged_scan_pmd(mm, vma, 2703 khugepaged_scan.address, 2704 hpage); 2705 /* move to next address */ 2706 khugepaged_scan.address += HPAGE_PMD_SIZE; 2707 progress += HPAGE_PMD_NR; 2708 if (ret) 2709 /* we released mmap_sem so break loop */ 2710 goto breakouterloop_mmap_sem; 2711 if (progress >= pages) 2712 goto 
breakouterloop; 2713 } 2714 } 2715 breakouterloop: 2716 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ 2717 breakouterloop_mmap_sem: 2718 2719 spin_lock(&khugepaged_mm_lock); 2720 VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); 2721 /* 2722 * Release the current mm_slot if this mm is about to die, or 2723 * if we scanned all vmas of this mm. 2724 */ 2725 if (khugepaged_test_exit(mm) || !vma) { 2726 /* 2727 * Make sure that if mm_users is reaching zero while 2728 * khugepaged runs here, khugepaged_exit will find 2729 * mm_slot not pointing to the exiting mm. 2730 */ 2731 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { 2732 khugepaged_scan.mm_slot = list_entry( 2733 mm_slot->mm_node.next, 2734 struct mm_slot, mm_node); 2735 khugepaged_scan.address = 0; 2736 } else { 2737 khugepaged_scan.mm_slot = NULL; 2738 khugepaged_full_scans++; 2739 } 2740 2741 collect_mm_slot(mm_slot); 2742 } 2743 2744 return progress; 2745 } 2746 2747 static int khugepaged_has_work(void) 2748 { 2749 return !list_empty(&khugepaged_scan.mm_head) && 2750 khugepaged_enabled(); 2751 } 2752 2753 static int khugepaged_wait_event(void) 2754 { 2755 return !list_empty(&khugepaged_scan.mm_head) || 2756 kthread_should_stop(); 2757 } 2758 2759 static void khugepaged_do_scan(void) 2760 { 2761 struct page *hpage = NULL; 2762 unsigned int progress = 0, pass_through_head = 0; 2763 unsigned int pages = khugepaged_pages_to_scan; 2764 bool wait = true; 2765 2766 barrier(); /* write khugepaged_pages_to_scan to local stack */ 2767 2768 while (progress < pages) { 2769 if (!khugepaged_prealloc_page(&hpage, &wait)) 2770 break; 2771 2772 cond_resched(); 2773 2774 if (unlikely(kthread_should_stop() || try_to_freeze())) 2775 break; 2776 2777 spin_lock(&khugepaged_mm_lock); 2778 if (!khugepaged_scan.mm_slot) 2779 pass_through_head++; 2780 if (khugepaged_has_work() && 2781 pass_through_head < 2) 2782 progress += khugepaged_scan_mm_slot(pages - progress, 2783 &hpage); 2784 else 2785 progress = pages; 2786 spin_unlock(&khugepaged_mm_lock); 2787 } 2788 2789 if (!IS_ERR_OR_NULL(hpage)) 2790 put_page(hpage); 2791 } 2792 2793 static bool khugepaged_should_wakeup(void) 2794 { 2795 return kthread_should_stop() || 2796 time_after_eq(jiffies, khugepaged_sleep_expire); 2797 } 2798 2799 static void khugepaged_wait_work(void) 2800 { 2801 if (khugepaged_has_work()) { 2802 const unsigned long scan_sleep_jiffies = 2803 msecs_to_jiffies(khugepaged_scan_sleep_millisecs); 2804 2805 if (!scan_sleep_jiffies) 2806 return; 2807 2808 khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; 2809 wait_event_freezable_timeout(khugepaged_wait, 2810 khugepaged_should_wakeup(), 2811 scan_sleep_jiffies); 2812 return; 2813 } 2814 2815 if (khugepaged_enabled()) 2816 wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); 2817 } 2818 2819 static int khugepaged(void *none) 2820 { 2821 struct mm_slot *mm_slot; 2822 2823 set_freezable(); 2824 set_user_nice(current, MAX_NICE); 2825 2826 while (!kthread_should_stop()) { 2827 khugepaged_do_scan(); 2828 khugepaged_wait_work(); 2829 } 2830 2831 spin_lock(&khugepaged_mm_lock); 2832 mm_slot = khugepaged_scan.mm_slot; 2833 khugepaged_scan.mm_slot = NULL; 2834 if (mm_slot) 2835 collect_mm_slot(mm_slot); 2836 spin_unlock(&khugepaged_mm_lock); 2837 return 0; 2838 } 2839 2840 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, 2841 unsigned long haddr, pmd_t *pmd) 2842 { 2843 struct mm_struct *mm = vma->vm_mm; 2844 pgtable_t pgtable; 2845 pmd_t _pmd; 2846 int i; 2847 2848 /* leave pmd empty 
until pte is filled */ 2849 pmdp_huge_clear_flush_notify(vma, haddr, pmd); 2850 2851 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2852 pmd_populate(mm, &_pmd, pgtable); 2853 2854 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 2855 pte_t *pte, entry; 2856 entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); 2857 entry = pte_mkspecial(entry); 2858 pte = pte_offset_map(&_pmd, haddr); 2859 VM_BUG_ON(!pte_none(*pte)); 2860 set_pte_at(mm, haddr, pte, entry); 2861 pte_unmap(pte); 2862 } 2863 smp_wmb(); /* make pte visible before pmd */ 2864 pmd_populate(mm, pmd, pgtable); 2865 put_huge_zero_page(); 2866 } 2867 2868 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, 2869 unsigned long haddr, bool freeze) 2870 { 2871 struct mm_struct *mm = vma->vm_mm; 2872 struct page *page; 2873 pgtable_t pgtable; 2874 pmd_t _pmd; 2875 bool young, write, dirty; 2876 unsigned long addr; 2877 int i; 2878 2879 VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); 2880 VM_BUG_ON_VMA(vma->vm_start > haddr, vma); 2881 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); 2882 VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)); 2883 2884 count_vm_event(THP_SPLIT_PMD); 2885 2886 if (vma_is_dax(vma)) { 2887 pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); 2888 if (is_huge_zero_pmd(_pmd)) 2889 put_huge_zero_page(); 2890 return; 2891 } else if (is_huge_zero_pmd(*pmd)) { 2892 return __split_huge_zero_page_pmd(vma, haddr, pmd); 2893 } 2894 2895 page = pmd_page(*pmd); 2896 VM_BUG_ON_PAGE(!page_count(page), page); 2897 page_ref_add(page, HPAGE_PMD_NR - 1); 2898 write = pmd_write(*pmd); 2899 young = pmd_young(*pmd); 2900 dirty = pmd_dirty(*pmd); 2901 2902 pmdp_huge_split_prepare(vma, haddr, pmd); 2903 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 2904 pmd_populate(mm, &_pmd, pgtable); 2905 2906 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { 2907 pte_t entry, *pte; 2908 /* 2909 * Note that NUMA hinting access restrictions are not 2910 * transferred to avoid any possibility of altering 2911 * permissions across VMAs. 2912 */ 2913 if (freeze) { 2914 swp_entry_t swp_entry; 2915 swp_entry = make_migration_entry(page + i, write); 2916 entry = swp_entry_to_pte(swp_entry); 2917 } else { 2918 entry = mk_pte(page + i, vma->vm_page_prot); 2919 entry = maybe_mkwrite(entry, vma); 2920 if (!write) 2921 entry = pte_wrprotect(entry); 2922 if (!young) 2923 entry = pte_mkold(entry); 2924 } 2925 if (dirty) 2926 SetPageDirty(page + i); 2927 pte = pte_offset_map(&_pmd, addr); 2928 BUG_ON(!pte_none(*pte)); 2929 set_pte_at(mm, addr, pte, entry); 2930 atomic_inc(&page[i]._mapcount); 2931 pte_unmap(pte); 2932 } 2933 2934 /* 2935 * Set PG_double_map before dropping compound_mapcount to avoid 2936 * false-negative page_mapped(). 2937 */ 2938 if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) { 2939 for (i = 0; i < HPAGE_PMD_NR; i++) 2940 atomic_inc(&page[i]._mapcount); 2941 } 2942 2943 if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { 2944 /* Last compound_mapcount is gone. */ 2945 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 2946 if (TestClearPageDoubleMap(page)) { 2947 /* No need in mapcount reference anymore */ 2948 for (i = 0; i < HPAGE_PMD_NR; i++) 2949 atomic_dec(&page[i]._mapcount); 2950 } 2951 } 2952 2953 smp_wmb(); /* make pte visible before pmd */ 2954 /* 2955 * Up to this point the pmd is present and huge and userland has the 2956 * whole access to the hugepage during the split (which happens in 2957 * place). 
If we overwrite the pmd with the not-huge version pointing 2958 * to the pte here (which of course we could if all CPUs were bug 2959 * free), userland could trigger a small page size TLB miss on the 2960 * small sized TLB while the hugepage TLB entry is still established in 2961 * the huge TLB. Some CPUs don't like that. 2962 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum 2963 * 383 on page 93. Intel should be safe but it also warns that it's 2964 * only safe if the permission and cache attributes of the two entries 2965 * loaded in the two TLBs are identical (which should be the case here). 2966 * But it is generally safer to never allow small and huge TLB entries 2967 * for the same virtual address to be loaded simultaneously. So instead 2968 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the 2969 * current pmd not present (atomically because here the pmd_trans_huge 2970 * and pmd_trans_splitting must remain set at all times on the pmd 2971 * until the split is complete for this pmd), then we flush the SMP TLB 2972 * and finally we write the non-huge version of the pmd entry with 2973 * pmd_populate. 2974 */ 2975 pmdp_invalidate(vma, haddr, pmd); 2976 pmd_populate(mm, pmd, pgtable); 2977 2978 if (freeze) { 2979 for (i = 0; i < HPAGE_PMD_NR; i++) { 2980 page_remove_rmap(page + i, false); 2981 put_page(page + i); 2982 } 2983 } 2984 } 2985 2986 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 2987 unsigned long address, bool freeze, struct page *page) 2988 { 2989 spinlock_t *ptl; 2990 struct mm_struct *mm = vma->vm_mm; 2991 unsigned long haddr = address & HPAGE_PMD_MASK; 2992 2993 mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); 2994 ptl = pmd_lock(mm, pmd); 2995 2996 /* 2997 * If the caller asks to set up migration entries, we need a page to 2998 * check the pmd against. Otherwise we can end up replacing the wrong page. 2999 */ 3000 VM_BUG_ON(freeze && !page); 3001 if (page && page != pmd_page(*pmd)) 3002 goto out; 3003 3004 if (pmd_trans_huge(*pmd)) { 3005 page = pmd_page(*pmd); 3006 if (PageMlocked(page)) 3007 clear_page_mlock(page); 3008 } else if (!pmd_devmap(*pmd)) 3009 goto out; 3010 __split_huge_pmd_locked(vma, pmd, haddr, freeze); 3011 out: 3012 spin_unlock(ptl); 3013 mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); 3014 } 3015 3016 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, 3017 bool freeze, struct page *page) 3018 { 3019 pgd_t *pgd; 3020 pud_t *pud; 3021 pmd_t *pmd; 3022 3023 pgd = pgd_offset(vma->vm_mm, address); 3024 if (!pgd_present(*pgd)) 3025 return; 3026 3027 pud = pud_offset(pgd, address); 3028 if (!pud_present(*pud)) 3029 return; 3030 3031 pmd = pmd_offset(pud, address); 3032 3033 __split_huge_pmd(vma, pmd, address, freeze, page); 3034 } 3035 3036 void vma_adjust_trans_huge(struct vm_area_struct *vma, 3037 unsigned long start, 3038 unsigned long end, 3039 long adjust_next) 3040 { 3041 /* 3042 * If the new start address isn't hpage aligned and it could 3043 * previously contain an hugepage: check if we need to split 3044 * an huge pmd. 3045 */ 3046 if (start & ~HPAGE_PMD_MASK && 3047 (start & HPAGE_PMD_MASK) >= vma->vm_start && 3048 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 3049 split_huge_pmd_address(vma, start, false, NULL); 3050 3051 /* 3052 * If the new end address isn't hpage aligned and it could 3053 * previously contain an hugepage: check if we need to split 3054 * an huge pmd.
3055 */ 3056 if (end & ~HPAGE_PMD_MASK && 3057 (end & HPAGE_PMD_MASK) >= vma->vm_start && 3058 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) 3059 split_huge_pmd_address(vma, end, false, NULL); 3060 3061 /* 3062 * If we're also updating the vma->vm_next->vm_start, if the new 3063 * vm_next->vm_start isn't page aligned and it could previously 3064 * contain an hugepage: check if we need to split an huge pmd. 3065 */ 3066 if (adjust_next > 0) { 3067 struct vm_area_struct *next = vma->vm_next; 3068 unsigned long nstart = next->vm_start; 3069 nstart += adjust_next << PAGE_SHIFT; 3070 if (nstart & ~HPAGE_PMD_MASK && 3071 (nstart & HPAGE_PMD_MASK) >= next->vm_start && 3072 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) 3073 split_huge_pmd_address(next, nstart, false, NULL); 3074 } 3075 } 3076 3077 static void freeze_page(struct page *page) 3078 { 3079 enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | 3080 TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED; 3081 int i, ret; 3082 3083 VM_BUG_ON_PAGE(!PageHead(page), page); 3084 3085 /* We only need TTU_SPLIT_HUGE_PMD once */ 3086 ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD); 3087 for (i = 1; !ret && i < HPAGE_PMD_NR; i++) { 3088 /* Cut short if the page is unmapped */ 3089 if (page_count(page) == 1) 3090 return; 3091 3092 ret = try_to_unmap(page + i, ttu_flags); 3093 } 3094 VM_BUG_ON(ret); 3095 } 3096 3097 static void unfreeze_page(struct page *page) 3098 { 3099 int i; 3100 3101 for (i = 0; i < HPAGE_PMD_NR; i++) 3102 remove_migration_ptes(page + i, page + i, true); 3103 } 3104 3105 static void __split_huge_page_tail(struct page *head, int tail, 3106 struct lruvec *lruvec, struct list_head *list) 3107 { 3108 struct page *page_tail = head + tail; 3109 3110 VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); 3111 VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); 3112 3113 /* 3114 * tail_page->_refcount is zero and not changing from under us. But 3115 * get_page_unless_zero() may be running from under us on the 3116 * tail_page. If we used atomic_set() below instead of atomic_inc(), we 3117 * would then run atomic_set() concurrently with 3118 * get_page_unless_zero(), and atomic_set() is implemented in C not 3119 * using locked ops. spin_unlock on x86 sometime uses locked ops 3120 * because of PPro errata 66, 92, so unless somebody can guarantee 3121 * atomic_set() here would be safe on all archs (and not only on x86), 3122 * it's safer to use atomic_inc(). 3123 */ 3124 page_ref_inc(page_tail); 3125 3126 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 3127 page_tail->flags |= (head->flags & 3128 ((1L << PG_referenced) | 3129 (1L << PG_swapbacked) | 3130 (1L << PG_mlocked) | 3131 (1L << PG_uptodate) | 3132 (1L << PG_active) | 3133 (1L << PG_locked) | 3134 (1L << PG_unevictable) | 3135 (1L << PG_dirty))); 3136 3137 /* 3138 * After clearing PageTail the gup refcount can be released. 3139 * Page flags also must be visible before we make the page non-compound. 
3140 */ 3141 smp_wmb(); 3142 3143 clear_compound_head(page_tail); 3144 3145 if (page_is_young(head)) 3146 set_page_young(page_tail); 3147 if (page_is_idle(head)) 3148 set_page_idle(page_tail); 3149 3150 /* ->mapping in first tail page is compound_mapcount */ 3151 VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, 3152 page_tail); 3153 page_tail->mapping = head->mapping; 3154 3155 page_tail->index = head->index + tail; 3156 page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); 3157 lru_add_page_tail(head, page_tail, lruvec, list); 3158 } 3159 3160 static void __split_huge_page(struct page *page, struct list_head *list) 3161 { 3162 struct page *head = compound_head(page); 3163 struct zone *zone = page_zone(head); 3164 struct lruvec *lruvec; 3165 int i; 3166 3167 /* prevent PageLRU to go away from under us, and freeze lru stats */ 3168 spin_lock_irq(&zone->lru_lock); 3169 lruvec = mem_cgroup_page_lruvec(head, zone); 3170 3171 /* complete memcg works before add pages to LRU */ 3172 mem_cgroup_split_huge_fixup(head); 3173 3174 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) 3175 __split_huge_page_tail(head, i, lruvec, list); 3176 3177 ClearPageCompound(head); 3178 spin_unlock_irq(&zone->lru_lock); 3179 3180 unfreeze_page(head); 3181 3182 for (i = 0; i < HPAGE_PMD_NR; i++) { 3183 struct page *subpage = head + i; 3184 if (subpage == page) 3185 continue; 3186 unlock_page(subpage); 3187 3188 /* 3189 * Subpages may be freed if there wasn't any mapping 3190 * like if add_to_swap() is running on a lru page that 3191 * had its mapping zapped. And freeing these pages 3192 * requires taking the lru_lock so we do the put_page 3193 * of the tail pages after the split is complete. 3194 */ 3195 put_page(subpage); 3196 } 3197 } 3198 3199 int total_mapcount(struct page *page) 3200 { 3201 int i, ret; 3202 3203 VM_BUG_ON_PAGE(PageTail(page), page); 3204 3205 if (likely(!PageCompound(page))) 3206 return atomic_read(&page->_mapcount) + 1; 3207 3208 ret = compound_mapcount(page); 3209 if (PageHuge(page)) 3210 return ret; 3211 for (i = 0; i < HPAGE_PMD_NR; i++) 3212 ret += atomic_read(&page[i]._mapcount) + 1; 3213 if (PageDoubleMap(page)) 3214 ret -= HPAGE_PMD_NR; 3215 return ret; 3216 } 3217 3218 /* 3219 * This calculates accurately how many mappings a transparent hugepage 3220 * has (unlike page_mapcount() which isn't fully accurate). This full 3221 * accuracy is primarily needed to know if copy-on-write faults can 3222 * reuse the page and change the mapping to read-write instead of 3223 * copying them. At the same time this returns the total_mapcount too. 3224 * 3225 * The function returns the highest mapcount any one of the subpages 3226 * has. If the return value is one, even if different processes are 3227 * mapping different subpages of the transparent hugepage, they can 3228 * all reuse it, because each process is reusing a different subpage. 3229 * 3230 * The total_mapcount is instead counting all virtual mappings of the 3231 * subpages. If the total_mapcount is equal to "one", it tells the 3232 * caller all mappings belong to the same "mm" and in turn the 3233 * anon_vma of the transparent hugepage can become the vma->anon_vma 3234 * local one as no other process may be mapping any of the subpages. 
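 * For example: a THP that is only pmd-mapped by two processes has both the
 * returned mapcount and the total_mapcount equal to 2, while a THP that a
 * single process maps with ptes in every subpage returns 1 here with a
 * total_mapcount of HPAGE_PMD_NR.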
3235 * 3236 * It would be more accurate to replace page_mapcount() with 3237 * page_trans_huge_mapcount(), however we only use 3238 * page_trans_huge_mapcount() in the copy-on-write faults where we 3239 * need full accuracy to avoid breaking page pinning, because 3240 * page_trans_huge_mapcount() is slower than page_mapcount(). 3241 */ 3242 int page_trans_huge_mapcount(struct page *page, int *total_mapcount) 3243 { 3244 int i, ret, _total_mapcount, mapcount; 3245 3246 /* hugetlbfs shouldn't call it */ 3247 VM_BUG_ON_PAGE(PageHuge(page), page); 3248 3249 if (likely(!PageTransCompound(page))) { 3250 mapcount = atomic_read(&page->_mapcount) + 1; 3251 if (total_mapcount) 3252 *total_mapcount = mapcount; 3253 return mapcount; 3254 } 3255 3256 page = compound_head(page); 3257 3258 _total_mapcount = ret = 0; 3259 for (i = 0; i < HPAGE_PMD_NR; i++) { 3260 mapcount = atomic_read(&page[i]._mapcount) + 1; 3261 ret = max(ret, mapcount); 3262 _total_mapcount += mapcount; 3263 } 3264 if (PageDoubleMap(page)) { 3265 ret -= 1; 3266 _total_mapcount -= HPAGE_PMD_NR; 3267 } 3268 mapcount = compound_mapcount(page); 3269 ret += mapcount; 3270 _total_mapcount += mapcount; 3271 if (total_mapcount) 3272 *total_mapcount = _total_mapcount; 3273 return ret; 3274 } 3275 3276 /* 3277 * This function splits huge page into normal pages. @page can point to any 3278 * subpage of huge page to split. Split doesn't change the position of @page. 3279 * 3280 * Only caller must hold pin on the @page, otherwise split fails with -EBUSY. 3281 * The huge page must be locked. 3282 * 3283 * If @list is null, tail pages will be added to LRU list, otherwise, to @list. 3284 * 3285 * Both head page and tail pages will inherit mapping, flags, and so on from 3286 * the hugepage. 3287 * 3288 * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if 3289 * they are not mapped. 3290 * 3291 * Returns 0 if the hugepage is split successfully. 3292 * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under 3293 * us. 3294 */ 3295 int split_huge_page_to_list(struct page *page, struct list_head *list) 3296 { 3297 struct page *head = compound_head(page); 3298 struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); 3299 struct anon_vma *anon_vma; 3300 int count, mapcount, ret; 3301 bool mlocked; 3302 unsigned long flags; 3303 3304 VM_BUG_ON_PAGE(is_huge_zero_page(page), page); 3305 VM_BUG_ON_PAGE(!PageAnon(page), page); 3306 VM_BUG_ON_PAGE(!PageLocked(page), page); 3307 VM_BUG_ON_PAGE(!PageSwapBacked(page), page); 3308 VM_BUG_ON_PAGE(!PageCompound(page), page); 3309 3310 /* 3311 * The caller does not necessarily hold an mmap_sem that would prevent 3312 * the anon_vma disappearing so we first we take a reference to it 3313 * and then lock the anon_vma for write. This is similar to 3314 * page_lock_anon_vma_read except the write lock is taken to serialise 3315 * against parallel split or collapse operations. 
3316 */ 3317 anon_vma = page_get_anon_vma(head); 3318 if (!anon_vma) { 3319 ret = -EBUSY; 3320 goto out; 3321 } 3322 anon_vma_lock_write(anon_vma); 3323 3324 /* 3325 * Racy check if we can split the page, before freeze_page() will 3326 * split PMDs 3327 */ 3328 if (total_mapcount(head) != page_count(head) - 1) { 3329 ret = -EBUSY; 3330 goto out_unlock; 3331 } 3332 3333 mlocked = PageMlocked(page); 3334 freeze_page(head); 3335 VM_BUG_ON_PAGE(compound_mapcount(head), head); 3336 3337 /* Make sure the page is not on per-CPU pagevec as it takes pin */ 3338 if (mlocked) 3339 lru_add_drain(); 3340 3341 /* Prevent deferred_split_scan() touching ->_refcount */ 3342 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 3343 count = page_count(head); 3344 mapcount = total_mapcount(head); 3345 if (!mapcount && count == 1) { 3346 if (!list_empty(page_deferred_list(head))) { 3347 pgdata->split_queue_len--; 3348 list_del(page_deferred_list(head)); 3349 } 3350 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3351 __split_huge_page(page, list); 3352 ret = 0; 3353 } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { 3354 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3355 pr_alert("total_mapcount: %u, page_count(): %u\n", 3356 mapcount, count); 3357 if (PageTail(page)) 3358 dump_page(head, NULL); 3359 dump_page(page, "total_mapcount(head) > 0"); 3360 BUG(); 3361 } else { 3362 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3363 unfreeze_page(head); 3364 ret = -EBUSY; 3365 } 3366 3367 out_unlock: 3368 anon_vma_unlock_write(anon_vma); 3369 put_anon_vma(anon_vma); 3370 out: 3371 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); 3372 return ret; 3373 } 3374 3375 void free_transhuge_page(struct page *page) 3376 { 3377 struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); 3378 unsigned long flags; 3379 3380 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 3381 if (!list_empty(page_deferred_list(page))) { 3382 pgdata->split_queue_len--; 3383 list_del(page_deferred_list(page)); 3384 } 3385 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3386 free_compound_page(page); 3387 } 3388 3389 void deferred_split_huge_page(struct page *page) 3390 { 3391 struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); 3392 unsigned long flags; 3393 3394 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 3395 3396 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 3397 if (list_empty(page_deferred_list(page))) { 3398 count_vm_event(THP_DEFERRED_SPLIT_PAGE); 3399 list_add_tail(page_deferred_list(page), &pgdata->split_queue); 3400 pgdata->split_queue_len++; 3401 } 3402 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3403 } 3404 3405 static unsigned long deferred_split_count(struct shrinker *shrink, 3406 struct shrink_control *sc) 3407 { 3408 struct pglist_data *pgdata = NODE_DATA(sc->nid); 3409 return ACCESS_ONCE(pgdata->split_queue_len); 3410 } 3411 3412 static unsigned long deferred_split_scan(struct shrinker *shrink, 3413 struct shrink_control *sc) 3414 { 3415 struct pglist_data *pgdata = NODE_DATA(sc->nid); 3416 unsigned long flags; 3417 LIST_HEAD(list), *pos, *next; 3418 struct page *page; 3419 int split = 0; 3420 3421 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 3422 /* Take pin on all head pages to avoid freeing them under us */ 3423 list_for_each_safe(pos, next, &pgdata->split_queue) { 3424 page = list_entry((void *)pos, struct page, mapping); 3425 page = compound_head(page); 3426 if (get_page_unless_zero(page)) { 3427 
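/* We hold a pin now; the page cannot be freed while it sits on the local list */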
list_move(page_deferred_list(page), &list); 3428 } else { 3429 /* We lost race with put_compound_page() */ 3430 list_del_init(page_deferred_list(page)); 3431 pgdata->split_queue_len--; 3432 } 3433 if (!--sc->nr_to_scan) 3434 break; 3435 } 3436 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3437 3438 list_for_each_safe(pos, next, &list) { 3439 page = list_entry((void *)pos, struct page, mapping); 3440 lock_page(page); 3441 /* split_huge_page() removes page from list on success */ 3442 if (!split_huge_page(page)) 3443 split++; 3444 unlock_page(page); 3445 put_page(page); 3446 } 3447 3448 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 3449 list_splice_tail(&list, &pgdata->split_queue); 3450 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 3451 3452 /* 3453 * Stop shrinker if we didn't split any page, but the queue is empty. 3454 * This can happen if pages were freed under us. 3455 */ 3456 if (!split && list_empty(&pgdata->split_queue)) 3457 return SHRINK_STOP; 3458 return split; 3459 } 3460 3461 static struct shrinker deferred_split_shrinker = { 3462 .count_objects = deferred_split_count, 3463 .scan_objects = deferred_split_scan, 3464 .seeks = DEFAULT_SEEKS, 3465 .flags = SHRINKER_NUMA_AWARE, 3466 }; 3467 3468 #ifdef CONFIG_DEBUG_FS 3469 static int split_huge_pages_set(void *data, u64 val) 3470 { 3471 struct zone *zone; 3472 struct page *page; 3473 unsigned long pfn, max_zone_pfn; 3474 unsigned long total = 0, split = 0; 3475 3476 if (val != 1) 3477 return -EINVAL; 3478 3479 for_each_populated_zone(zone) { 3480 max_zone_pfn = zone_end_pfn(zone); 3481 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { 3482 if (!pfn_valid(pfn)) 3483 continue; 3484 3485 page = pfn_to_page(pfn); 3486 if (!get_page_unless_zero(page)) 3487 continue; 3488 3489 if (zone != page_zone(page)) 3490 goto next; 3491 3492 if (!PageHead(page) || !PageAnon(page) || 3493 PageHuge(page)) 3494 goto next; 3495 3496 total++; 3497 lock_page(page); 3498 if (!split_huge_page(page)) 3499 split++; 3500 unlock_page(page); 3501 next: 3502 put_page(page); 3503 } 3504 } 3505 3506 pr_info("%lu of %lu THP split\n", split, total); 3507 3508 return 0; 3509 } 3510 DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, 3511 "%llu\n"); 3512 3513 static int __init split_huge_pages_debugfs(void) 3514 { 3515 void *ret; 3516 3517 ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL, 3518 &split_huge_pages_fops); 3519 if (!ret) 3520 pr_warn("Failed to create split_huge_pages in debugfs"); 3521 return 0; 3522 } 3523 late_initcall(split_huge_pages_debugfs); 3524 #endif 3525
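/*
 * Example usage of the split_huge_pages debugfs knob created above, assuming
 * debugfs is mounted at /sys/kernel/debug (the mount point may differ):
 *
 *	echo 1 > /sys/kernel/debug/split_huge_pages
 *
 * This walks every populated zone and tries to split each anonymous THP it
 * finds; the number of pages split is reported via pr_info() in the kernel
 * log. Writing any value other than 1 returns -EINVAL.
 */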