xref: /openbmc/linux/mm/huge_memory.c (revision bc5aa3a0)
1 /*
2  *  Copyright (C) 2009  Red Hat, Inc.
3  *
4  *  This work is licensed under the terms of the GNU GPL, version 2. See
5  *  the COPYING file in the top-level directory.
6  */
7 
8 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9 
10 #include <linux/mm.h>
11 #include <linux/sched.h>
12 #include <linux/highmem.h>
13 #include <linux/hugetlb.h>
14 #include <linux/mmu_notifier.h>
15 #include <linux/rmap.h>
16 #include <linux/swap.h>
17 #include <linux/shrinker.h>
18 #include <linux/mm_inline.h>
19 #include <linux/swapops.h>
20 #include <linux/dax.h>
21 #include <linux/khugepaged.h>
22 #include <linux/freezer.h>
23 #include <linux/pfn_t.h>
24 #include <linux/mman.h>
25 #include <linux/memremap.h>
26 #include <linux/pagemap.h>
27 #include <linux/debugfs.h>
28 #include <linux/migrate.h>
29 #include <linux/hashtable.h>
30 #include <linux/userfaultfd_k.h>
31 #include <linux/page_idle.h>
32 #include <linux/shmem_fs.h>
33 
34 #include <asm/tlb.h>
35 #include <asm/pgalloc.h>
36 #include "internal.h"
37 
38 /*
39  * By default, transparent hugepage support is disabled in order to avoid
40  * risking an increase in the memory footprint of applications without a
41  * guaranteed benefit. When transparent hugepage support is enabled, it is
42  * enabled for all mappings, and khugepaged scans all mappings.
43  * Defrag is invoked by khugepaged hugepage allocations and by page faults
44  * for all hugepage allocations.
45  */
46 unsigned long transparent_hugepage_flags __read_mostly =
47 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
48 	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
49 #endif
50 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
51 	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
52 #endif
53 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
54 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
55 	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
56 
57 static struct shrinker deferred_split_shrinker;
58 
59 static atomic_t huge_zero_refcount;
60 struct page *huge_zero_page __read_mostly;
61 
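/*
 * Return the huge zero page with a reference held, allocating it on first
 * use. The allocation path takes one extra reference that is only dropped
 * by the shrinker. Returns NULL if the allocation fails.
 */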
62 struct page *get_huge_zero_page(void)
63 {
64 	struct page *zero_page;
65 retry:
66 	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
67 		return READ_ONCE(huge_zero_page);
68 
69 	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
70 			HPAGE_PMD_ORDER);
71 	if (!zero_page) {
72 		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
73 		return NULL;
74 	}
75 	count_vm_event(THP_ZERO_PAGE_ALLOC);
76 	preempt_disable();
77 	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
78 		preempt_enable();
79 		__free_pages(zero_page, compound_order(zero_page));
80 		goto retry;
81 	}
82 
83 	/* We take an additional reference here. It will be put back by the shrinker */
84 	atomic_set(&huge_zero_refcount, 2);
85 	preempt_enable();
86 	return READ_ONCE(huge_zero_page);
87 }
88 
89 void put_huge_zero_page(void)
90 {
91 	/*
92 	 * The counter should never go to zero here. Only the shrinker can put
93 	 * the last reference.
94 	 */
95 	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
96 }
97 
98 static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
99 					struct shrink_control *sc)
100 {
101 	/* we can free the zero page only if the last reference remains */
102 	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
103 }
104 
105 static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
106 				       struct shrink_control *sc)
107 {
108 	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
109 		struct page *zero_page = xchg(&huge_zero_page, NULL);
110 		BUG_ON(zero_page == NULL);
111 		__free_pages(zero_page, compound_order(zero_page));
112 		return HPAGE_PMD_NR;
113 	}
114 
115 	return 0;
116 }
117 
118 static struct shrinker huge_zero_page_shrinker = {
119 	.count_objects = shrink_huge_zero_page_count,
120 	.scan_objects = shrink_huge_zero_page_scan,
121 	.seeks = DEFAULT_SEEKS,
122 };
123 
124 #ifdef CONFIG_SYSFS
125 
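/*
 * Parse an "always"/"defer"/"madvise"/"never" value written to a THP sysfs
 * file and update transparent_hugepage_flags accordingly. Returns the byte
 * count on success or -EINVAL for an unrecognized value.
 */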
126 static ssize_t triple_flag_store(struct kobject *kobj,
127 				 struct kobj_attribute *attr,
128 				 const char *buf, size_t count,
129 				 enum transparent_hugepage_flag enabled,
130 				 enum transparent_hugepage_flag deferred,
131 				 enum transparent_hugepage_flag req_madv)
132 {
133 	if (!memcmp("defer", buf,
134 		    min(sizeof("defer")-1, count))) {
135 		if (enabled == deferred)
136 			return -EINVAL;
137 		clear_bit(enabled, &transparent_hugepage_flags);
138 		clear_bit(req_madv, &transparent_hugepage_flags);
139 		set_bit(deferred, &transparent_hugepage_flags);
140 	} else if (!memcmp("always", buf,
141 		    min(sizeof("always")-1, count))) {
142 		clear_bit(deferred, &transparent_hugepage_flags);
143 		clear_bit(req_madv, &transparent_hugepage_flags);
144 		set_bit(enabled, &transparent_hugepage_flags);
145 	} else if (!memcmp("madvise", buf,
146 			   min(sizeof("madvise")-1, count))) {
147 		clear_bit(enabled, &transparent_hugepage_flags);
148 		clear_bit(deferred, &transparent_hugepage_flags);
149 		set_bit(req_madv, &transparent_hugepage_flags);
150 	} else if (!memcmp("never", buf,
151 			   min(sizeof("never")-1, count))) {
152 		clear_bit(enabled, &transparent_hugepage_flags);
153 		clear_bit(req_madv, &transparent_hugepage_flags);
154 		clear_bit(deferred, &transparent_hugepage_flags);
155 	} else
156 		return -EINVAL;
157 
158 	return count;
159 }
160 
161 static ssize_t enabled_show(struct kobject *kobj,
162 			    struct kobj_attribute *attr, char *buf)
163 {
164 	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
165 		return sprintf(buf, "[always] madvise never\n");
166 	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
167 		return sprintf(buf, "always [madvise] never\n");
168 	else
169 		return sprintf(buf, "always madvise [never]\n");
170 }
171 
172 static ssize_t enabled_store(struct kobject *kobj,
173 			     struct kobj_attribute *attr,
174 			     const char *buf, size_t count)
175 {
176 	ssize_t ret;
177 
178 	ret = triple_flag_store(kobj, attr, buf, count,
179 				TRANSPARENT_HUGEPAGE_FLAG,
180 				TRANSPARENT_HUGEPAGE_FLAG,
181 				TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
182 
183 	if (ret > 0) {
184 		int err = start_stop_khugepaged();
185 		if (err)
186 			ret = err;
187 	}
188 
189 	return ret;
190 }
191 static struct kobj_attribute enabled_attr =
192 	__ATTR(enabled, 0644, enabled_show, enabled_store);
193 
194 ssize_t single_hugepage_flag_show(struct kobject *kobj,
195 				struct kobj_attribute *attr, char *buf,
196 				enum transparent_hugepage_flag flag)
197 {
198 	return sprintf(buf, "%d\n",
199 		       !!test_bit(flag, &transparent_hugepage_flags));
200 }
201 
202 ssize_t single_hugepage_flag_store(struct kobject *kobj,
203 				 struct kobj_attribute *attr,
204 				 const char *buf, size_t count,
205 				 enum transparent_hugepage_flag flag)
206 {
207 	unsigned long value;
208 	int ret;
209 
210 	ret = kstrtoul(buf, 10, &value);
211 	if (ret < 0)
212 		return ret;
213 	if (value > 1)
214 		return -EINVAL;
215 
216 	if (value)
217 		set_bit(flag, &transparent_hugepage_flags);
218 	else
219 		clear_bit(flag, &transparent_hugepage_flags);
220 
221 	return count;
222 }
223 
224 /*
225  * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
226  * __GFP_REPEAT is too aggressive; it's never worth swapping tons of
227  * memory just to allocate one more hugepage.
228  */
229 static ssize_t defrag_show(struct kobject *kobj,
230 			   struct kobj_attribute *attr, char *buf)
231 {
232 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
233 		return sprintf(buf, "[always] defer madvise never\n");
234 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
235 		return sprintf(buf, "always [defer] madvise never\n");
236 	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
237 		return sprintf(buf, "always defer [madvise] never\n");
238 	else
239 		return sprintf(buf, "always defer madvise [never]\n");
240 
241 }
242 static ssize_t defrag_store(struct kobject *kobj,
243 			    struct kobj_attribute *attr,
244 			    const char *buf, size_t count)
245 {
246 	return triple_flag_store(kobj, attr, buf, count,
247 				 TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
248 				 TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
249 				 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
250 }
251 static struct kobj_attribute defrag_attr =
252 	__ATTR(defrag, 0644, defrag_show, defrag_store);
253 
254 static ssize_t use_zero_page_show(struct kobject *kobj,
255 		struct kobj_attribute *attr, char *buf)
256 {
257 	return single_hugepage_flag_show(kobj, attr, buf,
258 				TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
259 }
260 static ssize_t use_zero_page_store(struct kobject *kobj,
261 		struct kobj_attribute *attr, const char *buf, size_t count)
262 {
263 	return single_hugepage_flag_store(kobj, attr, buf, count,
264 				 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
265 }
266 static struct kobj_attribute use_zero_page_attr =
267 	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
268 #ifdef CONFIG_DEBUG_VM
269 static ssize_t debug_cow_show(struct kobject *kobj,
270 				struct kobj_attribute *attr, char *buf)
271 {
272 	return single_hugepage_flag_show(kobj, attr, buf,
273 				TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
274 }
275 static ssize_t debug_cow_store(struct kobject *kobj,
276 			       struct kobj_attribute *attr,
277 			       const char *buf, size_t count)
278 {
279 	return single_hugepage_flag_store(kobj, attr, buf, count,
280 				 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
281 }
282 static struct kobj_attribute debug_cow_attr =
283 	__ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
284 #endif /* CONFIG_DEBUG_VM */
285 
286 static struct attribute *hugepage_attr[] = {
287 	&enabled_attr.attr,
288 	&defrag_attr.attr,
289 	&use_zero_page_attr.attr,
290 #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
291 	&shmem_enabled_attr.attr,
292 #endif
293 #ifdef CONFIG_DEBUG_VM
294 	&debug_cow_attr.attr,
295 #endif
296 	NULL,
297 };
298 
299 static struct attribute_group hugepage_attr_group = {
300 	.attrs = hugepage_attr,
301 };
302 
303 static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
304 {
305 	int err;
306 
307 	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
308 	if (unlikely(!*hugepage_kobj)) {
309 		pr_err("failed to create transparent hugepage kobject\n");
310 		return -ENOMEM;
311 	}
312 
313 	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
314 	if (err) {
315 		pr_err("failed to register transparent hugepage group\n");
316 		goto delete_obj;
317 	}
318 
319 	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
320 	if (err) {
321 		pr_err("failed to register transparent hugepage group\n");
322 		goto remove_hp_group;
323 	}
324 
325 	return 0;
326 
327 remove_hp_group:
328 	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
329 delete_obj:
330 	kobject_put(*hugepage_kobj);
331 	return err;
332 }
333 
334 static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
335 {
336 	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
337 	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
338 	kobject_put(hugepage_kobj);
339 }
340 #else
341 static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
342 {
343 	return 0;
344 }
345 
346 static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
347 {
348 }
349 #endif /* CONFIG_SYSFS */
350 
351 static int __init hugepage_init(void)
352 {
353 	int err;
354 	struct kobject *hugepage_kobj;
355 
356 	if (!has_transparent_hugepage()) {
357 		transparent_hugepage_flags = 0;
358 		return -EINVAL;
359 	}
360 
361 	/*
362 	 * hugepages can't be allocated by the buddy allocator
363 	 */
364 	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
365 	/*
366 	 * we use page->mapping and page->index in the second tail page
367 	 * as a list_head: assuming THP order >= 2
368 	 */
369 	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
370 
371 	err = hugepage_init_sysfs(&hugepage_kobj);
372 	if (err)
373 		goto err_sysfs;
374 
375 	err = khugepaged_init();
376 	if (err)
377 		goto err_slab;
378 
379 	err = register_shrinker(&huge_zero_page_shrinker);
380 	if (err)
381 		goto err_hzp_shrinker;
382 	err = register_shrinker(&deferred_split_shrinker);
383 	if (err)
384 		goto err_split_shrinker;
385 
386 	/*
387 	 * By default disable transparent hugepages on smaller systems,
388 	 * where the extra memory used could hurt more than TLB overhead
389 	 * is likely to save.  The admin can still enable it through /sys.
390 	 */
391 	if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
392 		transparent_hugepage_flags = 0;
393 		return 0;
394 	}
395 
396 	err = start_stop_khugepaged();
397 	if (err)
398 		goto err_khugepaged;
399 
400 	return 0;
401 err_khugepaged:
402 	unregister_shrinker(&deferred_split_shrinker);
403 err_split_shrinker:
404 	unregister_shrinker(&huge_zero_page_shrinker);
405 err_hzp_shrinker:
406 	khugepaged_destroy();
407 err_slab:
408 	hugepage_exit_sysfs(hugepage_kobj);
409 err_sysfs:
410 	return err;
411 }
412 subsys_initcall(hugepage_init);
413 
414 static int __init setup_transparent_hugepage(char *str)
415 {
416 	int ret = 0;
417 	if (!str)
418 		goto out;
419 	if (!strcmp(str, "always")) {
420 		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
421 			&transparent_hugepage_flags);
422 		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
423 			  &transparent_hugepage_flags);
424 		ret = 1;
425 	} else if (!strcmp(str, "madvise")) {
426 		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
427 			  &transparent_hugepage_flags);
428 		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
429 			&transparent_hugepage_flags);
430 		ret = 1;
431 	} else if (!strcmp(str, "never")) {
432 		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
433 			  &transparent_hugepage_flags);
434 		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
435 			  &transparent_hugepage_flags);
436 		ret = 1;
437 	}
438 out:
439 	if (!ret)
440 		pr_warn("transparent_hugepage= cannot parse, ignored\n");
441 	return ret;
442 }
443 __setup("transparent_hugepage=", setup_transparent_hugepage);
444 
445 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
446 {
447 	if (likely(vma->vm_flags & VM_WRITE))
448 		pmd = pmd_mkwrite(pmd);
449 	return pmd;
450 }
451 
452 static inline struct list_head *page_deferred_list(struct page *page)
453 {
454 	/*
455 	 * ->lru in the tail pages is occupied by compound_head.
456 	 * Let's use ->mapping + ->index in the second tail page as list_head.
457 	 */
458 	return (struct list_head *)&page[2].mapping;
459 }
460 
461 void prep_transhuge_page(struct page *page)
462 {
463 	/*
464 	 * we use page->mapping and page->index in the second tail page
465 	 * as a list_head: assuming THP order >= 2
466 	 */
467 
468 	INIT_LIST_HEAD(page_deferred_list(page));
469 	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
470 }
471 
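/*
 * Charge a freshly allocated huge page to the memcg, clear it and map it
 * at fe->pmd. Returns VM_FAULT_FALLBACK if the charge fails, VM_FAULT_OOM
 * if no page table can be allocated for the deposit, and hands the fault
 * to userland via handle_userfault() for VM_UFFD_MISSING ranges.
 */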
472 static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
473 		gfp_t gfp)
474 {
475 	struct vm_area_struct *vma = fe->vma;
476 	struct mem_cgroup *memcg;
477 	pgtable_t pgtable;
478 	unsigned long haddr = fe->address & HPAGE_PMD_MASK;
479 
480 	VM_BUG_ON_PAGE(!PageCompound(page), page);
481 
482 	if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
483 		put_page(page);
484 		count_vm_event(THP_FAULT_FALLBACK);
485 		return VM_FAULT_FALLBACK;
486 	}
487 
488 	pgtable = pte_alloc_one(vma->vm_mm, haddr);
489 	if (unlikely(!pgtable)) {
490 		mem_cgroup_cancel_charge(page, memcg, true);
491 		put_page(page);
492 		return VM_FAULT_OOM;
493 	}
494 
495 	clear_huge_page(page, haddr, HPAGE_PMD_NR);
496 	/*
497 	 * The memory barrier inside __SetPageUptodate makes sure that
498 	 * clear_huge_page writes become visible before the set_pmd_at()
499 	 * write.
500 	 */
501 	__SetPageUptodate(page);
502 
503 	fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
504 	if (unlikely(!pmd_none(*fe->pmd))) {
505 		spin_unlock(fe->ptl);
506 		mem_cgroup_cancel_charge(page, memcg, true);
507 		put_page(page);
508 		pte_free(vma->vm_mm, pgtable);
509 	} else {
510 		pmd_t entry;
511 
512 		/* Deliver the page fault to userland */
513 		if (userfaultfd_missing(vma)) {
514 			int ret;
515 
516 			spin_unlock(fe->ptl);
517 			mem_cgroup_cancel_charge(page, memcg, true);
518 			put_page(page);
519 			pte_free(vma->vm_mm, pgtable);
520 			ret = handle_userfault(fe, VM_UFFD_MISSING);
521 			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
522 			return ret;
523 		}
524 
525 		entry = mk_huge_pmd(page, vma->vm_page_prot);
526 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
527 		page_add_new_anon_rmap(page, vma, haddr, true);
528 		mem_cgroup_commit_charge(page, memcg, false, true);
529 		lru_cache_add_active_or_unevictable(page, vma);
530 		pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable);
531 		set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
532 		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
533 		atomic_long_inc(&vma->vm_mm->nr_ptes);
534 		spin_unlock(fe->ptl);
535 		count_vm_event(THP_FAULT_ALLOC);
536 	}
537 
538 	return 0;
539 }
540 
541 /*
542  * If THP defrag is set to "always" then directly reclaim/compact as necessary.
543  * If set to "defer" then do only background reclaim/compact and defer to khugepaged.
544  * If set to "madvise" and the VMA is flagged then directly reclaim/compact.
545  * When direct reclaim/compact is allowed, don't retry except for flagged VMAs.
546  */
547 static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
548 {
549 	bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
550 
551 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
552 				&transparent_hugepage_flags) && vma_madvised)
553 		return GFP_TRANSHUGE;
554 	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
555 						&transparent_hugepage_flags))
556 		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
557 	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
558 						&transparent_hugepage_flags))
559 		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
560 
561 	return GFP_TRANSHUGE_LIGHT;
562 }
563 
564 /* Caller must hold page table lock. */
565 static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
566 		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
567 		struct page *zero_page)
568 {
569 	pmd_t entry;
570 	if (!pmd_none(*pmd))
571 		return false;
572 	entry = mk_pmd(zero_page, vma->vm_page_prot);
573 	entry = pmd_mkhuge(entry);
574 	if (pgtable)
575 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
576 	set_pmd_at(mm, haddr, pmd, entry);
577 	atomic_long_inc(&mm->nr_ptes);
578 	return true;
579 }
580 
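/*
 * Huge page fault handler for anonymous VMAs. Read faults may be served by
 * the huge zero page when it is enabled; otherwise a THP is allocated and
 * mapped, or VM_FAULT_FALLBACK is returned so the fault is retried with
 * small pages.
 */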
581 int do_huge_pmd_anonymous_page(struct fault_env *fe)
582 {
583 	struct vm_area_struct *vma = fe->vma;
584 	gfp_t gfp;
585 	struct page *page;
586 	unsigned long haddr = fe->address & HPAGE_PMD_MASK;
587 
588 	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
589 		return VM_FAULT_FALLBACK;
590 	if (unlikely(anon_vma_prepare(vma)))
591 		return VM_FAULT_OOM;
592 	if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
593 		return VM_FAULT_OOM;
594 	if (!(fe->flags & FAULT_FLAG_WRITE) &&
595 			!mm_forbids_zeropage(vma->vm_mm) &&
596 			transparent_hugepage_use_zero_page()) {
597 		pgtable_t pgtable;
598 		struct page *zero_page;
599 		bool set;
600 		int ret;
601 		pgtable = pte_alloc_one(vma->vm_mm, haddr);
602 		if (unlikely(!pgtable))
603 			return VM_FAULT_OOM;
604 		zero_page = get_huge_zero_page();
605 		if (unlikely(!zero_page)) {
606 			pte_free(vma->vm_mm, pgtable);
607 			count_vm_event(THP_FAULT_FALLBACK);
608 			return VM_FAULT_FALLBACK;
609 		}
610 		fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
611 		ret = 0;
612 		set = false;
613 		if (pmd_none(*fe->pmd)) {
614 			if (userfaultfd_missing(vma)) {
615 				spin_unlock(fe->ptl);
616 				ret = handle_userfault(fe, VM_UFFD_MISSING);
617 				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
618 			} else {
619 				set_huge_zero_page(pgtable, vma->vm_mm, vma,
620 						   haddr, fe->pmd, zero_page);
621 				spin_unlock(fe->ptl);
622 				set = true;
623 			}
624 		} else
625 			spin_unlock(fe->ptl);
626 		if (!set) {
627 			pte_free(vma->vm_mm, pgtable);
628 			put_huge_zero_page();
629 		}
630 		return ret;
631 	}
632 	gfp = alloc_hugepage_direct_gfpmask(vma);
633 	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
634 	if (unlikely(!page)) {
635 		count_vm_event(THP_FAULT_FALLBACK);
636 		return VM_FAULT_FALLBACK;
637 	}
638 	prep_transhuge_page(page);
639 	return __do_huge_pmd_anonymous_page(fe, page, gfp);
640 }
641 
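/*
 * Install a huge pmd mapping the given pfn with the given protection.
 * Called by vmf_insert_pfn_pmd() once the VMA has been validated.
 */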
642 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
643 		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write)
644 {
645 	struct mm_struct *mm = vma->vm_mm;
646 	pmd_t entry;
647 	spinlock_t *ptl;
648 
649 	ptl = pmd_lock(mm, pmd);
650 	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
651 	if (pfn_t_devmap(pfn))
652 		entry = pmd_mkdevmap(entry);
653 	if (write) {
654 		entry = pmd_mkyoung(pmd_mkdirty(entry));
655 		entry = maybe_pmd_mkwrite(entry, vma);
656 	}
657 	set_pmd_at(mm, addr, pmd, entry);
658 	update_mmu_cache_pmd(vma, addr, pmd);
659 	spin_unlock(ptl);
660 }
661 
662 int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
663 			pmd_t *pmd, pfn_t pfn, bool write)
664 {
665 	pgprot_t pgprot = vma->vm_page_prot;
666 	/*
667 	 * If we had pmd_special, we could avoid all these restrictions,
668 	 * but we need to be consistent with PTEs and architectures that
669 	 * can't support a 'special' bit.
670 	 */
671 	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
672 	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
673 						(VM_PFNMAP|VM_MIXEDMAP));
674 	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
675 	BUG_ON(!pfn_t_devmap(pfn));
676 
677 	if (addr < vma->vm_start || addr >= vma->vm_end)
678 		return VM_FAULT_SIGBUS;
679 	if (track_pfn_insert(vma, &pgprot, pfn))
680 		return VM_FAULT_SIGBUS;
681 	insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
682 	return VM_FAULT_NOPAGE;
683 }
684 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
685 
686 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
687 		pmd_t *pmd)
688 {
689 	pmd_t _pmd;
690 
691 	/*
692 	 * We should set the dirty bit only for FOLL_WRITE, but for now
693 	 * the dirty bit in the pmd is meaningless.  And if the dirty
694 	 * bit ever becomes meaningful and we only set it with
695 	 * FOLL_WRITE, an atomic set_bit will be required on the pmd to
696 	 * set the young bit, instead of the current set_pmd_at.
697 	 */
698 	_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
699 	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
700 				pmd, _pmd,  1))
701 		update_mmu_cache_pmd(vma, addr, pmd);
702 }
703 
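/*
 * follow_page() helper for device-mapped (pmd_devmap) huge pmds. The pmd
 * lock must be held, and FOLL_GET is required because the caller has to
 * manage the device page reference count.
 */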
704 struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
705 		pmd_t *pmd, int flags)
706 {
707 	unsigned long pfn = pmd_pfn(*pmd);
708 	struct mm_struct *mm = vma->vm_mm;
709 	struct dev_pagemap *pgmap;
710 	struct page *page;
711 
712 	assert_spin_locked(pmd_lockptr(mm, pmd));
713 
714 	if (flags & FOLL_WRITE && !pmd_write(*pmd))
715 		return NULL;
716 
717 	if (pmd_present(*pmd) && pmd_devmap(*pmd))
718 		/* pass */;
719 	else
720 		return NULL;
721 
722 	if (flags & FOLL_TOUCH)
723 		touch_pmd(vma, addr, pmd);
724 
725 	/*
726 	 * device mapped pages can only be returned if the
727 	 * caller will manage the page reference count.
728 	 */
729 	if (!(flags & FOLL_GET))
730 		return ERR_PTR(-EEXIST);
731 
732 	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
733 	pgmap = get_dev_pagemap(pfn, NULL);
734 	if (!pgmap)
735 		return ERR_PTR(-EFAULT);
736 	page = pfn_to_page(pfn);
737 	get_page(page);
738 	put_dev_pagemap(pgmap);
739 
740 	return page;
741 }
742 
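/*
 * Copy a huge pmd from parent to child at fork time. Only anonymous
 * mappings are copied; file-backed THPs are skipped and re-filled on
 * fault. Returns -EAGAIN if the source pmd is no longer a huge pmd.
 */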
743 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
744 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
745 		  struct vm_area_struct *vma)
746 {
747 	spinlock_t *dst_ptl, *src_ptl;
748 	struct page *src_page;
749 	pmd_t pmd;
750 	pgtable_t pgtable = NULL;
751 	int ret = -ENOMEM;
752 
753 	/* Skip if it can be refilled on fault */
754 	if (!vma_is_anonymous(vma))
755 		return 0;
756 
757 	pgtable = pte_alloc_one(dst_mm, addr);
758 	if (unlikely(!pgtable))
759 		goto out;
760 
761 	dst_ptl = pmd_lock(dst_mm, dst_pmd);
762 	src_ptl = pmd_lockptr(src_mm, src_pmd);
763 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
764 
765 	ret = -EAGAIN;
766 	pmd = *src_pmd;
767 	if (unlikely(!pmd_trans_huge(pmd))) {
768 		pte_free(dst_mm, pgtable);
769 		goto out_unlock;
770 	}
771 	/*
772 	 * When the page table lock is held, the huge zero pmd should not be
773 	 * under splitting, since we don't split the page itself, only the pmd
774 	 * into a page table.
775 	 */
776 	if (is_huge_zero_pmd(pmd)) {
777 		struct page *zero_page;
778 		/*
779 		 * get_huge_zero_page() will never allocate a new page here,
780 		 * since we already have a zero page to copy. It just takes a
781 		 * reference.
782 		 */
783 		zero_page = get_huge_zero_page();
784 		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
785 				zero_page);
786 		ret = 0;
787 		goto out_unlock;
788 	}
789 
790 	src_page = pmd_page(pmd);
791 	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
792 	get_page(src_page);
793 	page_dup_rmap(src_page, true);
794 	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
795 	atomic_long_inc(&dst_mm->nr_ptes);
796 	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
797 
798 	pmdp_set_wrprotect(src_mm, addr, src_pmd);
799 	pmd = pmd_mkold(pmd_wrprotect(pmd));
800 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
801 
802 	ret = 0;
803 out_unlock:
804 	spin_unlock(src_ptl);
805 	spin_unlock(dst_ptl);
806 out:
807 	return ret;
808 }
809 
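/*
 * Mark an already-present huge pmd as recently accessed after a fault,
 * provided the pmd has not changed under us.
 */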
810 void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd)
811 {
812 	pmd_t entry;
813 	unsigned long haddr;
814 
815 	fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd);
816 	if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
817 		goto unlock;
818 
819 	entry = pmd_mkyoung(orig_pmd);
820 	haddr = fe->address & HPAGE_PMD_MASK;
821 	if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry,
822 				fe->flags & FAULT_FLAG_WRITE))
823 		update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd);
824 
825 unlock:
826 	spin_unlock(fe->ptl);
827 }
828 
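/*
 * COW fallback used when a replacement huge page cannot be allocated:
 * copy the THP into HPAGE_PMD_NR small pages and remap the range with
 * ordinary ptes.
 */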
829 static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
830 		struct page *page)
831 {
832 	struct vm_area_struct *vma = fe->vma;
833 	unsigned long haddr = fe->address & HPAGE_PMD_MASK;
834 	struct mem_cgroup *memcg;
835 	pgtable_t pgtable;
836 	pmd_t _pmd;
837 	int ret = 0, i;
838 	struct page **pages;
839 	unsigned long mmun_start;	/* For mmu_notifiers */
840 	unsigned long mmun_end;		/* For mmu_notifiers */
841 
842 	pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
843 			GFP_KERNEL);
844 	if (unlikely(!pages)) {
845 		ret |= VM_FAULT_OOM;
846 		goto out;
847 	}
848 
849 	for (i = 0; i < HPAGE_PMD_NR; i++) {
850 		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
851 					       __GFP_OTHER_NODE, vma,
852 					       fe->address, page_to_nid(page));
853 		if (unlikely(!pages[i] ||
854 			     mem_cgroup_try_charge(pages[i], vma->vm_mm,
855 				     GFP_KERNEL, &memcg, false))) {
856 			if (pages[i])
857 				put_page(pages[i]);
858 			while (--i >= 0) {
859 				memcg = (void *)page_private(pages[i]);
860 				set_page_private(pages[i], 0);
861 				mem_cgroup_cancel_charge(pages[i], memcg,
862 						false);
863 				put_page(pages[i]);
864 			}
865 			kfree(pages);
866 			ret |= VM_FAULT_OOM;
867 			goto out;
868 		}
869 		set_page_private(pages[i], (unsigned long)memcg);
870 	}
871 
872 	for (i = 0; i < HPAGE_PMD_NR; i++) {
873 		copy_user_highpage(pages[i], page + i,
874 				   haddr + PAGE_SIZE * i, vma);
875 		__SetPageUptodate(pages[i]);
876 		cond_resched();
877 	}
878 
879 	mmun_start = haddr;
880 	mmun_end   = haddr + HPAGE_PMD_SIZE;
881 	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
882 
883 	fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
884 	if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
885 		goto out_free_pages;
886 	VM_BUG_ON_PAGE(!PageHead(page), page);
887 
888 	pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
889 	/* leave pmd empty until pte is filled */
890 
891 	pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd);
892 	pmd_populate(vma->vm_mm, &_pmd, pgtable);
893 
894 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
895 		pte_t entry;
896 		entry = mk_pte(pages[i], vma->vm_page_prot);
897 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
898 		memcg = (void *)page_private(pages[i]);
899 		set_page_private(pages[i], 0);
900 		page_add_new_anon_rmap(pages[i], fe->vma, haddr, false);
901 		mem_cgroup_commit_charge(pages[i], memcg, false, false);
902 		lru_cache_add_active_or_unevictable(pages[i], vma);
903 		fe->pte = pte_offset_map(&_pmd, haddr);
904 		VM_BUG_ON(!pte_none(*fe->pte));
905 		set_pte_at(vma->vm_mm, haddr, fe->pte, entry);
906 		pte_unmap(fe->pte);
907 	}
908 	kfree(pages);
909 
910 	smp_wmb(); /* make pte visible before pmd */
911 	pmd_populate(vma->vm_mm, fe->pmd, pgtable);
912 	page_remove_rmap(page, true);
913 	spin_unlock(fe->ptl);
914 
915 	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
916 
917 	ret |= VM_FAULT_WRITE;
918 	put_page(page);
919 
920 out:
921 	return ret;
922 
923 out_free_pages:
924 	spin_unlock(fe->ptl);
925 	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
926 	for (i = 0; i < HPAGE_PMD_NR; i++) {
927 		memcg = (void *)page_private(pages[i]);
928 		set_page_private(pages[i], 0);
929 		mem_cgroup_cancel_charge(pages[i], memcg, false);
930 		put_page(pages[i]);
931 	}
932 	kfree(pages);
933 	goto out;
934 }
935 
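/*
 * Handle a write-protect fault on a huge pmd. The page is reused if we are
 * the only mapper; otherwise COW into a new huge page, falling back to
 * splitting the pmd or copying into small pages when allocation fails.
 */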
936 int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
937 {
938 	struct vm_area_struct *vma = fe->vma;
939 	struct page *page = NULL, *new_page;
940 	struct mem_cgroup *memcg;
941 	unsigned long haddr = fe->address & HPAGE_PMD_MASK;
942 	unsigned long mmun_start;	/* For mmu_notifiers */
943 	unsigned long mmun_end;		/* For mmu_notifiers */
944 	gfp_t huge_gfp;			/* for allocation and charge */
945 	int ret = 0;
946 
947 	fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd);
948 	VM_BUG_ON_VMA(!vma->anon_vma, vma);
949 	if (is_huge_zero_pmd(orig_pmd))
950 		goto alloc;
951 	spin_lock(fe->ptl);
952 	if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
953 		goto out_unlock;
954 
955 	page = pmd_page(orig_pmd);
956 	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
957 	/*
958 	 * We can only reuse the page if nobody else maps the huge page or its
959 	 * part.
960 	 */
961 	if (page_trans_huge_mapcount(page, NULL) == 1) {
962 		pmd_t entry;
963 		entry = pmd_mkyoung(orig_pmd);
964 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
965 		if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry,  1))
966 			update_mmu_cache_pmd(vma, fe->address, fe->pmd);
967 		ret |= VM_FAULT_WRITE;
968 		goto out_unlock;
969 	}
970 	get_page(page);
971 	spin_unlock(fe->ptl);
972 alloc:
973 	if (transparent_hugepage_enabled(vma) &&
974 	    !transparent_hugepage_debug_cow()) {
975 		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
976 		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
977 	} else
978 		new_page = NULL;
979 
980 	if (likely(new_page)) {
981 		prep_transhuge_page(new_page);
982 	} else {
983 		if (!page) {
984 			split_huge_pmd(vma, fe->pmd, fe->address);
985 			ret |= VM_FAULT_FALLBACK;
986 		} else {
987 			ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page);
988 			if (ret & VM_FAULT_OOM) {
989 				split_huge_pmd(vma, fe->pmd, fe->address);
990 				ret |= VM_FAULT_FALLBACK;
991 			}
992 			put_page(page);
993 		}
994 		count_vm_event(THP_FAULT_FALLBACK);
995 		goto out;
996 	}
997 
998 	if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
999 					huge_gfp, &memcg, true))) {
1000 		put_page(new_page);
1001 		split_huge_pmd(vma, fe->pmd, fe->address);
1002 		if (page)
1003 			put_page(page);
1004 		ret |= VM_FAULT_FALLBACK;
1005 		count_vm_event(THP_FAULT_FALLBACK);
1006 		goto out;
1007 	}
1008 
1009 	count_vm_event(THP_FAULT_ALLOC);
1010 
1011 	if (!page)
1012 		clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
1013 	else
1014 		copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
1015 	__SetPageUptodate(new_page);
1016 
1017 	mmun_start = haddr;
1018 	mmun_end   = haddr + HPAGE_PMD_SIZE;
1019 	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
1020 
1021 	spin_lock(fe->ptl);
1022 	if (page)
1023 		put_page(page);
1024 	if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) {
1025 		spin_unlock(fe->ptl);
1026 		mem_cgroup_cancel_charge(new_page, memcg, true);
1027 		put_page(new_page);
1028 		goto out_mn;
1029 	} else {
1030 		pmd_t entry;
1031 		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
1032 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1033 		pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
1034 		page_add_new_anon_rmap(new_page, vma, haddr, true);
1035 		mem_cgroup_commit_charge(new_page, memcg, false, true);
1036 		lru_cache_add_active_or_unevictable(new_page, vma);
1037 		set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
1038 		update_mmu_cache_pmd(vma, fe->address, fe->pmd);
1039 		if (!page) {
1040 			add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1041 			put_huge_zero_page();
1042 		} else {
1043 			VM_BUG_ON_PAGE(!PageHead(page), page);
1044 			page_remove_rmap(page, true);
1045 			put_page(page);
1046 		}
1047 		ret |= VM_FAULT_WRITE;
1048 	}
1049 	spin_unlock(fe->ptl);
1050 out_mn:
1051 	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
1052 out:
1053 	return ret;
1054 out_unlock:
1055 	spin_unlock(fe->ptl);
1056 	return ret;
1057 }
1058 
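/*
 * follow_page() helper for a transparent huge pmd: return the page mapped
 * at @addr, honouring FOLL_WRITE, FOLL_TOUCH, FOLL_MLOCK and FOLL_GET.
 * The caller must hold the pmd lock.
 */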
1059 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1060 				   unsigned long addr,
1061 				   pmd_t *pmd,
1062 				   unsigned int flags)
1063 {
1064 	struct mm_struct *mm = vma->vm_mm;
1065 	struct page *page = NULL;
1066 
1067 	assert_spin_locked(pmd_lockptr(mm, pmd));
1068 
1069 	if (flags & FOLL_WRITE && !pmd_write(*pmd))
1070 		goto out;
1071 
1072 	/* Avoid dumping huge zero page */
1073 	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
1074 		return ERR_PTR(-EFAULT);
1075 
1076 	/* Full NUMA hinting faults to serialise migration in fault paths */
1077 	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
1078 		goto out;
1079 
1080 	page = pmd_page(*pmd);
1081 	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
1082 	if (flags & FOLL_TOUCH)
1083 		touch_pmd(vma, addr, pmd);
1084 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1085 		/*
1086 		 * We don't mlock() pte-mapped THPs. This way we can avoid
1087 		 * leaking mlocked pages into non-VM_LOCKED VMAs.
1088 		 *
1089 		 * For anon THP:
1090 		 *
1091 		 * In most cases the pmd is the only mapping of the page as we
1092 		 * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
1093 		 * writable private mappings in populate_vma_page_range().
1094 		 *
1095 		 * The only scenario where we have the page shared here is if we're
1096 		 * mlocking a read-only mapping shared over fork(). We skip
1097 		 * mlocking such pages.
1098 		 *
1099 		 * For file THP:
1100 		 *
1101 		 * We can expect PageDoubleMap() to be stable under page lock:
1102 		 * for file pages we set it in page_add_file_rmap(), which
1103 		 * requires page to be locked.
1104 		 */
1105 
1106 		if (PageAnon(page) && compound_mapcount(page) != 1)
1107 			goto skip_mlock;
1108 		if (PageDoubleMap(page) || !page->mapping)
1109 			goto skip_mlock;
1110 		if (!trylock_page(page))
1111 			goto skip_mlock;
1112 		lru_add_drain();
1113 		if (page->mapping && !PageDoubleMap(page))
1114 			mlock_vma_page(page);
1115 		unlock_page(page);
1116 	}
1117 skip_mlock:
1118 	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1119 	VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
1120 	if (flags & FOLL_GET)
1121 		get_page(page);
1122 
1123 out:
1124 	return page;
1125 }
1126 
1127 /* NUMA hinting page fault entry point for trans huge pmds */
1128 int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
1129 {
1130 	struct vm_area_struct *vma = fe->vma;
1131 	struct anon_vma *anon_vma = NULL;
1132 	struct page *page;
1133 	unsigned long haddr = fe->address & HPAGE_PMD_MASK;
1134 	int page_nid = -1, this_nid = numa_node_id();
1135 	int target_nid, last_cpupid = -1;
1136 	bool page_locked;
1137 	bool migrated = false;
1138 	bool was_writable;
1139 	int flags = 0;
1140 
1141 	/* A PROT_NONE fault should not end up here */
1142 	BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
1143 
1144 	fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
1145 	if (unlikely(!pmd_same(pmd, *fe->pmd)))
1146 		goto out_unlock;
1147 
1148 	/*
1149 	 * If there are potential migrations, wait for completion and retry
1150 	 * without disrupting NUMA hinting information. Do not relock and
1151 	 * check_same as the page may no longer be mapped.
1152 	 */
1153 	if (unlikely(pmd_trans_migrating(*fe->pmd))) {
1154 		page = pmd_page(*fe->pmd);
1155 		spin_unlock(fe->ptl);
1156 		wait_on_page_locked(page);
1157 		goto out;
1158 	}
1159 
1160 	page = pmd_page(pmd);
1161 	BUG_ON(is_huge_zero_page(page));
1162 	page_nid = page_to_nid(page);
1163 	last_cpupid = page_cpupid_last(page);
1164 	count_vm_numa_event(NUMA_HINT_FAULTS);
1165 	if (page_nid == this_nid) {
1166 		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
1167 		flags |= TNF_FAULT_LOCAL;
1168 	}
1169 
1170 	/* See similar comment in do_numa_page for explanation */
1171 	if (!(vma->vm_flags & VM_WRITE))
1172 		flags |= TNF_NO_GROUP;
1173 
1174 	/*
1175 	 * Acquire the page lock to serialise THP migrations but avoid dropping
1176 	 * page_table_lock if at all possible
1177 	 */
1178 	page_locked = trylock_page(page);
1179 	target_nid = mpol_misplaced(page, vma, haddr);
1180 	if (target_nid == -1) {
1181 		/* If the page was locked, there are no parallel migrations */
1182 		if (page_locked)
1183 			goto clear_pmdnuma;
1184 	}
1185 
1186 	/* Migration could have started since the pmd_trans_migrating check */
1187 	if (!page_locked) {
1188 		spin_unlock(fe->ptl);
1189 		wait_on_page_locked(page);
1190 		page_nid = -1;
1191 		goto out;
1192 	}
1193 
1194 	/*
1195 	 * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
1196 	 * to serialise splits
1197 	 */
1198 	get_page(page);
1199 	spin_unlock(fe->ptl);
1200 	anon_vma = page_lock_anon_vma_read(page);
1201 
1202 	/* Confirm the PMD did not change while page_table_lock was released */
1203 	spin_lock(fe->ptl);
1204 	if (unlikely(!pmd_same(pmd, *fe->pmd))) {
1205 		unlock_page(page);
1206 		put_page(page);
1207 		page_nid = -1;
1208 		goto out_unlock;
1209 	}
1210 
1211 	/* Bail if we fail to protect against THP splits for any reason */
1212 	if (unlikely(!anon_vma)) {
1213 		put_page(page);
1214 		page_nid = -1;
1215 		goto clear_pmdnuma;
1216 	}
1217 
1218 	/*
1219 	 * Migrate the THP to the requested node, returns with page unlocked
1220 	 * and access rights restored.
1221 	 */
1222 	spin_unlock(fe->ptl);
1223 	migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
1224 				fe->pmd, pmd, fe->address, page, target_nid);
1225 	if (migrated) {
1226 		flags |= TNF_MIGRATED;
1227 		page_nid = target_nid;
1228 	} else
1229 		flags |= TNF_MIGRATE_FAIL;
1230 
1231 	goto out;
1232 clear_pmdnuma:
1233 	BUG_ON(!PageLocked(page));
1234 	was_writable = pmd_write(pmd);
1235 	pmd = pmd_modify(pmd, vma->vm_page_prot);
1236 	pmd = pmd_mkyoung(pmd);
1237 	if (was_writable)
1238 		pmd = pmd_mkwrite(pmd);
1239 	set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd);
1240 	update_mmu_cache_pmd(vma, fe->address, fe->pmd);
1241 	unlock_page(page);
1242 out_unlock:
1243 	spin_unlock(fe->ptl);
1244 
1245 out:
1246 	if (anon_vma)
1247 		page_unlock_anon_vma_read(anon_vma);
1248 
1249 	if (page_nid != -1)
1250 		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags);
1251 
1252 	return 0;
1253 }
1254 
1255 /*
1256  * Return true if we do MADV_FREE successfully on entire pmd page.
1257  * Otherwise, return false.
1258  */
1259 bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1260 		pmd_t *pmd, unsigned long addr, unsigned long next)
1261 {
1262 	spinlock_t *ptl;
1263 	pmd_t orig_pmd;
1264 	struct page *page;
1265 	struct mm_struct *mm = tlb->mm;
1266 	bool ret = false;
1267 
1268 	ptl = pmd_trans_huge_lock(pmd, vma);
1269 	if (!ptl)
1270 		goto out_unlocked;
1271 
1272 	orig_pmd = *pmd;
1273 	if (is_huge_zero_pmd(orig_pmd))
1274 		goto out;
1275 
1276 	page = pmd_page(orig_pmd);
1277 	/*
1278 	 * If other processes are mapping this page, we can't discard
1279 	 * the page unless they all do MADV_FREE, so let's skip the page.
1280 	 */
1281 	if (page_mapcount(page) != 1)
1282 		goto out;
1283 
1284 	if (!trylock_page(page))
1285 		goto out;
1286 
1287 	/*
1288 	 * If the user wants to discard only part of the THP's pages, split it so
1289 	 * MADV_FREE will deactivate only them.
1290 	 */
1291 	if (next - addr != HPAGE_PMD_SIZE) {
1292 		get_page(page);
1293 		spin_unlock(ptl);
1294 		split_huge_page(page);
1295 		put_page(page);
1296 		unlock_page(page);
1297 		goto out_unlocked;
1298 	}
1299 
1300 	if (PageDirty(page))
1301 		ClearPageDirty(page);
1302 	unlock_page(page);
1303 
1304 	if (PageActive(page))
1305 		deactivate_page(page);
1306 
1307 	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
1308 		orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
1309 			tlb->fullmm);
1310 		orig_pmd = pmd_mkold(orig_pmd);
1311 		orig_pmd = pmd_mkclean(orig_pmd);
1312 
1313 		set_pmd_at(mm, addr, pmd, orig_pmd);
1314 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1315 	}
1316 	ret = true;
1317 out:
1318 	spin_unlock(ptl);
1319 out_unlocked:
1320 	return ret;
1321 }
1322 
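/*
 * Unmap a huge pmd while tearing down a range. Returns 1 if a huge pmd was
 * cleared, 0 if the pmd was not (or no longer) huge and must be handled by
 * the regular pte path.
 */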
1323 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1324 		 pmd_t *pmd, unsigned long addr)
1325 {
1326 	pmd_t orig_pmd;
1327 	spinlock_t *ptl;
1328 
1329 	ptl = __pmd_trans_huge_lock(pmd, vma);
1330 	if (!ptl)
1331 		return 0;
1332 	/*
1333 	 * For architectures like ppc64 we look at the deposited pgtable
1334 	 * when calling pmdp_huge_get_and_clear, so do the
1335 	 * pgtable_trans_huge_withdraw after finishing the pmdp-related
1336 	 * operations.
1337 	 */
1338 	orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
1339 			tlb->fullmm);
1340 	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1341 	if (vma_is_dax(vma)) {
1342 		spin_unlock(ptl);
1343 		if (is_huge_zero_pmd(orig_pmd))
1344 			tlb_remove_page(tlb, pmd_page(orig_pmd));
1345 	} else if (is_huge_zero_pmd(orig_pmd)) {
1346 		pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
1347 		atomic_long_dec(&tlb->mm->nr_ptes);
1348 		spin_unlock(ptl);
1349 		tlb_remove_page(tlb, pmd_page(orig_pmd));
1350 	} else {
1351 		struct page *page = pmd_page(orig_pmd);
1352 		page_remove_rmap(page, true);
1353 		VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
1354 		VM_BUG_ON_PAGE(!PageHead(page), page);
1355 		if (PageAnon(page)) {
1356 			pgtable_t pgtable;
1357 			pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
1358 			pte_free(tlb->mm, pgtable);
1359 			atomic_long_dec(&tlb->mm->nr_ptes);
1360 			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1361 		} else {
1362 			add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
1363 		}
1364 		spin_unlock(ptl);
1365 		tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
1366 	}
1367 	return 1;
1368 }
1369 
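/*
 * mremap() helper: move a huge pmd from old_addr to new_addr. Returns true
 * if the pmd was moved, false otherwise (e.g. misaligned addresses or a pmd
 * that is no longer huge).
 */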
1370 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
1371 		  unsigned long new_addr, unsigned long old_end,
1372 		  pmd_t *old_pmd, pmd_t *new_pmd)
1373 {
1374 	spinlock_t *old_ptl, *new_ptl;
1375 	pmd_t pmd;
1376 	struct mm_struct *mm = vma->vm_mm;
1377 
1378 	if ((old_addr & ~HPAGE_PMD_MASK) ||
1379 	    (new_addr & ~HPAGE_PMD_MASK) ||
1380 	    old_end - old_addr < HPAGE_PMD_SIZE)
1381 		return false;
1382 
1383 	/*
1384 	 * The destination pmd shouldn't be established; free_pgtables()
1385 	 * should have released it.
1386 	 */
1387 	if (WARN_ON(!pmd_none(*new_pmd))) {
1388 		VM_BUG_ON(pmd_trans_huge(*new_pmd));
1389 		return false;
1390 	}
1391 
1392 	/*
1393 	 * We don't have to worry about the ordering of src and dst
1394 	 * ptlocks because exclusive mmap_sem prevents deadlock.
1395 	 */
1396 	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
1397 	if (old_ptl) {
1398 		new_ptl = pmd_lockptr(mm, new_pmd);
1399 		if (new_ptl != old_ptl)
1400 			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
1401 		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
1402 		VM_BUG_ON(!pmd_none(*new_pmd));
1403 
1404 		if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
1405 				vma_is_anonymous(vma)) {
1406 			pgtable_t pgtable;
1407 			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
1408 			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
1409 		}
1410 		set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
1411 		if (new_ptl != old_ptl)
1412 			spin_unlock(new_ptl);
1413 		spin_unlock(old_ptl);
1414 		return true;
1415 	}
1416 	return false;
1417 }
1418 
1419 /*
1420  * Returns
1421  *  - 0 if PMD could not be locked
1422  *  - 1 if PMD was locked but protections are unchanged and a TLB flush is unnecessary
1423  *  - HPAGE_PMD_NR if protections changed and a TLB flush is necessary
1424  */
1425 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1426 		unsigned long addr, pgprot_t newprot, int prot_numa)
1427 {
1428 	struct mm_struct *mm = vma->vm_mm;
1429 	spinlock_t *ptl;
1430 	int ret = 0;
1431 
1432 	ptl = __pmd_trans_huge_lock(pmd, vma);
1433 	if (ptl) {
1434 		pmd_t entry;
1435 		bool preserve_write = prot_numa && pmd_write(*pmd);
1436 		ret = 1;
1437 
1438 		/*
1439 		 * Avoid trapping faults against the zero page. The read-only
1440 		 * data is likely to be read-cached on the local CPU and
1441 		 * local/remote hits to the zero page are not interesting.
1442 		 */
1443 		if (prot_numa && is_huge_zero_pmd(*pmd)) {
1444 			spin_unlock(ptl);
1445 			return ret;
1446 		}
1447 
1448 		if (!prot_numa || !pmd_protnone(*pmd)) {
1449 			entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
1450 			entry = pmd_modify(entry, newprot);
1451 			if (preserve_write)
1452 				entry = pmd_mkwrite(entry);
1453 			ret = HPAGE_PMD_NR;
1454 			set_pmd_at(mm, addr, pmd, entry);
1455 			BUG_ON(vma_is_anonymous(vma) && !preserve_write &&
1456 					pmd_write(entry));
1457 		}
1458 		spin_unlock(ptl);
1459 	}
1460 
1461 	return ret;
1462 }
1463 
1464 /*
1465  * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
1466  *
1467  * Note that if it returns the page table lock pointer, this routine returns
1468  * without unlocking the page table lock, so callers must unlock it.
1469  */
1470 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
1471 {
1472 	spinlock_t *ptl;
1473 	ptl = pmd_lock(vma->vm_mm, pmd);
1474 	if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
1475 		return ptl;
1476 	spin_unlock(ptl);
1477 	return NULL;
1478 }
1479 
1480 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
1481 		unsigned long haddr, pmd_t *pmd)
1482 {
1483 	struct mm_struct *mm = vma->vm_mm;
1484 	pgtable_t pgtable;
1485 	pmd_t _pmd;
1486 	int i;
1487 
1488 	/* leave pmd empty until pte is filled */
1489 	pmdp_huge_clear_flush_notify(vma, haddr, pmd);
1490 
1491 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1492 	pmd_populate(mm, &_pmd, pgtable);
1493 
1494 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1495 		pte_t *pte, entry;
1496 		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
1497 		entry = pte_mkspecial(entry);
1498 		pte = pte_offset_map(&_pmd, haddr);
1499 		VM_BUG_ON(!pte_none(*pte));
1500 		set_pte_at(mm, haddr, pte, entry);
1501 		pte_unmap(pte);
1502 	}
1503 	smp_wmb(); /* make pte visible before pmd */
1504 	pmd_populate(mm, pmd, pgtable);
1505 	put_huge_zero_page();
1506 }
1507 
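/*
 * Split a huge pmd into a page table of ordinary ptes, or into migration
 * entries when freeze is true. Called with the pmd lock held and the
 * mmu_notifier range invalidation already started.
 */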
1508 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
1509 		unsigned long haddr, bool freeze)
1510 {
1511 	struct mm_struct *mm = vma->vm_mm;
1512 	struct page *page;
1513 	pgtable_t pgtable;
1514 	pmd_t _pmd;
1515 	bool young, write, dirty, soft_dirty;
1516 	unsigned long addr;
1517 	int i;
1518 
1519 	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
1520 	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
1521 	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
1522 	VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd));
1523 
1524 	count_vm_event(THP_SPLIT_PMD);
1525 
1526 	if (!vma_is_anonymous(vma)) {
1527 		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
1528 		if (is_huge_zero_pmd(_pmd))
1529 			put_huge_zero_page();
1530 		if (vma_is_dax(vma))
1531 			return;
1532 		page = pmd_page(_pmd);
1533 		if (!PageReferenced(page) && pmd_young(_pmd))
1534 			SetPageReferenced(page);
1535 		page_remove_rmap(page, true);
1536 		put_page(page);
1537 		add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR);
1538 		return;
1539 	} else if (is_huge_zero_pmd(*pmd)) {
1540 		return __split_huge_zero_page_pmd(vma, haddr, pmd);
1541 	}
1542 
1543 	page = pmd_page(*pmd);
1544 	VM_BUG_ON_PAGE(!page_count(page), page);
1545 	page_ref_add(page, HPAGE_PMD_NR - 1);
1546 	write = pmd_write(*pmd);
1547 	young = pmd_young(*pmd);
1548 	dirty = pmd_dirty(*pmd);
1549 	soft_dirty = pmd_soft_dirty(*pmd);
1550 
1551 	pmdp_huge_split_prepare(vma, haddr, pmd);
1552 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1553 	pmd_populate(mm, &_pmd, pgtable);
1554 
1555 	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
1556 		pte_t entry, *pte;
1557 		/*
1558 		 * Note that NUMA hinting access restrictions are not
1559 		 * transferred to avoid any possibility of altering
1560 		 * permissions across VMAs.
1561 		 */
1562 		if (freeze) {
1563 			swp_entry_t swp_entry;
1564 			swp_entry = make_migration_entry(page + i, write);
1565 			entry = swp_entry_to_pte(swp_entry);
1566 			if (soft_dirty)
1567 				entry = pte_swp_mksoft_dirty(entry);
1568 		} else {
1569 			entry = mk_pte(page + i, vma->vm_page_prot);
1570 			entry = maybe_mkwrite(entry, vma);
1571 			if (!write)
1572 				entry = pte_wrprotect(entry);
1573 			if (!young)
1574 				entry = pte_mkold(entry);
1575 			if (soft_dirty)
1576 				entry = pte_mksoft_dirty(entry);
1577 		}
1578 		if (dirty)
1579 			SetPageDirty(page + i);
1580 		pte = pte_offset_map(&_pmd, addr);
1581 		BUG_ON(!pte_none(*pte));
1582 		set_pte_at(mm, addr, pte, entry);
1583 		atomic_inc(&page[i]._mapcount);
1584 		pte_unmap(pte);
1585 	}
1586 
1587 	/*
1588 	 * Set PG_double_map before dropping compound_mapcount to avoid
1589 	 * false-negative page_mapped().
1590 	 */
1591 	if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
1592 		for (i = 0; i < HPAGE_PMD_NR; i++)
1593 			atomic_inc(&page[i]._mapcount);
1594 	}
1595 
1596 	if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
1597 		/* Last compound_mapcount is gone. */
1598 		__dec_node_page_state(page, NR_ANON_THPS);
1599 		if (TestClearPageDoubleMap(page)) {
1600 			/* No need in mapcount reference anymore */
1601 			for (i = 0; i < HPAGE_PMD_NR; i++)
1602 				atomic_dec(&page[i]._mapcount);
1603 		}
1604 	}
1605 
1606 	smp_wmb(); /* make pte visible before pmd */
1607 	/*
1608 	 * Up to this point the pmd is present and huge and userland has the
1609 	 * whole access to the hugepage during the split (which happens in
1610 	 * place). If we overwrite the pmd with the not-huge version pointing
1611 	 * to the pte here (which of course we could if all CPUs were bug
1612 	 * free), userland could trigger a small page size TLB miss on the
1613 	 * small sized TLB while the hugepage TLB entry is still established in
1614 	 * the huge TLB. Some CPUs don't like that.
1615 	 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
1616 	 * 383 on page 93. Intel should be safe, but it also warns that it's
1617 	 * only safe if the permission and cache attributes of the two entries
1618 	 * loaded in the two TLBs are identical (which should be the case here).
1619 	 * But it is generally safer to never allow small and huge TLB entries
1620 	 * for the same virtual address to be loaded simultaneously. So instead
1621 	 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
1622 	 * current pmd notpresent (atomically because here the pmd_trans_huge
1623 	 * and pmd_trans_splitting must remain set at all times on the pmd
1624 	 * until the split is complete for this pmd), then we flush the SMP TLB
1625 	 * and finally we write the non-huge version of the pmd entry with
1626 	 * pmd_populate.
1627 	 */
1628 	pmdp_invalidate(vma, haddr, pmd);
1629 	pmd_populate(mm, pmd, pgtable);
1630 
1631 	if (freeze) {
1632 		for (i = 0; i < HPAGE_PMD_NR; i++) {
1633 			page_remove_rmap(page + i, false);
1634 			put_page(page + i);
1635 		}
1636 	}
1637 }
1638 
1639 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1640 		unsigned long address, bool freeze, struct page *page)
1641 {
1642 	spinlock_t *ptl;
1643 	struct mm_struct *mm = vma->vm_mm;
1644 	unsigned long haddr = address & HPAGE_PMD_MASK;
1645 
1646 	mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
1647 	ptl = pmd_lock(mm, pmd);
1648 
1649 	/*
1650 	 * If the caller asks us to set up migration entries, we need a page to
1651 	 * check the pmd against. Otherwise we can end up replacing the wrong page.
1652 	 */
1653 	VM_BUG_ON(freeze && !page);
1654 	if (page && page != pmd_page(*pmd))
1655 		goto out;
1656 
1657 	if (pmd_trans_huge(*pmd)) {
1658 		page = pmd_page(*pmd);
1659 		if (PageMlocked(page))
1660 			clear_page_mlock(page);
1661 	} else if (!pmd_devmap(*pmd))
1662 		goto out;
1663 	__split_huge_pmd_locked(vma, pmd, haddr, freeze);
1664 out:
1665 	spin_unlock(ptl);
1666 	mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
1667 }
1668 
1669 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
1670 		bool freeze, struct page *page)
1671 {
1672 	pgd_t *pgd;
1673 	pud_t *pud;
1674 	pmd_t *pmd;
1675 
1676 	pgd = pgd_offset(vma->vm_mm, address);
1677 	if (!pgd_present(*pgd))
1678 		return;
1679 
1680 	pud = pud_offset(pgd, address);
1681 	if (!pud_present(*pud))
1682 		return;
1683 
1684 	pmd = pmd_offset(pud, address);
1685 
1686 	__split_huge_pmd(vma, pmd, address, freeze, page);
1687 }
1688 
1689 void vma_adjust_trans_huge(struct vm_area_struct *vma,
1690 			     unsigned long start,
1691 			     unsigned long end,
1692 			     long adjust_next)
1693 {
1694 	/*
1695 	 * If the new start address isn't hpage aligned and it could
1696 	 * previously contain a hugepage: check if we need to split
1697 	 * a huge pmd.
1698 	 */
1699 	if (start & ~HPAGE_PMD_MASK &&
1700 	    (start & HPAGE_PMD_MASK) >= vma->vm_start &&
1701 	    (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
1702 		split_huge_pmd_address(vma, start, false, NULL);
1703 
1704 	/*
1705 	 * If the new end address isn't hpage aligned and it could
1706 	 * previously contain a hugepage: check if we need to split
1707 	 * a huge pmd.
1708 	 */
1709 	if (end & ~HPAGE_PMD_MASK &&
1710 	    (end & HPAGE_PMD_MASK) >= vma->vm_start &&
1711 	    (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
1712 		split_huge_pmd_address(vma, end, false, NULL);
1713 
1714 	/*
1715 	 * If we're also updating vma->vm_next->vm_start, and the new
1716 	 * vm_next->vm_start isn't page aligned and it could previously
1717 	 * contain a hugepage: check if we need to split a huge pmd.
1718 	 */
1719 	if (adjust_next > 0) {
1720 		struct vm_area_struct *next = vma->vm_next;
1721 		unsigned long nstart = next->vm_start;
1722 		nstart += adjust_next << PAGE_SHIFT;
1723 		if (nstart & ~HPAGE_PMD_MASK &&
1724 		    (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
1725 		    (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
1726 			split_huge_pmd_address(next, nstart, false, NULL);
1727 	}
1728 }
1729 
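/*
 * Unmap all mappings of a THP before splitting it. Anonymous mappings are
 * replaced with migration entries so that unfreeze_page() can restore them
 * afterwards.
 */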
1730 static void freeze_page(struct page *page)
1731 {
1732 	enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
1733 		TTU_RMAP_LOCKED;
1734 	int i, ret;
1735 
1736 	VM_BUG_ON_PAGE(!PageHead(page), page);
1737 
1738 	if (PageAnon(page))
1739 		ttu_flags |= TTU_MIGRATION;
1740 
1741 	/* We only need TTU_SPLIT_HUGE_PMD once */
1742 	ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD);
1743 	for (i = 1; !ret && i < HPAGE_PMD_NR; i++) {
1744 		/* Cut short if the page is unmapped */
1745 		if (page_count(page) == 1)
1746 			return;
1747 
1748 		ret = try_to_unmap(page + i, ttu_flags);
1749 	}
1750 	VM_BUG_ON_PAGE(ret, page + i - 1);
1751 }
1752 
1753 static void unfreeze_page(struct page *page)
1754 {
1755 	int i;
1756 
1757 	for (i = 0; i < HPAGE_PMD_NR; i++)
1758 		remove_migration_ptes(page + i, page + i, true);
1759 }
1760 
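/*
 * Set up one tail page of a THP being split: give it its own reference
 * count, copy the relevant flags from the head and add it to the LRU next
 * to the head page.
 */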
1761 static void __split_huge_page_tail(struct page *head, int tail,
1762 		struct lruvec *lruvec, struct list_head *list)
1763 {
1764 	struct page *page_tail = head + tail;
1765 
1766 	VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
1767 	VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
1768 
1769 	/*
1770 	 * tail_page->_refcount is zero and not changing from under us. But
1771 	 * get_page_unless_zero() may be running from under us on the
1772 	 * tail_page. If we used atomic_set() below instead of atomic_inc() or
1773 	 * atomic_add(), we would then run atomic_set() concurrently with
1774 	 * get_page_unless_zero(), and atomic_set() is implemented in C not
1775 	 * using locked ops. spin_unlock on x86 sometimes uses locked ops
1776 	 * because of PPro errata 66, 92, so unless somebody can guarantee
1777 	 * atomic_set() here would be safe on all archs (and not only on x86),
1778 	 * it's safer to use atomic_inc()/atomic_add().
1779 	 */
1780 	if (PageAnon(head)) {
1781 		page_ref_inc(page_tail);
1782 	} else {
1783 		/* Additional pin to radix tree */
1784 		page_ref_add(page_tail, 2);
1785 	}
1786 
1787 	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1788 	page_tail->flags |= (head->flags &
1789 			((1L << PG_referenced) |
1790 			 (1L << PG_swapbacked) |
1791 			 (1L << PG_mlocked) |
1792 			 (1L << PG_uptodate) |
1793 			 (1L << PG_active) |
1794 			 (1L << PG_locked) |
1795 			 (1L << PG_unevictable) |
1796 			 (1L << PG_dirty)));
1797 
1798 	/*
1799 	 * After clearing PageTail the gup refcount can be released.
1800 	 * Page flags also must be visible before we make the page non-compound.
1801 	 */
1802 	smp_wmb();
1803 
1804 	clear_compound_head(page_tail);
1805 
1806 	if (page_is_young(head))
1807 		set_page_young(page_tail);
1808 	if (page_is_idle(head))
1809 		set_page_idle(page_tail);
1810 
1811 	/* ->mapping in first tail page is compound_mapcount */
1812 	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
1813 			page_tail);
1814 	page_tail->mapping = head->mapping;
1815 
1816 	page_tail->index = head->index + tail;
1817 	page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
1818 	lru_add_page_tail(head, page_tail, lruvec, list);
1819 }
1820 
1821 static void __split_huge_page(struct page *page, struct list_head *list,
1822 		unsigned long flags)
1823 {
1824 	struct page *head = compound_head(page);
1825 	struct zone *zone = page_zone(head);
1826 	struct lruvec *lruvec;
1827 	pgoff_t end = -1;
1828 	int i;
1829 
1830 	lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
1831 
1832 	/* complete memcg work before adding pages to LRU */
1833 	mem_cgroup_split_huge_fixup(head);
1834 
1835 	if (!PageAnon(page))
1836 		end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);
1837 
1838 	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
1839 		__split_huge_page_tail(head, i, lruvec, list);
1840 		/* Some pages can be beyond i_size: drop them from page cache */
1841 		if (head[i].index >= end) {
1842 			__ClearPageDirty(head + i);
1843 			__delete_from_page_cache(head + i, NULL);
1844 			if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
1845 				shmem_uncharge(head->mapping->host, 1);
1846 			put_page(head + i);
1847 		}
1848 	}
1849 
1850 	ClearPageCompound(head);
1851 	/* See comment in __split_huge_page_tail() */
1852 	if (PageAnon(head)) {
1853 		page_ref_inc(head);
1854 	} else {
1855 		/* Additional pin to radix tree */
1856 		page_ref_add(head, 2);
1857 		spin_unlock(&head->mapping->tree_lock);
1858 	}
1859 
1860 	spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
1861 
1862 	unfreeze_page(head);
1863 
1864 	for (i = 0; i < HPAGE_PMD_NR; i++) {
1865 		struct page *subpage = head + i;
1866 		if (subpage == page)
1867 			continue;
1868 		unlock_page(subpage);
1869 
1870 		/*
1871 		 * Subpages may be freed if there wasn't any mapping,
1872 		 * e.g. if add_to_swap() is running on a lru page that
1873 		 * had its mapping zapped. And freeing these pages
1874 		 * requires taking the lru_lock, so we do the put_page
1875 		 * of the tail pages after the split is complete.
1876 		 */
1877 		put_page(subpage);
1878 	}
1879 }
1880 
1881 int total_mapcount(struct page *page)
1882 {
1883 	int i, compound, ret;
1884 
1885 	VM_BUG_ON_PAGE(PageTail(page), page);
1886 
1887 	if (likely(!PageCompound(page)))
1888 		return atomic_read(&page->_mapcount) + 1;
1889 
1890 	compound = compound_mapcount(page);
1891 	if (PageHuge(page))
1892 		return compound;
1893 	ret = compound;
1894 	for (i = 0; i < HPAGE_PMD_NR; i++)
1895 		ret += atomic_read(&page[i]._mapcount) + 1;
1896 	/* File pages have compound_mapcount included in _mapcount */
1897 	if (!PageAnon(page))
1898 		return ret - compound * HPAGE_PMD_NR;
1899 	if (PageDoubleMap(page))
1900 		ret -= HPAGE_PMD_NR;
1901 	return ret;
1902 }
1903 
1904 /*
1905  * This calculates accurately how many mappings a transparent hugepage
1906  * has (unlike page_mapcount(), which isn't fully accurate). This full
1907  * accuracy is primarily needed to know whether copy-on-write faults can
1908  * reuse the page and change the mapping to read-write instead of copying
1909  * it. At the same time it returns the total_mapcount too.
1910  *
1911  * The function returns the highest mapcount any one of the subpages
1912  * has. If the return value is one, even if different processes are
1913  * mapping different subpages of the transparent hugepage, they can
1914  * all reuse it, because each process is reusing a different subpage.
1915  *
1916  * The total_mapcount is instead counting all virtual mappings of the
1917  * subpages. If the total_mapcount is equal to "one", it tells the
1918  * caller all mappings belong to the same "mm" and in turn the
1919  * anon_vma of the transparent hugepage can become the vma->anon_vma
1920  * local one as no other process may be mapping any of the subpages.
1921  *
1922  * It would be more accurate to replace page_mapcount() with
1923  * page_trans_huge_mapcount() everywhere, but page_trans_huge_mapcount()
1924  * is slower than page_mapcount(), so we only use it in the copy-on-write
1925  * faults, where full accuracy is needed to avoid breaking page pinning.
1927  */
1928 int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
1929 {
1930 	int i, ret, _total_mapcount, mapcount;
1931 
1932 	/* hugetlbfs shouldn't call it */
1933 	VM_BUG_ON_PAGE(PageHuge(page), page);
1934 
1935 	if (likely(!PageTransCompound(page))) {
1936 		mapcount = atomic_read(&page->_mapcount) + 1;
1937 		if (total_mapcount)
1938 			*total_mapcount = mapcount;
1939 		return mapcount;
1940 	}
1941 
1942 	page = compound_head(page);
1943 
1944 	_total_mapcount = ret = 0;
1945 	for (i = 0; i < HPAGE_PMD_NR; i++) {
1946 		mapcount = atomic_read(&page[i]._mapcount) + 1;
1947 		ret = max(ret, mapcount);
1948 		_total_mapcount += mapcount;
1949 	}
1950 	if (PageDoubleMap(page)) {
1951 		ret -= 1;
1952 		_total_mapcount -= HPAGE_PMD_NR;
1953 	}
1954 	mapcount = compound_mapcount(page);
1955 	ret += mapcount;
1956 	_total_mapcount += mapcount;
1957 	if (total_mapcount)
1958 		*total_mapcount = _total_mapcount;
1959 	return ret;
1960 }
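
/*
 * Illustrative sketch of how a caller can use the result above for the
 * copy-on-write reuse decision described in the comment. The helper name
 * thp_cow_can_reuse() is made up for illustration and is not used anywhere
 * else in the kernel.
 */
static bool __maybe_unused thp_cow_can_reuse(struct page *page)
{
	/*
	 * A highest per-subpage mapcount of one means every process that
	 * maps a subpage is its sole mapper, so each of them may reuse
	 * its subpages on a write fault instead of copying.
	 */
	return page_trans_huge_mapcount(page, NULL) == 1;
}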
1961 
1962 /*
1963  * This function splits a huge page into normal pages. @page can point to any
1964  * subpage of the huge page to split. Split doesn't change the position of @page.
1965  *
1966  * The caller must hold a pin on the @page, otherwise the split fails with -EBUSY.
1967  * The huge page must be locked.
1968  *
1969  * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
1970  *
1971  * Both head page and tail pages will inherit mapping, flags, and so on from
1972  * the hugepage.
1973  *
1974  * The GUP pin and PG_locked are transferred to @page. The rest of the
1975  * subpages can be freed if they are not mapped.
1976  *
1977  * Returns 0 if the hugepage is split successfully.
1978  * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
1979  * us.
1980  */
1981 int split_huge_page_to_list(struct page *page, struct list_head *list)
1982 {
1983 	struct page *head = compound_head(page);
1984 	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
1985 	struct anon_vma *anon_vma = NULL;
1986 	struct address_space *mapping = NULL;
1987 	int count, mapcount, extra_pins, ret;
1988 	bool mlocked;
1989 	unsigned long flags;
1990 
1991 	VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
1992 	VM_BUG_ON_PAGE(!PageLocked(page), page);
1993 	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
1994 	VM_BUG_ON_PAGE(!PageCompound(page), page);
1995 
1996 	if (PageAnon(head)) {
1997 		/*
1998 		 * The caller does not necessarily hold an mmap_sem that would
1999 		 * prevent the anon_vma from disappearing, so we first take a
2000 		 * reference to it and then lock the anon_vma for write. This
2001 		 * is similar to page_lock_anon_vma_read except the write lock
2002 		 * is taken to serialise against parallel split or collapse
2003 		 * operations.
2004 		 */
2005 		anon_vma = page_get_anon_vma(head);
2006 		if (!anon_vma) {
2007 			ret = -EBUSY;
2008 			goto out;
2009 		}
2010 		extra_pins = 0;
2011 		mapping = NULL;
2012 		anon_vma_lock_write(anon_vma);
2013 	} else {
2014 		mapping = head->mapping;
2015 
2016 		/* Truncated? */
2017 		if (!mapping) {
2018 			ret = -EBUSY;
2019 			goto out;
2020 		}
2021 
2022 		/* Additional pins from radix tree */
2023 		extra_pins = HPAGE_PMD_NR;
2024 		anon_vma = NULL;
2025 		i_mmap_lock_read(mapping);
2026 	}
2027 
2028 	/*
2029 	 * Racy check whether we can split the page, before freeze_page()
2030 	 * splits the PMDs.
2031 	 */
2032 	if (total_mapcount(head) != page_count(head) - extra_pins - 1) {
2033 		ret = -EBUSY;
2034 		goto out_unlock;
2035 	}
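
	/*
	 * Worked example for the check above (illustrative; assumes an anon
	 * THP mapped by a single PMD and no extra pins such as GUP):
	 * total_mapcount(head) == 1 and page_count(head) == 2 (one reference
	 * held for the page table mapping plus the caller's pin), so
	 * 1 == 2 - 0 - 1 and we may proceed.  Any additional reference breaks
	 * the equality and the split fails with -EBUSY.
	 */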
2036 
2037 	mlocked = PageMlocked(page);
2038 	freeze_page(head);
2039 	VM_BUG_ON_PAGE(compound_mapcount(head), head);
2040 
2041 	/* Make sure the page is not on a per-CPU pagevec, as that takes a pin */
2042 	if (mlocked)
2043 		lru_add_drain();
2044 
2045 	/* prevent PageLRU from going away from under us, and freeze lru stats */
2046 	spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);
2047 
2048 	if (mapping) {
2049 		void **pslot;
2050 
2051 		spin_lock(&mapping->tree_lock);
2052 		pslot = radix_tree_lookup_slot(&mapping->page_tree,
2053 				page_index(head));
2054 		/*
2055 		 * Check if the head page is present in the radix tree.
2056 		 * We assume all tail pages are present too, if the head is there.
2057 		 */
2058 		if (radix_tree_deref_slot_protected(pslot,
2059 					&mapping->tree_lock) != head)
2060 			goto fail;
2061 	}
2062 
2063 	/* Prevent deferred_split_scan() touching ->_refcount */
2064 	spin_lock(&pgdata->split_queue_lock);
2065 	count = page_count(head);
2066 	mapcount = total_mapcount(head);
2067 	if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
2068 		if (!list_empty(page_deferred_list(head))) {
2069 			pgdata->split_queue_len--;
2070 			list_del(page_deferred_list(head));
2071 		}
2072 		if (mapping)
2073 			__dec_node_page_state(page, NR_SHMEM_THPS);
2074 		spin_unlock(&pgdata->split_queue_lock);
2075 		__split_huge_page(page, list, flags);
2076 		ret = 0;
2077 	} else {
2078 		if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
2079 			pr_alert("total_mapcount: %u, page_count(): %u\n",
2080 					mapcount, count);
2081 			if (PageTail(page))
2082 				dump_page(head, NULL);
2083 			dump_page(page, "total_mapcount(head) > 0");
2084 			BUG();
2085 		}
2086 		spin_unlock(&pgdata->split_queue_lock);
2087 fail:		if (mapping)
2088 			spin_unlock(&mapping->tree_lock);
2089 		spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
2090 		unfreeze_page(head);
2091 		ret = -EBUSY;
2092 	}
2093 
2094 out_unlock:
2095 	if (anon_vma) {
2096 		anon_vma_unlock_write(anon_vma);
2097 		put_anon_vma(anon_vma);
2098 	}
2099 	if (mapping)
2100 		i_mmap_unlock_read(mapping);
2101 out:
2102 	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
2103 	return ret;
2104 }
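
/*
 * Illustrative sketch of the calling convention documented above: take your
 * own pin, lock the page, then split.  The helper name try_to_split_one_thp()
 * is made up for illustration; compare deferred_split_scan() and
 * split_huge_pages_set() below, which follow the same pattern.
 */
static int __maybe_unused try_to_split_one_thp(struct page *page)
{
	int ret = -EBUSY;

	page = compound_head(page);
	if (!get_page_unless_zero(page))
		return ret;
	/* Skip hugetlbfs pages and pages that are not (or no longer) THPs */
	if (PageTransHuge(page) && !PageHuge(page) && PageLRU(page)) {
		lock_page(page);
		ret = split_huge_page_to_list(page, NULL);
		unlock_page(page);
	}
	put_page(page);
	return ret;
}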
2105 
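/*
 * Compound page destructor for THPs: prep_transhuge_page() is expected to
 * have set this up via set_compound_page_dtor(), so it runs when the last
 * reference to an unsplit THP is dropped.  Make sure a page still sitting
 * on the deferred split queue is unhooked before it is freed.
 */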
2106 void free_transhuge_page(struct page *page)
2107 {
2108 	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
2109 	unsigned long flags;
2110 
2111 	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
2112 	if (!list_empty(page_deferred_list(page))) {
2113 		pgdata->split_queue_len--;
2114 		list_del(page_deferred_list(page));
2115 	}
2116 	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
2117 	free_compound_page(page);
2118 }
2119 
2120 void deferred_split_huge_page(struct page *page)
2121 {
2122 	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
2123 	unsigned long flags;
2124 
2125 	VM_BUG_ON_PAGE(!PageTransHuge(page), page);
2126 
2127 	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
2128 	if (list_empty(page_deferred_list(page))) {
2129 		count_vm_event(THP_DEFERRED_SPLIT_PAGE);
2130 		list_add_tail(page_deferred_list(page), &pgdata->split_queue);
2131 		pgdata->split_queue_len++;
2132 	}
2133 	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
2134 }
2135 
2136 static unsigned long deferred_split_count(struct shrinker *shrink,
2137 		struct shrink_control *sc)
2138 {
2139 	struct pglist_data *pgdata = NODE_DATA(sc->nid);
2140 	return READ_ONCE(pgdata->split_queue_len);
2141 }
2142 
2143 static unsigned long deferred_split_scan(struct shrinker *shrink,
2144 		struct shrink_control *sc)
2145 {
2146 	struct pglist_data *pgdata = NODE_DATA(sc->nid);
2147 	unsigned long flags;
2148 	LIST_HEAD(list), *pos, *next;
2149 	struct page *page;
2150 	int split = 0;
2151 
2152 	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
2153 	/* Take pin on all head pages to avoid freeing them under us */
2154 	list_for_each_safe(pos, next, &pgdata->split_queue) {
2155 		page = list_entry((void *)pos, struct page, mapping);
2156 		page = compound_head(page);
2157 		if (get_page_unless_zero(page)) {
2158 			list_move(page_deferred_list(page), &list);
2159 		} else {
2160 			/* We lost race with put_compound_page() */
2161 			list_del_init(page_deferred_list(page));
2162 			pgdata->split_queue_len--;
2163 		}
2164 		if (!--sc->nr_to_scan)
2165 			break;
2166 	}
2167 	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
2168 
2169 	list_for_each_safe(pos, next, &list) {
2170 		page = list_entry((void *)pos, struct page, mapping);
2171 		lock_page(page);
2172 		/* split_huge_page() removes page from list on success */
2173 		if (!split_huge_page(page))
2174 			split++;
2175 		unlock_page(page);
2176 		put_page(page);
2177 	}
2178 
2179 	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
2180 	list_splice_tail(&list, &pgdata->split_queue);
2181 	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
2182 
2183 	/*
2184 	 * Stop the shrinker if we didn't split any page and the queue is now
2185 	 * empty. This can happen if the pages were freed under us.
2186 	 */
2187 	if (!split && list_empty(&pgdata->split_queue))
2188 		return SHRINK_STOP;
2189 	return split;
2190 }
2191 
2192 static struct shrinker deferred_split_shrinker = {
2193 	.count_objects = deferred_split_count,
2194 	.scan_objects = deferred_split_scan,
2195 	.seeks = DEFAULT_SEEKS,
2196 	.flags = SHRINKER_NUMA_AWARE,
2197 };
2198 
2199 #ifdef CONFIG_DEBUG_FS
2200 static int split_huge_pages_set(void *data, u64 val)
2201 {
2202 	struct zone *zone;
2203 	struct page *page;
2204 	unsigned long pfn, max_zone_pfn;
2205 	unsigned long total = 0, split = 0;
2206 
2207 	if (val != 1)
2208 		return -EINVAL;
2209 
2210 	for_each_populated_zone(zone) {
2211 		max_zone_pfn = zone_end_pfn(zone);
2212 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
2213 			if (!pfn_valid(pfn))
2214 				continue;
2215 
2216 			page = pfn_to_page(pfn);
2217 			if (!get_page_unless_zero(page))
2218 				continue;
2219 
2220 			if (zone != page_zone(page))
2221 				goto next;
2222 
2223 			if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
2224 				goto next;
2225 
2226 			total++;
2227 			lock_page(page);
2228 			if (!split_huge_page(page))
2229 				split++;
2230 			unlock_page(page);
2231 next:
2232 			put_page(page);
2233 		}
2234 	}
2235 
2236 	pr_info("%lu of %lu THP split\n", split, total);
2237 
2238 	return 0;
2239 }
2240 DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
2241 		"%llu\n");
2242 
2243 static int __init split_huge_pages_debugfs(void)
2244 {
2245 	void *ret;
2246 
2247 	ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
2248 			&split_huge_pages_fops);
2249 	if (!ret)
2250 		pr_warn("Failed to create split_huge_pages in debugfs");
2251 	return 0;
2252 }
2253 late_initcall(split_huge_pages_debugfs);
2254 #endif
2255