xref: /openbmc/linux/mm/hugetlb.c (revision d75c6af9)
1457c8996SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
21da177e4SLinus Torvalds /*
31da177e4SLinus Torvalds  * Generic hugetlb support.
46d49e352SNadia Yvette Chambers  * (C) Nadia Yvette Chambers, April 2004
51da177e4SLinus Torvalds  */
61da177e4SLinus Torvalds #include <linux/list.h>
71da177e4SLinus Torvalds #include <linux/init.h>
81da177e4SLinus Torvalds #include <linux/mm.h>
9e1759c21SAlexey Dobriyan #include <linux/seq_file.h>
101da177e4SLinus Torvalds #include <linux/sysctl.h>
111da177e4SLinus Torvalds #include <linux/highmem.h>
12cddb8a5cSAndrea Arcangeli #include <linux/mmu_notifier.h>
131da177e4SLinus Torvalds #include <linux/nodemask.h>
1463551ae0SDavid Gibson #include <linux/pagemap.h>
155da7ca86SChristoph Lameter #include <linux/mempolicy.h>
163b32123dSGideon Israel Dsouza #include <linux/compiler.h>
17aea47ff3SChristoph Lameter #include <linux/cpuset.h>
183935baa9SDavid Gibson #include <linux/mutex.h>
1997ad1087SMike Rapoport #include <linux/memblock.h>
20a3437870SNishanth Aravamudan #include <linux/sysfs.h>
215a0e3ad6STejun Heo #include <linux/slab.h>
2263489f8eSMike Kravetz #include <linux/mmdebug.h>
23174cd4b1SIngo Molnar #include <linux/sched/signal.h>
240fe6e20bSNaoya Horiguchi #include <linux/rmap.h>
25c6247f72SMatthew Wilcox #include <linux/string_helpers.h>
26fd6a03edSNaoya Horiguchi #include <linux/swap.h>
27fd6a03edSNaoya Horiguchi #include <linux/swapops.h>
288382d914SDavidlohr Bueso #include <linux/jhash.h>
2998fa15f3SAnshuman Khandual #include <linux/numa.h>
30d6606683SLinus Torvalds 
3163551ae0SDavid Gibson #include <asm/page.h>
3263551ae0SDavid Gibson #include <asm/pgtable.h>
3324669e58SAneesh Kumar K.V #include <asm/tlb.h>
3463551ae0SDavid Gibson 
3524669e58SAneesh Kumar K.V #include <linux/io.h>
3663551ae0SDavid Gibson #include <linux/hugetlb.h>
379dd540e2SAneesh Kumar K.V #include <linux/hugetlb_cgroup.h>
389a305230SLee Schermerhorn #include <linux/node.h>
391a1aad8aSMike Kravetz #include <linux/userfaultfd_k.h>
40ab5ac90aSMichal Hocko #include <linux/page_owner.h>
417835e98bSNick Piggin #include "internal.h"
421da177e4SLinus Torvalds 
43c3f38a38SAneesh Kumar K.V int hugetlb_max_hstate __read_mostly;
44e5ff2159SAndi Kleen unsigned int default_hstate_idx;
45e5ff2159SAndi Kleen struct hstate hstates[HUGE_MAX_HSTATE];
46641844f5SNaoya Horiguchi /*
47641844f5SNaoya Horiguchi  * Minimum page order among possible hugepage sizes, set to a proper value
48641844f5SNaoya Horiguchi  * at boot time.
49641844f5SNaoya Horiguchi  */
50641844f5SNaoya Horiguchi static unsigned int minimum_order __read_mostly = UINT_MAX;
51e5ff2159SAndi Kleen 
5253ba51d2SJon Tollefson __initdata LIST_HEAD(huge_boot_pages);
5353ba51d2SJon Tollefson 
54e5ff2159SAndi Kleen /* for command line parsing */
55e5ff2159SAndi Kleen static struct hstate * __initdata parsed_hstate;
56e5ff2159SAndi Kleen static unsigned long __initdata default_hstate_max_huge_pages;
57e11bfbfcSNick Piggin static unsigned long __initdata default_hstate_size;
589fee021dSVaishali Thakkar static bool __initdata parsed_valid_hugepagesz = true;
59e5ff2159SAndi Kleen 
603935baa9SDavid Gibson /*
6131caf665SNaoya Horiguchi  * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
6231caf665SNaoya Horiguchi  * free_huge_pages, and surplus_huge_pages.
633935baa9SDavid Gibson  */
64c3f38a38SAneesh Kumar K.V DEFINE_SPINLOCK(hugetlb_lock);
650bd0f9fbSEric Paris 
668382d914SDavidlohr Bueso /*
678382d914SDavidlohr Bueso  * Serializes faults on the same logical page.  This is used to
688382d914SDavidlohr Bueso  * prevent spurious OOMs when the hugepage pool is fully utilized.
698382d914SDavidlohr Bueso  */
708382d914SDavidlohr Bueso static int num_fault_mutexes;
71c672c7f2SMike Kravetz struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
728382d914SDavidlohr Bueso 
737ca02d0aSMike Kravetz /* Forward declaration */
747ca02d0aSMike Kravetz static int hugetlb_acct_memory(struct hstate *h, long delta);
757ca02d0aSMike Kravetz 
7690481622SDavid Gibson static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
7790481622SDavid Gibson {
7890481622SDavid Gibson 	bool free = (spool->count == 0) && (spool->used_hpages == 0);
7990481622SDavid Gibson 
8090481622SDavid Gibson 	spin_unlock(&spool->lock);
8190481622SDavid Gibson 
8290481622SDavid Gibson 	/* If no pages are used, and no other handles to the subpool
837ca02d0aSMike Kravetz 	 * remain, give up any reservations based on minimum size and
847ca02d0aSMike Kravetz 	 * free the subpool */
857ca02d0aSMike Kravetz 	if (free) {
867ca02d0aSMike Kravetz 		if (spool->min_hpages != -1)
877ca02d0aSMike Kravetz 			hugetlb_acct_memory(spool->hstate,
887ca02d0aSMike Kravetz 						-spool->min_hpages);
8990481622SDavid Gibson 		kfree(spool);
9090481622SDavid Gibson 	}
917ca02d0aSMike Kravetz }
9290481622SDavid Gibson 
937ca02d0aSMike Kravetz struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
947ca02d0aSMike Kravetz 						long min_hpages)
9590481622SDavid Gibson {
9690481622SDavid Gibson 	struct hugepage_subpool *spool;
9790481622SDavid Gibson 
98c6a91820SMike Kravetz 	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
9990481622SDavid Gibson 	if (!spool)
10090481622SDavid Gibson 		return NULL;
10190481622SDavid Gibson 
10290481622SDavid Gibson 	spin_lock_init(&spool->lock);
10390481622SDavid Gibson 	spool->count = 1;
1047ca02d0aSMike Kravetz 	spool->max_hpages = max_hpages;
1057ca02d0aSMike Kravetz 	spool->hstate = h;
1067ca02d0aSMike Kravetz 	spool->min_hpages = min_hpages;
1077ca02d0aSMike Kravetz 
1087ca02d0aSMike Kravetz 	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
1097ca02d0aSMike Kravetz 		kfree(spool);
1107ca02d0aSMike Kravetz 		return NULL;
1117ca02d0aSMike Kravetz 	}
1127ca02d0aSMike Kravetz 	spool->rsv_hpages = min_hpages;
11390481622SDavid Gibson 
11490481622SDavid Gibson 	return spool;
11590481622SDavid Gibson }
11690481622SDavid Gibson 
11790481622SDavid Gibson void hugepage_put_subpool(struct hugepage_subpool *spool)
11890481622SDavid Gibson {
11990481622SDavid Gibson 	spin_lock(&spool->lock);
12090481622SDavid Gibson 	BUG_ON(!spool->count);
12190481622SDavid Gibson 	spool->count--;
12290481622SDavid Gibson 	unlock_or_release_subpool(spool);
12390481622SDavid Gibson }
12490481622SDavid Gibson 
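/*
 * A minimal usage sketch for the subpool API above, assuming a caller
 * shaped like hugetlbfs superblock setup and teardown.  "sbinfo",
 * "my_max" and "my_min" are illustrative placeholders, not real symbols.
 *
 *	spool = hugepage_new_subpool(&default_hstate, my_max, my_min);
 *	if (!spool)
 *		return -ENOMEM;		(minimum reservation failed)
 *	sbinfo->spool = spool;
 *
 * At teardown the initial reference is dropped with
 * hugepage_put_subpool(sbinfo->spool); the subpool itself is freed by
 * unlock_or_release_subpool() once its count and used_hpages reach zero.
 */
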
1251c5ecae3SMike Kravetz /*
1261c5ecae3SMike Kravetz  * Subpool accounting for allocating and reserving pages.
1271c5ecae3SMike Kravetz  * Return -ENOMEM if there are not enough resources to satisfy
1281c5ecae3SMike Kravetz  * the request.  Otherwise, return the number of pages by which the
1291c5ecae3SMike Kravetz  * global pools must be adjusted (upward).  The returned value may
1301c5ecae3SMike Kravetz  * only be different than the passed value (delta) in the case where
1311c5ecae3SMike Kravetz  * a subpool minimum size must be maintained.
1321c5ecae3SMike Kravetz  */
1331c5ecae3SMike Kravetz static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
13490481622SDavid Gibson 				      long delta)
13590481622SDavid Gibson {
1361c5ecae3SMike Kravetz 	long ret = delta;
13790481622SDavid Gibson 
13890481622SDavid Gibson 	if (!spool)
1391c5ecae3SMike Kravetz 		return ret;
14090481622SDavid Gibson 
14190481622SDavid Gibson 	spin_lock(&spool->lock);
14290481622SDavid Gibson 
1431c5ecae3SMike Kravetz 	if (spool->max_hpages != -1) {		/* maximum size accounting */
1441c5ecae3SMike Kravetz 		if ((spool->used_hpages + delta) <= spool->max_hpages)
1451c5ecae3SMike Kravetz 			spool->used_hpages += delta;
1461c5ecae3SMike Kravetz 		else {
1471c5ecae3SMike Kravetz 			ret = -ENOMEM;
1481c5ecae3SMike Kravetz 			goto unlock_ret;
1491c5ecae3SMike Kravetz 		}
1501c5ecae3SMike Kravetz 	}
1511c5ecae3SMike Kravetz 
15209a95e29SMike Kravetz 	/* minimum size accounting */
15309a95e29SMike Kravetz 	if (spool->min_hpages != -1 && spool->rsv_hpages) {
1541c5ecae3SMike Kravetz 		if (delta > spool->rsv_hpages) {
1551c5ecae3SMike Kravetz 			/*
1561c5ecae3SMike Kravetz 			 * Asking for more reserves than those already taken on
1571c5ecae3SMike Kravetz 			 * behalf of subpool.  Return difference.
1581c5ecae3SMike Kravetz 			 */
1591c5ecae3SMike Kravetz 			ret = delta - spool->rsv_hpages;
1601c5ecae3SMike Kravetz 			spool->rsv_hpages = 0;
1611c5ecae3SMike Kravetz 		} else {
1621c5ecae3SMike Kravetz 			ret = 0;	/* reserves already accounted for */
1631c5ecae3SMike Kravetz 			spool->rsv_hpages -= delta;
1641c5ecae3SMike Kravetz 		}
1651c5ecae3SMike Kravetz 	}
1661c5ecae3SMike Kravetz 
1671c5ecae3SMike Kravetz unlock_ret:
1681c5ecae3SMike Kravetz 	spin_unlock(&spool->lock);
16990481622SDavid Gibson 	return ret;
17090481622SDavid Gibson }
17190481622SDavid Gibson 
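/*
 * A worked example of the minimum size accounting above (illustrative
 * numbers): with min_hpages == 10 and rsv_hpages == 4, a request of
 * delta == 6 consumes the 4 outstanding subpool reserves and returns 2,
 * so only 2 pages must additionally be taken from the global pools.  A
 * request of delta == 3 is covered entirely by the reserves and returns 0.
 */
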
1721c5ecae3SMike Kravetz /*
1731c5ecae3SMike Kravetz  * Subpool accounting for freeing and unreserving pages.
1741c5ecae3SMike Kravetz  * Return the number of global page reservations that must be dropped.
1751c5ecae3SMike Kravetz  * The return value may only be different than the passed value (delta)
1761c5ecae3SMike Kravetz  * in the case where a subpool minimum size must be maintained.
1771c5ecae3SMike Kravetz  */
1781c5ecae3SMike Kravetz static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
17990481622SDavid Gibson 				       long delta)
18090481622SDavid Gibson {
1811c5ecae3SMike Kravetz 	long ret = delta;
1821c5ecae3SMike Kravetz 
18390481622SDavid Gibson 	if (!spool)
1841c5ecae3SMike Kravetz 		return delta;
18590481622SDavid Gibson 
18690481622SDavid Gibson 	spin_lock(&spool->lock);
1871c5ecae3SMike Kravetz 
1881c5ecae3SMike Kravetz 	if (spool->max_hpages != -1)		/* maximum size accounting */
18990481622SDavid Gibson 		spool->used_hpages -= delta;
1901c5ecae3SMike Kravetz 
19109a95e29SMike Kravetz 	 /* minimum size accounting */
19209a95e29SMike Kravetz 	if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
1931c5ecae3SMike Kravetz 		if (spool->rsv_hpages + delta <= spool->min_hpages)
1941c5ecae3SMike Kravetz 			ret = 0;
1951c5ecae3SMike Kravetz 		else
1961c5ecae3SMike Kravetz 			ret = spool->rsv_hpages + delta - spool->min_hpages;
1971c5ecae3SMike Kravetz 
1981c5ecae3SMike Kravetz 		spool->rsv_hpages += delta;
1991c5ecae3SMike Kravetz 		if (spool->rsv_hpages > spool->min_hpages)
2001c5ecae3SMike Kravetz 			spool->rsv_hpages = spool->min_hpages;
2011c5ecae3SMike Kravetz 	}
2021c5ecae3SMike Kravetz 
2031c5ecae3SMike Kravetz 	/*
2041c5ecae3SMike Kravetz 	 * If hugetlbfs_put_super couldn't free spool due to an outstanding
2051c5ecae3SMike Kravetz 	 * quota reference, free it now.
2061c5ecae3SMike Kravetz 	 */
20790481622SDavid Gibson 	unlock_or_release_subpool(spool);
2081c5ecae3SMike Kravetz 
2091c5ecae3SMike Kravetz 	return ret;
21090481622SDavid Gibson }
21190481622SDavid Gibson 
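/*
 * A worked example of the accounting above (illustrative numbers): with
 * min_hpages == 10, rsv_hpages == 0 and used_hpages dropping from 12 to
 * 9, putting delta == 3 pages leaves the subpool below its minimum, so
 * all 3 pages are retained as subpool reserves (rsv_hpages becomes 3)
 * and 0 is returned; no global reservations are dropped.
 */
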
21290481622SDavid Gibson static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
21390481622SDavid Gibson {
21490481622SDavid Gibson 	return HUGETLBFS_SB(inode->i_sb)->spool;
21590481622SDavid Gibson }
21690481622SDavid Gibson 
21790481622SDavid Gibson static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
21890481622SDavid Gibson {
219496ad9aaSAl Viro 	return subpool_inode(file_inode(vma->vm_file));
22090481622SDavid Gibson }
22190481622SDavid Gibson 
222e7c4b0bfSAndy Whitcroft /*
22396822904SAndy Whitcroft  * Region tracking -- allows tracking of reservations and instantiated pages
22496822904SAndy Whitcroft  *                    across the pages in a mapping.
22584afd99bSAndy Whitcroft  *
2261dd308a7SMike Kravetz  * The region data structures are embedded into a resv_map and protected
2271dd308a7SMike Kravetz  * by a resv_map's lock.  The set of regions within the resv_map represent
2281dd308a7SMike Kravetz  * reservations for huge pages, or huge pages that have already been
2291dd308a7SMike Kravetz  * instantiated within the map.  The from and to elements are huge page
2301dd308a7SMike Kravetz  * indices into the associated mapping.  from indicates the starting index
2311dd308a7SMike Kravetz  * of the region.  to represents the first index past the end of the region.
2321dd308a7SMike Kravetz  *
2331dd308a7SMike Kravetz  * For example, a file region structure with from == 0 and to == 4 represents
2341dd308a7SMike Kravetz  * four huge pages in a mapping.  It is important to note that the to element
2351dd308a7SMike Kravetz  * represents the first element past the end of the region. This is used in
2361dd308a7SMike Kravetz  * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
2371dd308a7SMike Kravetz  *
2381dd308a7SMike Kravetz  * Interval notation of the form [from, to) will be used to indicate that
2391dd308a7SMike Kravetz  * the endpoint from is inclusive and to is exclusive.
24096822904SAndy Whitcroft  */
24196822904SAndy Whitcroft struct file_region {
24296822904SAndy Whitcroft 	struct list_head link;
24396822904SAndy Whitcroft 	long from;
24496822904SAndy Whitcroft 	long to;
24596822904SAndy Whitcroft };
24696822904SAndy Whitcroft 
247d75c6af9SMina Almasry /* Must be called with resv->lock held. Calling this with count_only == true
248d75c6af9SMina Almasry  * will count the number of pages to be added but will not modify the linked
249d75c6af9SMina Almasry  * list.
250d75c6af9SMina Almasry  */
251d75c6af9SMina Almasry static long add_reservation_in_range(struct resv_map *resv, long f, long t,
252d75c6af9SMina Almasry 				     bool count_only)
253d75c6af9SMina Almasry {
254d75c6af9SMina Almasry 	long chg = 0;
255d75c6af9SMina Almasry 	struct list_head *head = &resv->regions;
256d75c6af9SMina Almasry 	struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;
257d75c6af9SMina Almasry 
258d75c6af9SMina Almasry 	/* Locate the region we are before or in. */
259d75c6af9SMina Almasry 	list_for_each_entry(rg, head, link)
260d75c6af9SMina Almasry 		if (f <= rg->to)
261d75c6af9SMina Almasry 			break;
262d75c6af9SMina Almasry 
263d75c6af9SMina Almasry 	/* Round our left edge to the current segment if it encloses us. */
264d75c6af9SMina Almasry 	if (f > rg->from)
265d75c6af9SMina Almasry 		f = rg->from;
266d75c6af9SMina Almasry 
267d75c6af9SMina Almasry 	chg = t - f;
268d75c6af9SMina Almasry 
269d75c6af9SMina Almasry 	/* Check for and consume any regions we now overlap with. */
270d75c6af9SMina Almasry 	nrg = rg;
271d75c6af9SMina Almasry 	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
272d75c6af9SMina Almasry 		if (&rg->link == head)
273d75c6af9SMina Almasry 			break;
274d75c6af9SMina Almasry 		if (rg->from > t)
275d75c6af9SMina Almasry 			break;
276d75c6af9SMina Almasry 
277d75c6af9SMina Almasry 		/* We overlap with this area, if it extends further than
278d75c6af9SMina Almasry 		 * us then we must extend ourselves.  Account for its
279d75c6af9SMina Almasry 		 * existing reservation.
280d75c6af9SMina Almasry 		 */
281d75c6af9SMina Almasry 		if (rg->to > t) {
282d75c6af9SMina Almasry 			chg += rg->to - t;
283d75c6af9SMina Almasry 			t = rg->to;
284d75c6af9SMina Almasry 		}
285d75c6af9SMina Almasry 		chg -= rg->to - rg->from;
286d75c6af9SMina Almasry 
287d75c6af9SMina Almasry 		if (!count_only && rg != nrg) {
288d75c6af9SMina Almasry 			list_del(&rg->link);
289d75c6af9SMina Almasry 			kfree(rg);
290d75c6af9SMina Almasry 		}
291d75c6af9SMina Almasry 	}
292d75c6af9SMina Almasry 
293d75c6af9SMina Almasry 	if (!count_only) {
294d75c6af9SMina Almasry 		nrg->from = f;
295d75c6af9SMina Almasry 		nrg->to = t;
296d75c6af9SMina Almasry 	}
297d75c6af9SMina Almasry 
298d75c6af9SMina Almasry 	return chg;
299d75c6af9SMina Almasry }
300d75c6af9SMina Almasry 
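/*
 * An example of the coalescing above (illustrative): if the map holds
 * regions [0, 2) and [3, 5) and the caller asks for [1, 4), the two
 * entries are merged into a single region [0, 5), the now-redundant
 * [3, 5) descriptor is freed, and the returned charge is 1 (only page 2
 * was not previously covered).  With count_only == true the same value
 * is computed but the list is left unmodified.
 */
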
3011dd308a7SMike Kravetz /*
3021dd308a7SMike Kravetz  * Add the huge page range represented by [f, t) to the reserve
3035c911954SMina Almasry  * map.  Existing regions will be expanded to accommodate the specified
3045c911954SMina Almasry  * range, or a region will be taken from the cache.  Sufficient regions
3055c911954SMina Almasry  * must exist in the cache due to the previous call to region_chg with
3065c911954SMina Almasry  * the same range.
307cf3ad20bSMike Kravetz  *
308cf3ad20bSMike Kravetz  * Return the number of new huge pages added to the map.  This
309cf3ad20bSMike Kravetz  * number is greater than or equal to zero.
3101dd308a7SMike Kravetz  */
3111406ec9bSJoonsoo Kim static long region_add(struct resv_map *resv, long f, long t)
31296822904SAndy Whitcroft {
3131406ec9bSJoonsoo Kim 	struct list_head *head = &resv->regions;
314d75c6af9SMina Almasry 	struct file_region *rg, *nrg;
315cf3ad20bSMike Kravetz 	long add = 0;
31696822904SAndy Whitcroft 
3177b24d861SDavidlohr Bueso 	spin_lock(&resv->lock);
31896822904SAndy Whitcroft 	/* Locate the region we are either in or before. */
31996822904SAndy Whitcroft 	list_for_each_entry(rg, head, link)
32096822904SAndy Whitcroft 		if (f <= rg->to)
32196822904SAndy Whitcroft 			break;
32296822904SAndy Whitcroft 
3235e911373SMike Kravetz 	/*
3245e911373SMike Kravetz 	 * If no region exists which can be expanded to include the
3255c911954SMina Almasry 	 * specified range, pull a region descriptor from the cache
3265c911954SMina Almasry 	 * and use it for this range.
3275e911373SMike Kravetz 	 */
3285e911373SMike Kravetz 	if (&rg->link == head || t < rg->from) {
3295e911373SMike Kravetz 		VM_BUG_ON(resv->region_cache_count <= 0);
3305e911373SMike Kravetz 
3315e911373SMike Kravetz 		resv->region_cache_count--;
3325e911373SMike Kravetz 		nrg = list_first_entry(&resv->region_cache, struct file_region,
3335e911373SMike Kravetz 					link);
3345e911373SMike Kravetz 		list_del(&nrg->link);
3355e911373SMike Kravetz 
3365e911373SMike Kravetz 		nrg->from = f;
3375e911373SMike Kravetz 		nrg->to = t;
3385e911373SMike Kravetz 		list_add(&nrg->link, rg->link.prev);
3395e911373SMike Kravetz 
3405e911373SMike Kravetz 		add += t - f;
3415e911373SMike Kravetz 		goto out_locked;
3425e911373SMike Kravetz 	}
3435e911373SMike Kravetz 
344d75c6af9SMina Almasry 	add = add_reservation_in_range(resv, f, t, false);
345cf3ad20bSMike Kravetz 
3465e911373SMike Kravetz out_locked:
3475e911373SMike Kravetz 	resv->adds_in_progress--;
3487b24d861SDavidlohr Bueso 	spin_unlock(&resv->lock);
349cf3ad20bSMike Kravetz 	VM_BUG_ON(add < 0);
350cf3ad20bSMike Kravetz 	return add;
35196822904SAndy Whitcroft }
35296822904SAndy Whitcroft 
3531dd308a7SMike Kravetz /*
3541dd308a7SMike Kravetz  * Examine the existing reserve map and determine how many
3551dd308a7SMike Kravetz  * huge pages in the specified range [f, t) are NOT currently
3561dd308a7SMike Kravetz  * represented.  This routine is called before a subsequent
3571dd308a7SMike Kravetz  * call to region_add that will actually modify the reserve
3581dd308a7SMike Kravetz  * map to add the specified range [f, t).  region_chg does
3591dd308a7SMike Kravetz  * not change the number of huge pages represented by the
3605c911954SMina Almasry  * map.  A new file_region structure is added to the cache
3615c911954SMina Almasry  * as a placeholder, so that the subsequent region_add
3625c911954SMina Almasry  * call will have all the regions it needs and will not fail.
3635e911373SMike Kravetz  *
3645e911373SMike Kravetz  * Returns the number of huge pages that need to be added to the existing
3655e911373SMike Kravetz  * reservation map for the range [f, t).  This number is greater than or
3665e911373SMike Kravetz  * equal to zero.  -ENOMEM is returned if a new file_region structure or
3675e911373SMike Kravetz  * cache entry is needed and cannot be allocated.
3681dd308a7SMike Kravetz  */
3691406ec9bSJoonsoo Kim static long region_chg(struct resv_map *resv, long f, long t)
37096822904SAndy Whitcroft {
37196822904SAndy Whitcroft 	long chg = 0;
37296822904SAndy Whitcroft 
3737b24d861SDavidlohr Bueso 	spin_lock(&resv->lock);
3745e911373SMike Kravetz retry_locked:
3755e911373SMike Kravetz 	resv->adds_in_progress++;
3765e911373SMike Kravetz 
3775e911373SMike Kravetz 	/*
3785e911373SMike Kravetz 	 * Check for sufficient descriptors in the cache to accommodate
3795e911373SMike Kravetz 	 * the number of in progress add operations.
3805e911373SMike Kravetz 	 */
3815e911373SMike Kravetz 	if (resv->adds_in_progress > resv->region_cache_count) {
3825e911373SMike Kravetz 		struct file_region *trg;
3835e911373SMike Kravetz 
3845e911373SMike Kravetz 		VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
3855e911373SMike Kravetz 		/* Must drop lock to allocate a new descriptor. */
3865e911373SMike Kravetz 		resv->adds_in_progress--;
3875e911373SMike Kravetz 		spin_unlock(&resv->lock);
3885e911373SMike Kravetz 
3895e911373SMike Kravetz 		trg = kmalloc(sizeof(*trg), GFP_KERNEL);
3905c911954SMina Almasry 		if (!trg)
3915e911373SMike Kravetz 			return -ENOMEM;
3925e911373SMike Kravetz 
3935e911373SMike Kravetz 		spin_lock(&resv->lock);
3945e911373SMike Kravetz 		list_add(&trg->link, &resv->region_cache);
3955e911373SMike Kravetz 		resv->region_cache_count++;
3965e911373SMike Kravetz 		goto retry_locked;
3975e911373SMike Kravetz 	}
3985e911373SMike Kravetz 
399d75c6af9SMina Almasry 	chg = add_reservation_in_range(resv, f, t, true);
40096822904SAndy Whitcroft 
4017b24d861SDavidlohr Bueso 	spin_unlock(&resv->lock);
40296822904SAndy Whitcroft 	return chg;
40396822904SAndy Whitcroft }
40496822904SAndy Whitcroft 
4051dd308a7SMike Kravetz /*
4065e911373SMike Kravetz  * Abort the in progress add operation.  The adds_in_progress field
4075e911373SMike Kravetz  * of the resv_map keeps track of the operations in progress between
4085e911373SMike Kravetz  * calls to region_chg and region_add.  Operations are sometimes
4095e911373SMike Kravetz  * aborted after the call to region_chg.  In such cases, region_abort
4105e911373SMike Kravetz  * is called to decrement the adds_in_progress counter.
4115e911373SMike Kravetz  *
4125e911373SMike Kravetz  * NOTE: The range arguments [f, t) are not needed or used in this
4135e911373SMike Kravetz  * routine.  They are kept to make reading the calling code easier as
4145e911373SMike Kravetz  * arguments will match the associated region_chg call.
4155e911373SMike Kravetz  */
4165e911373SMike Kravetz static void region_abort(struct resv_map *resv, long f, long t)
4175e911373SMike Kravetz {
4185e911373SMike Kravetz 	spin_lock(&resv->lock);
4195e911373SMike Kravetz 	VM_BUG_ON(!resv->region_cache_count);
4205e911373SMike Kravetz 	resv->adds_in_progress--;
4215e911373SMike Kravetz 	spin_unlock(&resv->lock);
4225e911373SMike Kravetz }
4235e911373SMike Kravetz 
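/*
 * The three routines above form a two-phase protocol.  A simplified
 * sketch of the calling pattern for reserving [f, t) against a
 * resv_map *resv (the vma_*_reservation() helpers elsewhere in this
 * file follow this shape):
 *
 *	chg = region_chg(resv, f, t);	count pages, reserve a descriptor
 *	if (chg < 0)
 *		return -ENOMEM;
 *	...charge subpool/global pools by chg...
 *	if (the charge failed)
 *		region_abort(resv, f, t);	back out adds_in_progress
 *	else
 *		region_add(resv, f, t);		commit the range to the map
 */
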
4245e911373SMike Kravetz /*
425feba16e2SMike Kravetz  * Delete the specified range [f, t) from the reserve map.  If the
426feba16e2SMike Kravetz  * t parameter is LONG_MAX, this indicates that ALL regions after f
427feba16e2SMike Kravetz  * should be deleted.  Locate the regions which intersect [f, t)
428feba16e2SMike Kravetz  * and either trim, delete or split the existing regions.
429feba16e2SMike Kravetz  *
430feba16e2SMike Kravetz  * Returns the number of huge pages deleted from the reserve map.
431feba16e2SMike Kravetz  * In the normal case, the return value is zero or more.  In the
432feba16e2SMike Kravetz  * case where a region must be split, a new region descriptor must
433feba16e2SMike Kravetz  * be allocated.  If the allocation fails, -ENOMEM will be returned.
434feba16e2SMike Kravetz  * NOTE: If the parameter t == LONG_MAX, then we will never split
435feba16e2SMike Kravetz  * a region and possibly return -ENOMEM.  Callers specifying
436feba16e2SMike Kravetz  * t == LONG_MAX do not need to check for -ENOMEM error.
4371dd308a7SMike Kravetz  */
438feba16e2SMike Kravetz static long region_del(struct resv_map *resv, long f, long t)
43996822904SAndy Whitcroft {
4401406ec9bSJoonsoo Kim 	struct list_head *head = &resv->regions;
44196822904SAndy Whitcroft 	struct file_region *rg, *trg;
442feba16e2SMike Kravetz 	struct file_region *nrg = NULL;
443feba16e2SMike Kravetz 	long del = 0;
44496822904SAndy Whitcroft 
445feba16e2SMike Kravetz retry:
4467b24d861SDavidlohr Bueso 	spin_lock(&resv->lock);
447feba16e2SMike Kravetz 	list_for_each_entry_safe(rg, trg, head, link) {
448dbe409e4SMike Kravetz 		/*
449dbe409e4SMike Kravetz 		 * Skip regions before the range to be deleted.  file_region
450dbe409e4SMike Kravetz 		 * ranges are normally of the form [from, to).  However, there
451dbe409e4SMike Kravetz 		 * may be a "placeholder" entry in the map which is of the form
452dbe409e4SMike Kravetz 		 * (from, to) with from == to.  Check for placeholder entries
453dbe409e4SMike Kravetz 		 * at the beginning of the range to be deleted.
454dbe409e4SMike Kravetz 		 */
455dbe409e4SMike Kravetz 		if (rg->to <= f && (rg->to != rg->from || rg->to != f))
456feba16e2SMike Kravetz 			continue;
457dbe409e4SMike Kravetz 
458feba16e2SMike Kravetz 		if (rg->from >= t)
45996822904SAndy Whitcroft 			break;
46096822904SAndy Whitcroft 
461feba16e2SMike Kravetz 		if (f > rg->from && t < rg->to) { /* Must split region */
462feba16e2SMike Kravetz 			/*
463feba16e2SMike Kravetz 			 * Check for an entry in the cache before dropping
464feba16e2SMike Kravetz 			 * lock and attempting allocation.
465feba16e2SMike Kravetz 			 */
466feba16e2SMike Kravetz 			if (!nrg &&
467feba16e2SMike Kravetz 			    resv->region_cache_count > resv->adds_in_progress) {
468feba16e2SMike Kravetz 				nrg = list_first_entry(&resv->region_cache,
469feba16e2SMike Kravetz 							struct file_region,
470feba16e2SMike Kravetz 							link);
471feba16e2SMike Kravetz 				list_del(&nrg->link);
472feba16e2SMike Kravetz 				resv->region_cache_count--;
47396822904SAndy Whitcroft 			}
47496822904SAndy Whitcroft 
475feba16e2SMike Kravetz 			if (!nrg) {
476feba16e2SMike Kravetz 				spin_unlock(&resv->lock);
477feba16e2SMike Kravetz 				nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
478feba16e2SMike Kravetz 				if (!nrg)
479feba16e2SMike Kravetz 					return -ENOMEM;
480feba16e2SMike Kravetz 				goto retry;
481feba16e2SMike Kravetz 			}
482feba16e2SMike Kravetz 
483feba16e2SMike Kravetz 			del += t - f;
484feba16e2SMike Kravetz 
485feba16e2SMike Kravetz 			/* New entry for end of split region */
486feba16e2SMike Kravetz 			nrg->from = t;
487feba16e2SMike Kravetz 			nrg->to = rg->to;
488feba16e2SMike Kravetz 			INIT_LIST_HEAD(&nrg->link);
489feba16e2SMike Kravetz 
490feba16e2SMike Kravetz 			/* Original entry is trimmed */
491feba16e2SMike Kravetz 			rg->to = f;
492feba16e2SMike Kravetz 
493feba16e2SMike Kravetz 			list_add(&nrg->link, &rg->link);
494feba16e2SMike Kravetz 			nrg = NULL;
49596822904SAndy Whitcroft 			break;
496feba16e2SMike Kravetz 		}
497feba16e2SMike Kravetz 
498feba16e2SMike Kravetz 		if (f <= rg->from && t >= rg->to) { /* Remove entire region */
499feba16e2SMike Kravetz 			del += rg->to - rg->from;
50096822904SAndy Whitcroft 			list_del(&rg->link);
50196822904SAndy Whitcroft 			kfree(rg);
502feba16e2SMike Kravetz 			continue;
50396822904SAndy Whitcroft 		}
5047b24d861SDavidlohr Bueso 
505feba16e2SMike Kravetz 		if (f <= rg->from) {	/* Trim beginning of region */
506feba16e2SMike Kravetz 			del += t - rg->from;
507feba16e2SMike Kravetz 			rg->from = t;
508feba16e2SMike Kravetz 		} else {		/* Trim end of region */
509feba16e2SMike Kravetz 			del += rg->to - f;
510feba16e2SMike Kravetz 			rg->to = f;
511feba16e2SMike Kravetz 		}
512feba16e2SMike Kravetz 	}
513feba16e2SMike Kravetz 
5147b24d861SDavidlohr Bueso 	spin_unlock(&resv->lock);
515feba16e2SMike Kravetz 	kfree(nrg);
516feba16e2SMike Kravetz 	return del;
51796822904SAndy Whitcroft }
51896822904SAndy Whitcroft 
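/*
 * A worked example of the split case above (illustrative): deleting
 * [2, 3) from a map containing [0, 5) trims the existing entry to
 * [0, 2), inserts a new descriptor [3, 5) after it, and returns 1.
 * Deleting [0, LONG_MAX) instead removes the entry outright and
 * returns 5; no new descriptor is needed, so -ENOMEM cannot occur.
 */
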
5191dd308a7SMike Kravetz /*
520b5cec28dSMike Kravetz  * A rare out of memory error was encountered which prevented removal of
521b5cec28dSMike Kravetz  * the reserve map region for a page.  The huge page itself was freed
522b5cec28dSMike Kravetz  * and removed from the page cache.  This routine will adjust the subpool
523b5cec28dSMike Kravetz  * usage count, and the global reserve count if needed.  By incrementing
524b5cec28dSMike Kravetz  * these counts, the reserve map entry which could not be deleted will
525b5cec28dSMike Kravetz  * appear as a "reserved" entry instead of simply dangling with incorrect
526b5cec28dSMike Kravetz  * counts.
527b5cec28dSMike Kravetz  */
52872e2936cSzhong jiang void hugetlb_fix_reserve_counts(struct inode *inode)
529b5cec28dSMike Kravetz {
530b5cec28dSMike Kravetz 	struct hugepage_subpool *spool = subpool_inode(inode);
531b5cec28dSMike Kravetz 	long rsv_adjust;
532b5cec28dSMike Kravetz 
533b5cec28dSMike Kravetz 	rsv_adjust = hugepage_subpool_get_pages(spool, 1);
53472e2936cSzhong jiang 	if (rsv_adjust) {
535b5cec28dSMike Kravetz 		struct hstate *h = hstate_inode(inode);
536b5cec28dSMike Kravetz 
537b5cec28dSMike Kravetz 		hugetlb_acct_memory(h, 1);
538b5cec28dSMike Kravetz 	}
539b5cec28dSMike Kravetz }
540b5cec28dSMike Kravetz 
541b5cec28dSMike Kravetz /*
5421dd308a7SMike Kravetz  * Count and return the number of huge pages in the reserve map
5431dd308a7SMike Kravetz  * that intersect with the range [f, t).
5441dd308a7SMike Kravetz  */
5451406ec9bSJoonsoo Kim static long region_count(struct resv_map *resv, long f, long t)
54684afd99bSAndy Whitcroft {
5471406ec9bSJoonsoo Kim 	struct list_head *head = &resv->regions;
54884afd99bSAndy Whitcroft 	struct file_region *rg;
54984afd99bSAndy Whitcroft 	long chg = 0;
55084afd99bSAndy Whitcroft 
5517b24d861SDavidlohr Bueso 	spin_lock(&resv->lock);
55284afd99bSAndy Whitcroft 	/* Locate each segment we overlap with, and count that overlap. */
55384afd99bSAndy Whitcroft 	list_for_each_entry(rg, head, link) {
554f2135a4aSWang Sheng-Hui 		long seg_from;
555f2135a4aSWang Sheng-Hui 		long seg_to;
55684afd99bSAndy Whitcroft 
55784afd99bSAndy Whitcroft 		if (rg->to <= f)
55884afd99bSAndy Whitcroft 			continue;
55984afd99bSAndy Whitcroft 		if (rg->from >= t)
56084afd99bSAndy Whitcroft 			break;
56184afd99bSAndy Whitcroft 
56284afd99bSAndy Whitcroft 		seg_from = max(rg->from, f);
56384afd99bSAndy Whitcroft 		seg_to = min(rg->to, t);
56484afd99bSAndy Whitcroft 
56584afd99bSAndy Whitcroft 		chg += seg_to - seg_from;
56684afd99bSAndy Whitcroft 	}
5677b24d861SDavidlohr Bueso 	spin_unlock(&resv->lock);
56884afd99bSAndy Whitcroft 
56984afd99bSAndy Whitcroft 	return chg;
57084afd99bSAndy Whitcroft }
57184afd99bSAndy Whitcroft 
57296822904SAndy Whitcroft /*
573e7c4b0bfSAndy Whitcroft  * Convert the address within this vma to the page offset within
574e7c4b0bfSAndy Whitcroft  * the mapping, in pagecache page units; huge pages here.
575e7c4b0bfSAndy Whitcroft  */
576a5516438SAndi Kleen static pgoff_t vma_hugecache_offset(struct hstate *h,
577a5516438SAndi Kleen 			struct vm_area_struct *vma, unsigned long address)
578e7c4b0bfSAndy Whitcroft {
579a5516438SAndi Kleen 	return ((address - vma->vm_start) >> huge_page_shift(h)) +
580a5516438SAndi Kleen 			(vma->vm_pgoff >> huge_page_order(h));
581e7c4b0bfSAndy Whitcroft }
582e7c4b0bfSAndy Whitcroft 
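/*
 * A worked example (illustrative, assuming 2MB huge pages, i.e.
 * huge_page_shift() == 21 and huge_page_order() == 9): for a vma with
 * vm_start == 0x40000000 and vm_pgoff == 512, address 0x40400000 yields
 * ((0x40400000 - 0x40000000) >> 21) + (512 >> 9) == 2 + 1 == 3,
 * i.e. the fourth huge-page-sized unit of the backing file.
 */
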
5830fe6e20bSNaoya Horiguchi pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
5840fe6e20bSNaoya Horiguchi 				     unsigned long address)
5850fe6e20bSNaoya Horiguchi {
5860fe6e20bSNaoya Horiguchi 	return vma_hugecache_offset(hstate_vma(vma), vma, address);
5870fe6e20bSNaoya Horiguchi }
588dee41079SDan Williams EXPORT_SYMBOL_GPL(linear_hugepage_index);
5890fe6e20bSNaoya Horiguchi 
59084afd99bSAndy Whitcroft /*
59108fba699SMel Gorman  * Return the size of the pages allocated when backing a VMA. In the majority
59208fba699SMel Gorman  * of cases this will be the same size as that used by the page table entries.
59308fba699SMel Gorman  */
59408fba699SMel Gorman unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
59508fba699SMel Gorman {
59605ea8860SDan Williams 	if (vma->vm_ops && vma->vm_ops->pagesize)
59705ea8860SDan Williams 		return vma->vm_ops->pagesize(vma);
59808fba699SMel Gorman 	return PAGE_SIZE;
59908fba699SMel Gorman }
600f340ca0fSJoerg Roedel EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
60108fba699SMel Gorman 
60208fba699SMel Gorman /*
6033340289dSMel Gorman  * Return the page size being used by the MMU to back a VMA. In the majority
6043340289dSMel Gorman  * of cases, the page size used by the kernel matches the MMU size. On
60509135cc5SDan Williams  * architectures where it differs, an architecture-specific 'strong'
60609135cc5SDan Williams  * version of this symbol is required.
6073340289dSMel Gorman  */
60809135cc5SDan Williams __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
6093340289dSMel Gorman {
6103340289dSMel Gorman 	return vma_kernel_pagesize(vma);
6113340289dSMel Gorman }
6123340289dSMel Gorman 
6133340289dSMel Gorman /*
61484afd99bSAndy Whitcroft  * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
61584afd99bSAndy Whitcroft  * bits of the reservation map pointer, which are always clear due to
61684afd99bSAndy Whitcroft  * alignment.
61784afd99bSAndy Whitcroft  */
61884afd99bSAndy Whitcroft #define HPAGE_RESV_OWNER    (1UL << 0)
61984afd99bSAndy Whitcroft #define HPAGE_RESV_UNMAPPED (1UL << 1)
62004f2cbe3SMel Gorman #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
62184afd99bSAndy Whitcroft 
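/*
 * Illustration of the flag encoding above (hypothetical value): since a
 * kmalloc'ed resv_map is at least pointer aligned, a private vma's
 * vm_private_data of, say, 0xffff888012345679 decodes as the resv_map
 * pointer 0xffff888012345678 (value & ~HPAGE_RESV_MASK) with
 * HPAGE_RESV_OWNER set among the low bits (value & HPAGE_RESV_MASK).
 */
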
622a1e78772SMel Gorman /*
623a1e78772SMel Gorman  * These helpers are used to track how many pages are reserved for
624a1e78772SMel Gorman  * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
625a1e78772SMel Gorman  * is guaranteed to have its future faults succeed.
626a1e78772SMel Gorman  *
627a1e78772SMel Gorman  * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
628a1e78772SMel Gorman  * the reserve counters are updated with the hugetlb_lock held. It is safe
629a1e78772SMel Gorman  * to reset the VMA at fork() time as it is not in use yet and there is no
630a1e78772SMel Gorman  * chance of the global counters getting corrupted as a result of the values.
63184afd99bSAndy Whitcroft  *
63284afd99bSAndy Whitcroft  * The private mapping reservation is represented in a subtly different
63384afd99bSAndy Whitcroft  * manner to a shared mapping.  A shared mapping has a region map associated
63484afd99bSAndy Whitcroft  * with the underlying file; this region map represents the backing file
63584afd99bSAndy Whitcroft  * pages which have ever had a reservation assigned, and this persists even
63684afd99bSAndy Whitcroft  * after the page is instantiated.  A private mapping has a region map
63784afd99bSAndy Whitcroft  * associated with the original mmap which is attached to all VMAs which
63884afd99bSAndy Whitcroft  * reference it; this region map represents those offsets which have consumed
63984afd99bSAndy Whitcroft  * a reservation, i.e. where pages have been instantiated.
640a1e78772SMel Gorman  */
641e7c4b0bfSAndy Whitcroft static unsigned long get_vma_private_data(struct vm_area_struct *vma)
642e7c4b0bfSAndy Whitcroft {
643e7c4b0bfSAndy Whitcroft 	return (unsigned long)vma->vm_private_data;
644e7c4b0bfSAndy Whitcroft }
645e7c4b0bfSAndy Whitcroft 
646e7c4b0bfSAndy Whitcroft static void set_vma_private_data(struct vm_area_struct *vma,
647e7c4b0bfSAndy Whitcroft 							unsigned long value)
648e7c4b0bfSAndy Whitcroft {
649e7c4b0bfSAndy Whitcroft 	vma->vm_private_data = (void *)value;
650e7c4b0bfSAndy Whitcroft }
651e7c4b0bfSAndy Whitcroft 
6529119a41eSJoonsoo Kim struct resv_map *resv_map_alloc(void)
65384afd99bSAndy Whitcroft {
65484afd99bSAndy Whitcroft 	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
6555e911373SMike Kravetz 	struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
6565e911373SMike Kravetz 
6575e911373SMike Kravetz 	if (!resv_map || !rg) {
6585e911373SMike Kravetz 		kfree(resv_map);
6595e911373SMike Kravetz 		kfree(rg);
66084afd99bSAndy Whitcroft 		return NULL;
6615e911373SMike Kravetz 	}
66284afd99bSAndy Whitcroft 
66384afd99bSAndy Whitcroft 	kref_init(&resv_map->refs);
6647b24d861SDavidlohr Bueso 	spin_lock_init(&resv_map->lock);
66584afd99bSAndy Whitcroft 	INIT_LIST_HEAD(&resv_map->regions);
66684afd99bSAndy Whitcroft 
6675e911373SMike Kravetz 	resv_map->adds_in_progress = 0;
6685e911373SMike Kravetz 
6695e911373SMike Kravetz 	INIT_LIST_HEAD(&resv_map->region_cache);
6705e911373SMike Kravetz 	list_add(&rg->link, &resv_map->region_cache);
6715e911373SMike Kravetz 	resv_map->region_cache_count = 1;
6725e911373SMike Kravetz 
67384afd99bSAndy Whitcroft 	return resv_map;
67484afd99bSAndy Whitcroft }
67584afd99bSAndy Whitcroft 
6769119a41eSJoonsoo Kim void resv_map_release(struct kref *ref)
67784afd99bSAndy Whitcroft {
67884afd99bSAndy Whitcroft 	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
6795e911373SMike Kravetz 	struct list_head *head = &resv_map->region_cache;
6805e911373SMike Kravetz 	struct file_region *rg, *trg;
68184afd99bSAndy Whitcroft 
68284afd99bSAndy Whitcroft 	/* Clear out any active regions before we release the map. */
683feba16e2SMike Kravetz 	region_del(resv_map, 0, LONG_MAX);
6845e911373SMike Kravetz 
6855e911373SMike Kravetz 	/* ... and any entries left in the cache */
6865e911373SMike Kravetz 	list_for_each_entry_safe(rg, trg, head, link) {
6875e911373SMike Kravetz 		list_del(&rg->link);
6885e911373SMike Kravetz 		kfree(rg);
6895e911373SMike Kravetz 	}
6905e911373SMike Kravetz 
6915e911373SMike Kravetz 	VM_BUG_ON(resv_map->adds_in_progress);
6925e911373SMike Kravetz 
69384afd99bSAndy Whitcroft 	kfree(resv_map);
69484afd99bSAndy Whitcroft }
69584afd99bSAndy Whitcroft 
6964e35f483SJoonsoo Kim static inline struct resv_map *inode_resv_map(struct inode *inode)
6974e35f483SJoonsoo Kim {
698f27a5136SMike Kravetz 	/*
699f27a5136SMike Kravetz 	 * At inode evict time, i_mapping may not point to the original
700f27a5136SMike Kravetz 	 * address space within the inode.  This original address space
701f27a5136SMike Kravetz 	 * contains the pointer to the resv_map.  So, always use the
702f27a5136SMike Kravetz 	 * address space embedded within the inode.
703f27a5136SMike Kravetz 	 * The VERY common case is inode->mapping == &inode->i_data but,
704f27a5136SMike Kravetz 	 * this may not be true for device special inodes.
705f27a5136SMike Kravetz 	 */
706f27a5136SMike Kravetz 	return (struct resv_map *)(&inode->i_data)->private_data;
7074e35f483SJoonsoo Kim }
7084e35f483SJoonsoo Kim 
70984afd99bSAndy Whitcroft static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
710a1e78772SMel Gorman {
71181d1b09cSSasha Levin 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
7124e35f483SJoonsoo Kim 	if (vma->vm_flags & VM_MAYSHARE) {
7134e35f483SJoonsoo Kim 		struct address_space *mapping = vma->vm_file->f_mapping;
7144e35f483SJoonsoo Kim 		struct inode *inode = mapping->host;
7154e35f483SJoonsoo Kim 
7164e35f483SJoonsoo Kim 		return inode_resv_map(inode);
7174e35f483SJoonsoo Kim 
7184e35f483SJoonsoo Kim 	} else {
71984afd99bSAndy Whitcroft 		return (struct resv_map *)(get_vma_private_data(vma) &
72084afd99bSAndy Whitcroft 							~HPAGE_RESV_MASK);
7214e35f483SJoonsoo Kim 	}
722a1e78772SMel Gorman }
723a1e78772SMel Gorman 
72484afd99bSAndy Whitcroft static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
725a1e78772SMel Gorman {
72681d1b09cSSasha Levin 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
72781d1b09cSSasha Levin 	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
728a1e78772SMel Gorman 
72984afd99bSAndy Whitcroft 	set_vma_private_data(vma, (get_vma_private_data(vma) &
73084afd99bSAndy Whitcroft 				HPAGE_RESV_MASK) | (unsigned long)map);
73104f2cbe3SMel Gorman }
73204f2cbe3SMel Gorman 
73304f2cbe3SMel Gorman static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
73404f2cbe3SMel Gorman {
73581d1b09cSSasha Levin 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
73681d1b09cSSasha Levin 	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
737e7c4b0bfSAndy Whitcroft 
738e7c4b0bfSAndy Whitcroft 	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
73904f2cbe3SMel Gorman }
74004f2cbe3SMel Gorman 
74104f2cbe3SMel Gorman static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
74204f2cbe3SMel Gorman {
74381d1b09cSSasha Levin 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
744e7c4b0bfSAndy Whitcroft 
745e7c4b0bfSAndy Whitcroft 	return (get_vma_private_data(vma) & flag) != 0;
746a1e78772SMel Gorman }
747a1e78772SMel Gorman 
74804f2cbe3SMel Gorman /* Reset counters to 0 and clear all HPAGE_RESV_* flags */
749a1e78772SMel Gorman void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
750a1e78772SMel Gorman {
75181d1b09cSSasha Levin 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
752f83a275dSMel Gorman 	if (!(vma->vm_flags & VM_MAYSHARE))
753a1e78772SMel Gorman 		vma->vm_private_data = (void *)0;
754a1e78772SMel Gorman }
755a1e78772SMel Gorman 
756a1e78772SMel Gorman /* Returns true if the VMA has associated reserve pages */
757559ec2f8SNicholas Krause static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
758a1e78772SMel Gorman {
759af0ed73eSJoonsoo Kim 	if (vma->vm_flags & VM_NORESERVE) {
760af0ed73eSJoonsoo Kim 		/*
761af0ed73eSJoonsoo Kim 		 * This address is already reserved by another process (chg == 0),
762af0ed73eSJoonsoo Kim 		 * so we should decrement the reserved count. Without decrementing,
763af0ed73eSJoonsoo Kim 		 * the reserve count remains after releasing the inode, because the
764af0ed73eSJoonsoo Kim 		 * allocated page will go into the page cache and is regarded as
765af0ed73eSJoonsoo Kim 		 * coming from the reserved pool in the releasing step.  Currently,
766af0ed73eSJoonsoo Kim 		 * we don't have any other solution to deal with this situation
767af0ed73eSJoonsoo Kim 		 * properly, so add a work-around here.
768af0ed73eSJoonsoo Kim 		 */
769af0ed73eSJoonsoo Kim 		if (vma->vm_flags & VM_MAYSHARE && chg == 0)
770559ec2f8SNicholas Krause 			return true;
771af0ed73eSJoonsoo Kim 		else
772559ec2f8SNicholas Krause 			return false;
773af0ed73eSJoonsoo Kim 	}
774a63884e9SJoonsoo Kim 
775a63884e9SJoonsoo Kim 	/* Shared mappings always use reserves */
7761fb1b0e9SMike Kravetz 	if (vma->vm_flags & VM_MAYSHARE) {
7771fb1b0e9SMike Kravetz 		/*
7781fb1b0e9SMike Kravetz 		 * We know VM_NORESERVE is not set.  Therefore, there SHOULD
7791fb1b0e9SMike Kravetz 		 * be a region map for all pages.  The only situation where
7801fb1b0e9SMike Kravetz 		 * there is no region map is if a hole was punched via
7811fb1b0e9SMike Kravetz 		 * fallocate.  In this case, there really are no reserves to
7821fb1b0e9SMike Kravetz 		 * use.  This situation is indicated if chg != 0.
7831fb1b0e9SMike Kravetz 		 */
7841fb1b0e9SMike Kravetz 		if (chg)
7851fb1b0e9SMike Kravetz 			return false;
7861fb1b0e9SMike Kravetz 		else
787559ec2f8SNicholas Krause 			return true;
7881fb1b0e9SMike Kravetz 	}
789a63884e9SJoonsoo Kim 
790a63884e9SJoonsoo Kim 	/*
791a63884e9SJoonsoo Kim 	 * Only the process that called mmap() has reserves for
792a63884e9SJoonsoo Kim 	 * private mappings.
793a63884e9SJoonsoo Kim 	 */
79467961f9dSMike Kravetz 	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
79567961f9dSMike Kravetz 		/*
79667961f9dSMike Kravetz 		 * Like the shared case above, a hole punch or truncate
79767961f9dSMike Kravetz 		 * could have been performed on the private mapping.
79867961f9dSMike Kravetz 		 * Examine the value of chg to determine if reserves
79967961f9dSMike Kravetz 		 * actually exist or were previously consumed.
80067961f9dSMike Kravetz 		 * Very Subtle - The value of chg comes from a previous
80167961f9dSMike Kravetz 		 * call to vma_needs_reserves().  The reserve map for
80267961f9dSMike Kravetz 		 * private mappings has different (opposite) semantics
80367961f9dSMike Kravetz 		 * than that of shared mappings.  vma_needs_reserves()
80467961f9dSMike Kravetz 		 * has already taken this difference in semantics into
80567961f9dSMike Kravetz 		 * account.  Therefore, the meaning of chg is the same
80667961f9dSMike Kravetz 		 * as in the shared case above.  Code could easily be
80767961f9dSMike Kravetz 		 * combined, but keeping it separate draws attention to
80867961f9dSMike Kravetz 		 * subtle differences.
80967961f9dSMike Kravetz 		 */
81067961f9dSMike Kravetz 		if (chg)
81167961f9dSMike Kravetz 			return false;
81267961f9dSMike Kravetz 		else
813559ec2f8SNicholas Krause 			return true;
81467961f9dSMike Kravetz 	}
815a63884e9SJoonsoo Kim 
816559ec2f8SNicholas Krause 	return false;
817a1e78772SMel Gorman }
818a1e78772SMel Gorman 
819a5516438SAndi Kleen static void enqueue_huge_page(struct hstate *h, struct page *page)
8201da177e4SLinus Torvalds {
8211da177e4SLinus Torvalds 	int nid = page_to_nid(page);
8220edaecfaSAneesh Kumar K.V 	list_move(&page->lru, &h->hugepage_freelists[nid]);
823a5516438SAndi Kleen 	h->free_huge_pages++;
824a5516438SAndi Kleen 	h->free_huge_pages_node[nid]++;
8251da177e4SLinus Torvalds }
8261da177e4SLinus Torvalds 
82794310cbcSAnshuman Khandual static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
828bf50bab2SNaoya Horiguchi {
829bf50bab2SNaoya Horiguchi 	struct page *page;
830bf50bab2SNaoya Horiguchi 
831c8721bbbSNaoya Horiguchi 	list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
832243abd5bSNaoya Horiguchi 		if (!PageHWPoison(page))
833c8721bbbSNaoya Horiguchi 			break;
834c8721bbbSNaoya Horiguchi 	/*
835c8721bbbSNaoya Horiguchi 	 * if a 'non-isolated free hugepage' is not found on the list,
836c8721bbbSNaoya Horiguchi 	 * the allocation fails.
837c8721bbbSNaoya Horiguchi 	 */
838c8721bbbSNaoya Horiguchi 	if (&h->hugepage_freelists[nid] == &page->lru)
839bf50bab2SNaoya Horiguchi 		return NULL;
8400edaecfaSAneesh Kumar K.V 	list_move(&page->lru, &h->hugepage_activelist);
841a9869b83SNaoya Horiguchi 	set_page_refcounted(page);
842bf50bab2SNaoya Horiguchi 	h->free_huge_pages--;
843bf50bab2SNaoya Horiguchi 	h->free_huge_pages_node[nid]--;
844bf50bab2SNaoya Horiguchi 	return page;
845bf50bab2SNaoya Horiguchi }
846bf50bab2SNaoya Horiguchi 
8473e59fcb0SMichal Hocko static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
8483e59fcb0SMichal Hocko 		nodemask_t *nmask)
84994310cbcSAnshuman Khandual {
8503e59fcb0SMichal Hocko 	unsigned int cpuset_mems_cookie;
8513e59fcb0SMichal Hocko 	struct zonelist *zonelist;
8523e59fcb0SMichal Hocko 	struct zone *zone;
8533e59fcb0SMichal Hocko 	struct zoneref *z;
85498fa15f3SAnshuman Khandual 	int node = NUMA_NO_NODE;
8553e59fcb0SMichal Hocko 
8563e59fcb0SMichal Hocko 	zonelist = node_zonelist(nid, gfp_mask);
8573e59fcb0SMichal Hocko 
8583e59fcb0SMichal Hocko retry_cpuset:
8593e59fcb0SMichal Hocko 	cpuset_mems_cookie = read_mems_allowed_begin();
8603e59fcb0SMichal Hocko 	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
86194310cbcSAnshuman Khandual 		struct page *page;
86294310cbcSAnshuman Khandual 
8633e59fcb0SMichal Hocko 		if (!cpuset_zone_allowed(zone, gfp_mask))
8643e59fcb0SMichal Hocko 			continue;
8653e59fcb0SMichal Hocko 		/*
8663e59fcb0SMichal Hocko 		 * no need to ask again on the same node. Pool is node rather than
8673e59fcb0SMichal Hocko 		 * zone aware
8683e59fcb0SMichal Hocko 		 */
8693e59fcb0SMichal Hocko 		if (zone_to_nid(zone) == node)
8703e59fcb0SMichal Hocko 			continue;
8713e59fcb0SMichal Hocko 		node = zone_to_nid(zone);
87294310cbcSAnshuman Khandual 
87394310cbcSAnshuman Khandual 		page = dequeue_huge_page_node_exact(h, node);
87494310cbcSAnshuman Khandual 		if (page)
87594310cbcSAnshuman Khandual 			return page;
87694310cbcSAnshuman Khandual 	}
8773e59fcb0SMichal Hocko 	if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
8783e59fcb0SMichal Hocko 		goto retry_cpuset;
8793e59fcb0SMichal Hocko 
88094310cbcSAnshuman Khandual 	return NULL;
88194310cbcSAnshuman Khandual }
88294310cbcSAnshuman Khandual 
88386cdb465SNaoya Horiguchi /* Movability of hugepages depends on migration support. */
88486cdb465SNaoya Horiguchi static inline gfp_t htlb_alloc_mask(struct hstate *h)
88586cdb465SNaoya Horiguchi {
8867ed2c31dSAnshuman Khandual 	if (hugepage_movable_supported(h))
88786cdb465SNaoya Horiguchi 		return GFP_HIGHUSER_MOVABLE;
88886cdb465SNaoya Horiguchi 	else
88986cdb465SNaoya Horiguchi 		return GFP_HIGHUSER;
89086cdb465SNaoya Horiguchi }
89186cdb465SNaoya Horiguchi 
892a5516438SAndi Kleen static struct page *dequeue_huge_page_vma(struct hstate *h,
893a5516438SAndi Kleen 				struct vm_area_struct *vma,
894af0ed73eSJoonsoo Kim 				unsigned long address, int avoid_reserve,
895af0ed73eSJoonsoo Kim 				long chg)
8961da177e4SLinus Torvalds {
8973e59fcb0SMichal Hocko 	struct page *page;
898480eccf9SLee Schermerhorn 	struct mempolicy *mpol;
89904ec6264SVlastimil Babka 	gfp_t gfp_mask;
9003e59fcb0SMichal Hocko 	nodemask_t *nodemask;
90104ec6264SVlastimil Babka 	int nid;
9021da177e4SLinus Torvalds 
903a1e78772SMel Gorman 	/*
904a1e78772SMel Gorman 	 * A child process with MAP_PRIVATE mappings created by its parent
905a1e78772SMel Gorman 	 * has no page reserves. This check ensures that reservations are
906a1e78772SMel Gorman 	 * not "stolen". The child may still get SIGKILLed
907a1e78772SMel Gorman 	 */
908af0ed73eSJoonsoo Kim 	if (!vma_has_reserves(vma, chg) &&
909a5516438SAndi Kleen 			h->free_huge_pages - h->resv_huge_pages == 0)
910c0ff7453SMiao Xie 		goto err;
911a1e78772SMel Gorman 
91204f2cbe3SMel Gorman 	/* If reserves cannot be used, ensure enough pages are in the pool */
913a5516438SAndi Kleen 	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
9146eab04a8SJustin P. Mattock 		goto err;
91504f2cbe3SMel Gorman 
91604ec6264SVlastimil Babka 	gfp_mask = htlb_alloc_mask(h);
91704ec6264SVlastimil Babka 	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
9183e59fcb0SMichal Hocko 	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
9193e59fcb0SMichal Hocko 	if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
92007443a85SJoonsoo Kim 		SetPagePrivate(page);
921a63884e9SJoonsoo Kim 		h->resv_huge_pages--;
922bf50bab2SNaoya Horiguchi 	}
923cc9a6c87SMel Gorman 
924cc9a6c87SMel Gorman 	mpol_cond_put(mpol);
925cc9a6c87SMel Gorman 	return page;
926cc9a6c87SMel Gorman 
927c0ff7453SMiao Xie err:
928cc9a6c87SMel Gorman 	return NULL;
9291da177e4SLinus Torvalds }
9301da177e4SLinus Torvalds 
9311cac6f2cSLuiz Capitulino /*
9321cac6f2cSLuiz Capitulino  * common helper functions for hstate_next_node_to_{alloc|free}.
9331cac6f2cSLuiz Capitulino  * We may have allocated or freed a huge page based on a different
9341cac6f2cSLuiz Capitulino  * nodes_allowed previously, so h->next_node_to_{alloc|free} might
9351cac6f2cSLuiz Capitulino  * be outside of *nodes_allowed.  Ensure that we use an allowed
9361cac6f2cSLuiz Capitulino  * node for alloc or free.
9371cac6f2cSLuiz Capitulino  */
9381cac6f2cSLuiz Capitulino static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
9391cac6f2cSLuiz Capitulino {
9400edaf86cSAndrew Morton 	nid = next_node_in(nid, *nodes_allowed);
9411cac6f2cSLuiz Capitulino 	VM_BUG_ON(nid >= MAX_NUMNODES);
9421cac6f2cSLuiz Capitulino 
9431cac6f2cSLuiz Capitulino 	return nid;
9441cac6f2cSLuiz Capitulino }
9451cac6f2cSLuiz Capitulino 
9461cac6f2cSLuiz Capitulino static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
9471cac6f2cSLuiz Capitulino {
9481cac6f2cSLuiz Capitulino 	if (!node_isset(nid, *nodes_allowed))
9491cac6f2cSLuiz Capitulino 		nid = next_node_allowed(nid, nodes_allowed);
9501cac6f2cSLuiz Capitulino 	return nid;
9511cac6f2cSLuiz Capitulino }
9521cac6f2cSLuiz Capitulino 
9531cac6f2cSLuiz Capitulino /*
9541cac6f2cSLuiz Capitulino  * returns the previously saved node ["this node"] from which to
9551cac6f2cSLuiz Capitulino  * allocate a persistent huge page for the pool and advance the
9561cac6f2cSLuiz Capitulino  * next node from which to allocate, handling wrap at end of node
9571cac6f2cSLuiz Capitulino  * mask.
9581cac6f2cSLuiz Capitulino  */
9591cac6f2cSLuiz Capitulino static int hstate_next_node_to_alloc(struct hstate *h,
9601cac6f2cSLuiz Capitulino 					nodemask_t *nodes_allowed)
9611cac6f2cSLuiz Capitulino {
9621cac6f2cSLuiz Capitulino 	int nid;
9631cac6f2cSLuiz Capitulino 
9641cac6f2cSLuiz Capitulino 	VM_BUG_ON(!nodes_allowed);
9651cac6f2cSLuiz Capitulino 
9661cac6f2cSLuiz Capitulino 	nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
9671cac6f2cSLuiz Capitulino 	h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
9681cac6f2cSLuiz Capitulino 
9691cac6f2cSLuiz Capitulino 	return nid;
9701cac6f2cSLuiz Capitulino }
9711cac6f2cSLuiz Capitulino 
9721cac6f2cSLuiz Capitulino /*
9731cac6f2cSLuiz Capitulino  * helper for free_pool_huge_page() - return the previously saved
9741cac6f2cSLuiz Capitulino  * node ["this node"] from which to free a huge page.  Advance the
9751cac6f2cSLuiz Capitulino  * next node id whether or not we find a free huge page to free so
9761cac6f2cSLuiz Capitulino  * that the next attempt to free addresses the next node.
9771cac6f2cSLuiz Capitulino  */
9781cac6f2cSLuiz Capitulino static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
9791cac6f2cSLuiz Capitulino {
9801cac6f2cSLuiz Capitulino 	int nid;
9811cac6f2cSLuiz Capitulino 
9821cac6f2cSLuiz Capitulino 	VM_BUG_ON(!nodes_allowed);
9831cac6f2cSLuiz Capitulino 
9841cac6f2cSLuiz Capitulino 	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
9851cac6f2cSLuiz Capitulino 	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
9861cac6f2cSLuiz Capitulino 
9871cac6f2cSLuiz Capitulino 	return nid;
9881cac6f2cSLuiz Capitulino }
9891cac6f2cSLuiz Capitulino 
9901cac6f2cSLuiz Capitulino #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)		\
9911cac6f2cSLuiz Capitulino 	for (nr_nodes = nodes_weight(*mask);				\
9921cac6f2cSLuiz Capitulino 		nr_nodes > 0 &&						\
9931cac6f2cSLuiz Capitulino 		((node = hstate_next_node_to_alloc(hs, mask)) || 1);	\
9941cac6f2cSLuiz Capitulino 		nr_nodes--)
9951cac6f2cSLuiz Capitulino 
9961cac6f2cSLuiz Capitulino #define for_each_node_mask_to_free(hs, nr_nodes, node, mask)		\
9971cac6f2cSLuiz Capitulino 	for (nr_nodes = nodes_weight(*mask);				\
9981cac6f2cSLuiz Capitulino 		nr_nodes > 0 &&						\
9991cac6f2cSLuiz Capitulino 		((node = hstate_next_node_to_free(hs, mask)) || 1);	\
10001cac6f2cSLuiz Capitulino 		nr_nodes--)
10011cac6f2cSLuiz Capitulino 
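/*
 * Usage sketch for the iterators above (simplified from the pool
 * allocation paths later in this file): try each allowed node in turn,
 * starting from the saved next node, until an allocation succeeds.
 * "some_alloc" below is a placeholder, not a real function.
 *
 *	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
 *		page = some_alloc(h, node);
 *		if (page)
 *			break;
 *	}
 *
 * Each iteration advances h->next_nid_to_alloc, so repeated calls
 * round-robin allocations across the allowed nodes.
 */
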
1002e1073d1eSAneesh Kumar K.V #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
1003944d9fecSLuiz Capitulino static void destroy_compound_gigantic_page(struct page *page,
1004d00181b9SKirill A. Shutemov 					unsigned int order)
1005944d9fecSLuiz Capitulino {
1006944d9fecSLuiz Capitulino 	int i;
1007944d9fecSLuiz Capitulino 	int nr_pages = 1 << order;
1008944d9fecSLuiz Capitulino 	struct page *p = page + 1;
1009944d9fecSLuiz Capitulino 
1010c8cc708aSGerald Schaefer 	atomic_set(compound_mapcount_ptr(page), 0);
1011944d9fecSLuiz Capitulino 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
10121d798ca3SKirill A. Shutemov 		clear_compound_head(p);
1013944d9fecSLuiz Capitulino 		set_page_refcounted(p);
1014944d9fecSLuiz Capitulino 	}
1015944d9fecSLuiz Capitulino 
1016944d9fecSLuiz Capitulino 	set_compound_order(page, 0);
1017944d9fecSLuiz Capitulino 	__ClearPageHead(page);
1018944d9fecSLuiz Capitulino }
1019944d9fecSLuiz Capitulino 
1020d00181b9SKirill A. Shutemov static void free_gigantic_page(struct page *page, unsigned int order)
1021944d9fecSLuiz Capitulino {
1022944d9fecSLuiz Capitulino 	free_contig_range(page_to_pfn(page), 1 << order);
1023944d9fecSLuiz Capitulino }
1024944d9fecSLuiz Capitulino 
10254eb0716eSAlexandre Ghiti #ifdef CONFIG_CONTIG_ALLOC
1026d9cc948fSMichal Hocko static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
1027d9cc948fSMichal Hocko 		int nid, nodemask_t *nodemask)
1028944d9fecSLuiz Capitulino {
10295e27a2dfSAnshuman Khandual 	unsigned long nr_pages = 1UL << huge_page_order(h);
1030944d9fecSLuiz Capitulino 
10315e27a2dfSAnshuman Khandual 	return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
1032944d9fecSLuiz Capitulino }
1033944d9fecSLuiz Capitulino 
1034944d9fecSLuiz Capitulino static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
1035d00181b9SKirill A. Shutemov static void prep_compound_gigantic_page(struct page *page, unsigned int order);
10364eb0716eSAlexandre Ghiti #else /* !CONFIG_CONTIG_ALLOC */
10374eb0716eSAlexandre Ghiti static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
10384eb0716eSAlexandre Ghiti 					int nid, nodemask_t *nodemask)
10394eb0716eSAlexandre Ghiti {
10404eb0716eSAlexandre Ghiti 	return NULL;
10414eb0716eSAlexandre Ghiti }
10424eb0716eSAlexandre Ghiti #endif /* CONFIG_CONTIG_ALLOC */
1043944d9fecSLuiz Capitulino 
1044e1073d1eSAneesh Kumar K.V #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
1045d9cc948fSMichal Hocko static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
10464eb0716eSAlexandre Ghiti 					int nid, nodemask_t *nodemask)
10474eb0716eSAlexandre Ghiti {
10484eb0716eSAlexandre Ghiti 	return NULL;
10494eb0716eSAlexandre Ghiti }
1050d00181b9SKirill A. Shutemov static inline void free_gigantic_page(struct page *page, unsigned int order) { }
1051944d9fecSLuiz Capitulino static inline void destroy_compound_gigantic_page(struct page *page,
1052d00181b9SKirill A. Shutemov 						unsigned int order) { }
1053944d9fecSLuiz Capitulino #endif
1054944d9fecSLuiz Capitulino 
1055a5516438SAndi Kleen static void update_and_free_page(struct hstate *h, struct page *page)
10566af2acb6SAdam Litke {
10576af2acb6SAdam Litke 	int i;
1058a5516438SAndi Kleen 
10594eb0716eSAlexandre Ghiti 	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
1060944d9fecSLuiz Capitulino 		return;
106118229df5SAndy Whitcroft 
1062a5516438SAndi Kleen 	h->nr_huge_pages--;
1063a5516438SAndi Kleen 	h->nr_huge_pages_node[page_to_nid(page)]--;
1064a5516438SAndi Kleen 	for (i = 0; i < pages_per_huge_page(h); i++) {
106532f84528SChris Forbes 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
106632f84528SChris Forbes 				1 << PG_referenced | 1 << PG_dirty |
1067a7407a27SLuiz Capitulino 				1 << PG_active | 1 << PG_private |
1068a7407a27SLuiz Capitulino 				1 << PG_writeback);
10696af2acb6SAdam Litke 	}
1070309381feSSasha Levin 	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
1071f1e61557SKirill A. Shutemov 	set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
10726af2acb6SAdam Litke 	set_page_refcounted(page);
1073944d9fecSLuiz Capitulino 	if (hstate_is_gigantic(h)) {
1074944d9fecSLuiz Capitulino 		destroy_compound_gigantic_page(page, huge_page_order(h));
1075944d9fecSLuiz Capitulino 		free_gigantic_page(page, huge_page_order(h));
1076944d9fecSLuiz Capitulino 	} else {
1077a5516438SAndi Kleen 		__free_pages(page, huge_page_order(h));
10786af2acb6SAdam Litke 	}
1079944d9fecSLuiz Capitulino }
10806af2acb6SAdam Litke 
1081e5ff2159SAndi Kleen struct hstate *size_to_hstate(unsigned long size)
1082e5ff2159SAndi Kleen {
1083e5ff2159SAndi Kleen 	struct hstate *h;
1084e5ff2159SAndi Kleen 
1085e5ff2159SAndi Kleen 	for_each_hstate(h) {
1086e5ff2159SAndi Kleen 		if (huge_page_size(h) == size)
1087e5ff2159SAndi Kleen 			return h;
1088e5ff2159SAndi Kleen 	}
1089e5ff2159SAndi Kleen 	return NULL;
1090e5ff2159SAndi Kleen }
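
/*
 * Illustrative example (not in the original source; the 2 MB and 1 GB
 * sizes are assumed values for architectures that support them):
 *
 *	struct hstate *h2m = size_to_hstate(2UL << 20);	- 2 MB hstate or NULL
 *	struct hstate *h1g = size_to_hstate(1UL << 30);	- 1 GB hstate or NULL
 *
 * A NULL return means no hstate of that size was registered at boot.
 */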
1091e5ff2159SAndi Kleen 
1092bcc54222SNaoya Horiguchi /*
1093bcc54222SNaoya Horiguchi  * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
1094bcc54222SNaoya Horiguchi  * to hstate->hugepage_activelist.)
1095bcc54222SNaoya Horiguchi  *
1096bcc54222SNaoya Horiguchi  * This function can be called for tail pages, but never returns true for them.
1097bcc54222SNaoya Horiguchi  */
1098bcc54222SNaoya Horiguchi bool page_huge_active(struct page *page)
1099bcc54222SNaoya Horiguchi {
1100bcc54222SNaoya Horiguchi 	VM_BUG_ON_PAGE(!PageHuge(page), page);
1101bcc54222SNaoya Horiguchi 	return PageHead(page) && PagePrivate(&page[1]);
1102bcc54222SNaoya Horiguchi }
1103bcc54222SNaoya Horiguchi 
1104bcc54222SNaoya Horiguchi /* never called for tail page */
1105bcc54222SNaoya Horiguchi static void set_page_huge_active(struct page *page)
1106bcc54222SNaoya Horiguchi {
1107bcc54222SNaoya Horiguchi 	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
1108bcc54222SNaoya Horiguchi 	SetPagePrivate(&page[1]);
1109bcc54222SNaoya Horiguchi }
1110bcc54222SNaoya Horiguchi 
1111bcc54222SNaoya Horiguchi static void clear_page_huge_active(struct page *page)
1112bcc54222SNaoya Horiguchi {
1113bcc54222SNaoya Horiguchi 	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
1114bcc54222SNaoya Horiguchi 	ClearPagePrivate(&page[1]);
1115bcc54222SNaoya Horiguchi }
1116bcc54222SNaoya Horiguchi 
1117ab5ac90aSMichal Hocko /*
1118ab5ac90aSMichal Hocko  * Internal hugetlb-specific page flag. Do not use it outside of the
1119ab5ac90aSMichal Hocko  * hugetlb code.
1120ab5ac90aSMichal Hocko  */
1121ab5ac90aSMichal Hocko static inline bool PageHugeTemporary(struct page *page)
1122ab5ac90aSMichal Hocko {
1123ab5ac90aSMichal Hocko 	if (!PageHuge(page))
1124ab5ac90aSMichal Hocko 		return false;
1125ab5ac90aSMichal Hocko 
1126ab5ac90aSMichal Hocko 	return (unsigned long)page[2].mapping == -1U;
1127ab5ac90aSMichal Hocko }
1128ab5ac90aSMichal Hocko 
1129ab5ac90aSMichal Hocko static inline void SetPageHugeTemporary(struct page *page)
1130ab5ac90aSMichal Hocko {
1131ab5ac90aSMichal Hocko 	page[2].mapping = (void *)-1U;
1132ab5ac90aSMichal Hocko }
1133ab5ac90aSMichal Hocko 
1134ab5ac90aSMichal Hocko static inline void ClearPageHugeTemporary(struct page *page)
1135ab5ac90aSMichal Hocko {
1136ab5ac90aSMichal Hocko 	page[2].mapping = NULL;
1137ab5ac90aSMichal Hocko }
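
/*
 * Sketch of the intended pattern for the flag above (illustrative, not
 * original code): a "temporary" page is tagged right after allocation,
 * e.g. in alloc_surplus_huge_page() and alloc_migrate_huge_page() below,
 * and the tag is consumed in free_huge_page(), which returns such pages
 * to the buddy allocator instead of the hugetlb pool:
 *
 *	SetPageHugeTemporary(page);
 *	...
 *	put_page(page);		- last reference ends up in free_huge_page()
 */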
1138ab5ac90aSMichal Hocko 
11398f1d26d0SAtsushi Kumagai void free_huge_page(struct page *page)
114027a85ef1SDavid Gibson {
1141a5516438SAndi Kleen 	/*
1142a5516438SAndi Kleen 	 * Can't pass hstate in here because it is called from the
1143a5516438SAndi Kleen 	 * compound page destructor.
1144a5516438SAndi Kleen 	 */
1145e5ff2159SAndi Kleen 	struct hstate *h = page_hstate(page);
11467893d1d5SAdam Litke 	int nid = page_to_nid(page);
114790481622SDavid Gibson 	struct hugepage_subpool *spool =
114890481622SDavid Gibson 		(struct hugepage_subpool *)page_private(page);
114907443a85SJoonsoo Kim 	bool restore_reserve;
115027a85ef1SDavid Gibson 
1151b4330afbSMike Kravetz 	VM_BUG_ON_PAGE(page_count(page), page);
1152b4330afbSMike Kravetz 	VM_BUG_ON_PAGE(page_mapcount(page), page);
11538ace22bcSYongkai Wu 
11548ace22bcSYongkai Wu 	set_page_private(page, 0);
11558ace22bcSYongkai Wu 	page->mapping = NULL;
115607443a85SJoonsoo Kim 	restore_reserve = PagePrivate(page);
115716c794b4SJoonsoo Kim 	ClearPagePrivate(page);
115827a85ef1SDavid Gibson 
11591c5ecae3SMike Kravetz 	/*
11600919e1b6SMike Kravetz 	 * If PagePrivate() was set on page, page allocation consumed a
11610919e1b6SMike Kravetz 	 * reservation.  If the page was associated with a subpool, there
11620919e1b6SMike Kravetz 	 * would have been a page reserved in the subpool before allocation
11630919e1b6SMike Kravetz 	 * via hugepage_subpool_get_pages().  Since we are 'restoring' the
11640919e1b6SMike Kravetz 	 * reservation, do not call hugepage_subpool_put_pages() as this will
11650919e1b6SMike Kravetz 	 * remove the reserved page from the subpool.
11660919e1b6SMike Kravetz 	 */
11670919e1b6SMike Kravetz 	if (!restore_reserve) {
11680919e1b6SMike Kravetz 		/*
11690919e1b6SMike Kravetz 		 * A return code of zero implies that the subpool will be
11700919e1b6SMike Kravetz 		 * under its minimum size if the reservation is not restored
11710919e1b6SMike Kravetz 		 * after the page is freed.  Therefore, force the restore_reserve
11720919e1b6SMike Kravetz 		 * operation.
11731c5ecae3SMike Kravetz 		 */
11741c5ecae3SMike Kravetz 		if (hugepage_subpool_put_pages(spool, 1) == 0)
11751c5ecae3SMike Kravetz 			restore_reserve = true;
11760919e1b6SMike Kravetz 	}
11771c5ecae3SMike Kravetz 
117827a85ef1SDavid Gibson 	spin_lock(&hugetlb_lock);
1179bcc54222SNaoya Horiguchi 	clear_page_huge_active(page);
11806d76dcf4SAneesh Kumar K.V 	hugetlb_cgroup_uncharge_page(hstate_index(h),
11816d76dcf4SAneesh Kumar K.V 				     pages_per_huge_page(h), page);
118207443a85SJoonsoo Kim 	if (restore_reserve)
118307443a85SJoonsoo Kim 		h->resv_huge_pages++;
118407443a85SJoonsoo Kim 
1185ab5ac90aSMichal Hocko 	if (PageHugeTemporary(page)) {
1186ab5ac90aSMichal Hocko 		list_del(&page->lru);
1187ab5ac90aSMichal Hocko 		ClearPageHugeTemporary(page);
1188ab5ac90aSMichal Hocko 		update_and_free_page(h, page);
1189ab5ac90aSMichal Hocko 	} else if (h->surplus_huge_pages_node[nid]) {
11900edaecfaSAneesh Kumar K.V 		/* remove the page from active list */
11910edaecfaSAneesh Kumar K.V 		list_del(&page->lru);
1192a5516438SAndi Kleen 		update_and_free_page(h, page);
1193a5516438SAndi Kleen 		h->surplus_huge_pages--;
1194a5516438SAndi Kleen 		h->surplus_huge_pages_node[nid]--;
11957893d1d5SAdam Litke 	} else {
11965d3a551cSWill Deacon 		arch_clear_hugepage_flags(page);
1197a5516438SAndi Kleen 		enqueue_huge_page(h, page);
11987893d1d5SAdam Litke 	}
119927a85ef1SDavid Gibson 	spin_unlock(&hugetlb_lock);
120027a85ef1SDavid Gibson }
120127a85ef1SDavid Gibson 
1202a5516438SAndi Kleen static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
1203b7ba30c6SAndi Kleen {
12040edaecfaSAneesh Kumar K.V 	INIT_LIST_HEAD(&page->lru);
1205f1e61557SKirill A. Shutemov 	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
1206b7ba30c6SAndi Kleen 	spin_lock(&hugetlb_lock);
12079dd540e2SAneesh Kumar K.V 	set_hugetlb_cgroup(page, NULL);
1208a5516438SAndi Kleen 	h->nr_huge_pages++;
1209a5516438SAndi Kleen 	h->nr_huge_pages_node[nid]++;
1210b7ba30c6SAndi Kleen 	spin_unlock(&hugetlb_lock);
1211b7ba30c6SAndi Kleen }
1212b7ba30c6SAndi Kleen 
1213d00181b9SKirill A. Shutemov static void prep_compound_gigantic_page(struct page *page, unsigned int order)
121420a0307cSWu Fengguang {
121520a0307cSWu Fengguang 	int i;
121620a0307cSWu Fengguang 	int nr_pages = 1 << order;
121720a0307cSWu Fengguang 	struct page *p = page + 1;
121820a0307cSWu Fengguang 
121920a0307cSWu Fengguang 	/* we rely on prep_new_huge_page to set the destructor */
122020a0307cSWu Fengguang 	set_compound_order(page, order);
1221ef5a22beSAndrea Arcangeli 	__ClearPageReserved(page);
1222de09d31dSKirill A. Shutemov 	__SetPageHead(page);
122320a0307cSWu Fengguang 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1224ef5a22beSAndrea Arcangeli 		/*
1225ef5a22beSAndrea Arcangeli 		 * For gigantic hugepages allocated through bootmem at
1226ef5a22beSAndrea Arcangeli 		 * boot, it's safer to be consistent with the not-gigantic
1227ef5a22beSAndrea Arcangeli 		 * hugepages and clear the PG_reserved bit from all tail pages
1228ef5a22beSAndrea Arcangeli 		 * too.  Otherwise drivers using get_user_pages() to access tail
1229ef5a22beSAndrea Arcangeli 		 * pages may get the reference counting wrong if they see
1230ef5a22beSAndrea Arcangeli 		 * PG_reserved set on a tail page (despite the head page not
1231ef5a22beSAndrea Arcangeli 		 * having PG_reserved set).  Enforcing this consistency between
1232ef5a22beSAndrea Arcangeli 		 * head and tail pages allows drivers to optimize away a check
1233ef5a22beSAndrea Arcangeli 		 * on the head page when they need to know if put_page() is needed
1234ef5a22beSAndrea Arcangeli 		 * after get_user_pages().
1235ef5a22beSAndrea Arcangeli 		 */
1236ef5a22beSAndrea Arcangeli 		__ClearPageReserved(p);
123758a84aa9SYouquan Song 		set_page_count(p, 0);
12381d798ca3SKirill A. Shutemov 		set_compound_head(p, page);
123920a0307cSWu Fengguang 	}
1240b4330afbSMike Kravetz 	atomic_set(compound_mapcount_ptr(page), -1);
124120a0307cSWu Fengguang }
124220a0307cSWu Fengguang 
12437795912cSAndrew Morton /*
12447795912cSAndrew Morton  * PageHuge() only returns true for hugetlbfs pages, but not for normal or
12457795912cSAndrew Morton  * transparent huge pages.  See the PageTransHuge() documentation for more
12467795912cSAndrew Morton  * details.
12477795912cSAndrew Morton  */
124820a0307cSWu Fengguang int PageHuge(struct page *page)
124920a0307cSWu Fengguang {
125020a0307cSWu Fengguang 	if (!PageCompound(page))
125120a0307cSWu Fengguang 		return 0;
125220a0307cSWu Fengguang 
125320a0307cSWu Fengguang 	page = compound_head(page);
1254f1e61557SKirill A. Shutemov 	return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
125520a0307cSWu Fengguang }
125643131e14SNaoya Horiguchi EXPORT_SYMBOL_GPL(PageHuge);
125743131e14SNaoya Horiguchi 
125827c73ae7SAndrea Arcangeli /*
125927c73ae7SAndrea Arcangeli  * PageHeadHuge() only returns true for hugetlbfs head page, but not for
126027c73ae7SAndrea Arcangeli  * normal or transparent huge pages.
126127c73ae7SAndrea Arcangeli  */
126227c73ae7SAndrea Arcangeli int PageHeadHuge(struct page *page_head)
126327c73ae7SAndrea Arcangeli {
126427c73ae7SAndrea Arcangeli 	if (!PageHead(page_head))
126527c73ae7SAndrea Arcangeli 		return 0;
126627c73ae7SAndrea Arcangeli 
1267758f66a2SAndrew Morton 	return get_compound_page_dtor(page_head) == free_huge_page;
126827c73ae7SAndrea Arcangeli }
126927c73ae7SAndrea Arcangeli 
127013d60f4bSZhang Yi pgoff_t __basepage_index(struct page *page)
127113d60f4bSZhang Yi {
127213d60f4bSZhang Yi 	struct page *page_head = compound_head(page);
127313d60f4bSZhang Yi 	pgoff_t index = page_index(page_head);
127413d60f4bSZhang Yi 	unsigned long compound_idx;
127513d60f4bSZhang Yi 
127613d60f4bSZhang Yi 	if (!PageHuge(page_head))
127713d60f4bSZhang Yi 		return page_index(page);
127813d60f4bSZhang Yi 
127913d60f4bSZhang Yi 	if (compound_order(page_head) >= MAX_ORDER)
128013d60f4bSZhang Yi 		compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
128113d60f4bSZhang Yi 	else
128213d60f4bSZhang Yi 		compound_idx = page - page_head;
128313d60f4bSZhang Yi 
128413d60f4bSZhang Yi 	return (index << compound_order(page_head)) + compound_idx;
128513d60f4bSZhang Yi }
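
/*
 * Worked example for the index math above (numbers are assumptions for
 * illustration only): with a 2 MB huge page, compound_order(page_head)
 * is 9, so a head page at file index 3 covers base-page indexes
 * 3 << 9 == 1536 through 2047; the subpage at offset 4 within it yields
 * (3 << 9) + 4 == 1540.
 */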
128613d60f4bSZhang Yi 
12870c397daeSMichal Hocko static struct page *alloc_buddy_huge_page(struct hstate *h,
1288f60858f9SMike Kravetz 		gfp_t gfp_mask, int nid, nodemask_t *nmask,
1289f60858f9SMike Kravetz 		nodemask_t *node_alloc_noretry)
12901da177e4SLinus Torvalds {
1291af0fb9dfSMichal Hocko 	int order = huge_page_order(h);
12921da177e4SLinus Torvalds 	struct page *page;
1293f60858f9SMike Kravetz 	bool alloc_try_hard = true;
1294f96efd58SJoe Jin 
1295f60858f9SMike Kravetz 	/*
1296f60858f9SMike Kravetz 	 * By default we always try hard to allocate the page with
1297f60858f9SMike Kravetz 	 * __GFP_RETRY_MAYFAIL flag.  However, if we are allocating pages in
1298f60858f9SMike Kravetz 	 * a loop (to adjust global huge page counts) and previous allocation
1299f60858f9SMike Kravetz 	 * failed, do not continue to try hard on the same node.  Use the
1300f60858f9SMike Kravetz 	 * node_alloc_noretry bitmap to manage this state information.
1301f60858f9SMike Kravetz 	 */
1302f60858f9SMike Kravetz 	if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
1303f60858f9SMike Kravetz 		alloc_try_hard = false;
1304f60858f9SMike Kravetz 	gfp_mask |= __GFP_COMP|__GFP_NOWARN;
1305f60858f9SMike Kravetz 	if (alloc_try_hard)
1306f60858f9SMike Kravetz 		gfp_mask |= __GFP_RETRY_MAYFAIL;
1307af0fb9dfSMichal Hocko 	if (nid == NUMA_NO_NODE)
1308af0fb9dfSMichal Hocko 		nid = numa_mem_id();
1309af0fb9dfSMichal Hocko 	page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
1310af0fb9dfSMichal Hocko 	if (page)
1311af0fb9dfSMichal Hocko 		__count_vm_event(HTLB_BUDDY_PGALLOC);
1312af0fb9dfSMichal Hocko 	else
1313af0fb9dfSMichal Hocko 		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
131463b4613cSNishanth Aravamudan 
1315f60858f9SMike Kravetz 	/*
1316f60858f9SMike Kravetz 	 * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page,
1317f60858f9SMike Kravetz 	 * this indicates an overall state change.  Clear the bit so that we
1318f60858f9SMike Kravetz 	 * resume normal 'try hard' allocations.
1319f60858f9SMike Kravetz 	 */
1320f60858f9SMike Kravetz 	if (node_alloc_noretry && page && !alloc_try_hard)
1321f60858f9SMike Kravetz 		node_clear(nid, *node_alloc_noretry);
1322f60858f9SMike Kravetz 
1323f60858f9SMike Kravetz 	/*
1324f60858f9SMike Kravetz 	 * If we tried hard to get a page but failed, set the bit so that
1325f60858f9SMike Kravetz 	 * subsequent attempts will not try as hard until there is an
1326f60858f9SMike Kravetz 	 * overall state change.
1327f60858f9SMike Kravetz 	 */
1328f60858f9SMike Kravetz 	if (node_alloc_noretry && !page && alloc_try_hard)
1329f60858f9SMike Kravetz 		node_set(nid, *node_alloc_noretry);
1330f60858f9SMike Kravetz 
133163b4613cSNishanth Aravamudan 	return page;
133263b4613cSNishanth Aravamudan }
133363b4613cSNishanth Aravamudan 
1334af0fb9dfSMichal Hocko /*
13350c397daeSMichal Hocko  * Common helper to allocate a fresh hugetlb page. All specific allocators
13360c397daeSMichal Hocko  * should use this function to get new hugetlb pages
13370c397daeSMichal Hocko  * should use this function to get new hugetlb pages.
13380c397daeSMichal Hocko static struct page *alloc_fresh_huge_page(struct hstate *h,
1339f60858f9SMike Kravetz 		gfp_t gfp_mask, int nid, nodemask_t *nmask,
1340f60858f9SMike Kravetz 		nodemask_t *node_alloc_noretry)
13410c397daeSMichal Hocko {
13420c397daeSMichal Hocko 	struct page *page;
13430c397daeSMichal Hocko 
13440c397daeSMichal Hocko 	if (hstate_is_gigantic(h))
13450c397daeSMichal Hocko 		page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
13460c397daeSMichal Hocko 	else
13470c397daeSMichal Hocko 		page = alloc_buddy_huge_page(h, gfp_mask,
1348f60858f9SMike Kravetz 				nid, nmask, node_alloc_noretry);
13490c397daeSMichal Hocko 	if (!page)
13500c397daeSMichal Hocko 		return NULL;
13510c397daeSMichal Hocko 
13520c397daeSMichal Hocko 	if (hstate_is_gigantic(h))
13530c397daeSMichal Hocko 		prep_compound_gigantic_page(page, huge_page_order(h));
13540c397daeSMichal Hocko 	prep_new_huge_page(h, page, page_to_nid(page));
13550c397daeSMichal Hocko 
13560c397daeSMichal Hocko 	return page;
13570c397daeSMichal Hocko }
13580c397daeSMichal Hocko 
13590c397daeSMichal Hocko /*
1360af0fb9dfSMichal Hocko  * Allocates a fresh page to the hugetlb allocator pool in a node-interleaved
1361af0fb9dfSMichal Hocko  * manner.
1362af0fb9dfSMichal Hocko  */
1363f60858f9SMike Kravetz static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
1364f60858f9SMike Kravetz 				nodemask_t *node_alloc_noretry)
1365b2261026SJoonsoo Kim {
1366b2261026SJoonsoo Kim 	struct page *page;
1367b2261026SJoonsoo Kim 	int nr_nodes, node;
1368af0fb9dfSMichal Hocko 	gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
1369b2261026SJoonsoo Kim 
1370b2261026SJoonsoo Kim 	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1371f60858f9SMike Kravetz 		page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
1372f60858f9SMike Kravetz 						node_alloc_noretry);
1373af0fb9dfSMichal Hocko 		if (page)
1374b2261026SJoonsoo Kim 			break;
1375b2261026SJoonsoo Kim 	}
1376b2261026SJoonsoo Kim 
1377af0fb9dfSMichal Hocko 	if (!page)
1378af0fb9dfSMichal Hocko 		return 0;
1379b2261026SJoonsoo Kim 
1380af0fb9dfSMichal Hocko 	put_page(page); /* free it into the hugepage allocator */
1381af0fb9dfSMichal Hocko 
1382af0fb9dfSMichal Hocko 	return 1;
1383b2261026SJoonsoo Kim }
1384b2261026SJoonsoo Kim 
1385e8c5c824SLee Schermerhorn /*
1386e8c5c824SLee Schermerhorn  * Free a huge page from the pool, taken from the next node to free.
1387e8c5c824SLee Schermerhorn  * Attempt to keep persistent huge pages more or less
1388e8c5c824SLee Schermerhorn  * balanced over allowed nodes.
1389e8c5c824SLee Schermerhorn  * Called with hugetlb_lock locked.
1390e8c5c824SLee Schermerhorn  */
13916ae11b27SLee Schermerhorn static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
13926ae11b27SLee Schermerhorn 							 bool acct_surplus)
1393e8c5c824SLee Schermerhorn {
1394b2261026SJoonsoo Kim 	int nr_nodes, node;
1395e8c5c824SLee Schermerhorn 	int ret = 0;
1396e8c5c824SLee Schermerhorn 
1397b2261026SJoonsoo Kim 	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
1398685f3457SLee Schermerhorn 		/*
1399685f3457SLee Schermerhorn 		 * If we're returning unused surplus pages, only examine
1400685f3457SLee Schermerhorn 		 * nodes with surplus pages.
1401685f3457SLee Schermerhorn 		 */
1402b2261026SJoonsoo Kim 		if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
1403b2261026SJoonsoo Kim 		    !list_empty(&h->hugepage_freelists[node])) {
1404e8c5c824SLee Schermerhorn 			struct page *page =
1405b2261026SJoonsoo Kim 				list_entry(h->hugepage_freelists[node].next,
1406e8c5c824SLee Schermerhorn 					  struct page, lru);
1407e8c5c824SLee Schermerhorn 			list_del(&page->lru);
1408e8c5c824SLee Schermerhorn 			h->free_huge_pages--;
1409b2261026SJoonsoo Kim 			h->free_huge_pages_node[node]--;
1410685f3457SLee Schermerhorn 			if (acct_surplus) {
1411685f3457SLee Schermerhorn 				h->surplus_huge_pages--;
1412b2261026SJoonsoo Kim 				h->surplus_huge_pages_node[node]--;
1413685f3457SLee Schermerhorn 			}
1414e8c5c824SLee Schermerhorn 			update_and_free_page(h, page);
1415e8c5c824SLee Schermerhorn 			ret = 1;
14169a76db09SLee Schermerhorn 			break;
1417e8c5c824SLee Schermerhorn 		}
1418b2261026SJoonsoo Kim 	}
1419e8c5c824SLee Schermerhorn 
1420e8c5c824SLee Schermerhorn 	return ret;
1421e8c5c824SLee Schermerhorn }
1422e8c5c824SLee Schermerhorn 
1423c8721bbbSNaoya Horiguchi /*
1424c8721bbbSNaoya Horiguchi  * Dissolve a given free hugepage into free buddy pages. This function does
1425faf53defSNaoya Horiguchi  * nothing for in-use hugepages and non-hugepages.
1426faf53defSNaoya Horiguchi  * This function returns one of the following values:
1427faf53defSNaoya Horiguchi  *
1428faf53defSNaoya Horiguchi  *  -EBUSY: failed to dissolve free hugepages or the hugepage is in-use
1429faf53defSNaoya Horiguchi  *          (allocated or reserved.)
1430faf53defSNaoya Horiguchi  *       0: successfully dissolved free hugepages or the page is not a
1431faf53defSNaoya Horiguchi  *          hugepage (considered as already dissolved)
1432c8721bbbSNaoya Horiguchi  */
1433c3114a84SAnshuman Khandual int dissolve_free_huge_page(struct page *page)
1434c8721bbbSNaoya Horiguchi {
14356bc9b564SNaoya Horiguchi 	int rc = -EBUSY;
1436082d5b6bSGerald Schaefer 
1437faf53defSNaoya Horiguchi 	/* Not to disrupt normal path by vainly holding hugetlb_lock */
1438faf53defSNaoya Horiguchi 	if (!PageHuge(page))
1439faf53defSNaoya Horiguchi 		return 0;
1440faf53defSNaoya Horiguchi 
1441c8721bbbSNaoya Horiguchi 	spin_lock(&hugetlb_lock);
1442faf53defSNaoya Horiguchi 	if (!PageHuge(page)) {
1443faf53defSNaoya Horiguchi 		rc = 0;
1444faf53defSNaoya Horiguchi 		goto out;
1445faf53defSNaoya Horiguchi 	}
1446faf53defSNaoya Horiguchi 
1447faf53defSNaoya Horiguchi 	if (!page_count(page)) {
14482247bb33SGerald Schaefer 		struct page *head = compound_head(page);
14492247bb33SGerald Schaefer 		struct hstate *h = page_hstate(head);
14502247bb33SGerald Schaefer 		int nid = page_to_nid(head);
14516bc9b564SNaoya Horiguchi 		if (h->free_huge_pages - h->resv_huge_pages == 0)
1452082d5b6bSGerald Schaefer 			goto out;
1453c3114a84SAnshuman Khandual 		/*
1454c3114a84SAnshuman Khandual 		 * Move PageHWPoison flag from head page to the raw error page,
1455c3114a84SAnshuman Khandual 		 * which makes any subpages rather than the error page reusable.
1456c3114a84SAnshuman Khandual 		 */
1457c3114a84SAnshuman Khandual 		if (PageHWPoison(head) && page != head) {
1458c3114a84SAnshuman Khandual 			SetPageHWPoison(page);
1459c3114a84SAnshuman Khandual 			ClearPageHWPoison(head);
1460c3114a84SAnshuman Khandual 		}
14612247bb33SGerald Schaefer 		list_del(&head->lru);
1462c8721bbbSNaoya Horiguchi 		h->free_huge_pages--;
1463c8721bbbSNaoya Horiguchi 		h->free_huge_pages_node[nid]--;
1464c1470b33Szhong jiang 		h->max_huge_pages--;
14652247bb33SGerald Schaefer 		update_and_free_page(h, head);
14666bc9b564SNaoya Horiguchi 		rc = 0;
1467c8721bbbSNaoya Horiguchi 	}
1468082d5b6bSGerald Schaefer out:
1469c8721bbbSNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
1470082d5b6bSGerald Schaefer 	return rc;
1471c8721bbbSNaoya Horiguchi }
1472c8721bbbSNaoya Horiguchi 
1473c8721bbbSNaoya Horiguchi /*
1474c8721bbbSNaoya Horiguchi  * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
1475c8721bbbSNaoya Horiguchi  * make specified memory blocks removable from the system.
14762247bb33SGerald Schaefer  * Note that this will dissolve a free gigantic hugepage completely, if any
14772247bb33SGerald Schaefer  * part of it lies within the given range.
1478082d5b6bSGerald Schaefer  * Also note that if dissolve_free_huge_page() returns with an error, all
1479082d5b6bSGerald Schaefer  * free hugepages that were dissolved before that error are lost.
1480c8721bbbSNaoya Horiguchi  */
1481082d5b6bSGerald Schaefer int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
1482c8721bbbSNaoya Horiguchi {
1483c8721bbbSNaoya Horiguchi 	unsigned long pfn;
1484eb03aa00SGerald Schaefer 	struct page *page;
1485082d5b6bSGerald Schaefer 	int rc = 0;
1486c8721bbbSNaoya Horiguchi 
1487d0177639SLi Zhong 	if (!hugepages_supported())
1488082d5b6bSGerald Schaefer 		return rc;
1489d0177639SLi Zhong 
1490eb03aa00SGerald Schaefer 	for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
1491eb03aa00SGerald Schaefer 		page = pfn_to_page(pfn);
1492eb03aa00SGerald Schaefer 		rc = dissolve_free_huge_page(page);
1493eb03aa00SGerald Schaefer 		if (rc)
1494082d5b6bSGerald Schaefer 			break;
1495eb03aa00SGerald Schaefer 	}
1496082d5b6bSGerald Schaefer 
1497082d5b6bSGerald Schaefer 	return rc;
1498c8721bbbSNaoya Horiguchi }
1499c8721bbbSNaoya Horiguchi 
1500ab5ac90aSMichal Hocko /*
1501ab5ac90aSMichal Hocko  * Allocates a fresh surplus page from the page allocator.
1502ab5ac90aSMichal Hocko  */
15030c397daeSMichal Hocko static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
1504aaf14e40SMichal Hocko 		int nid, nodemask_t *nmask)
15057893d1d5SAdam Litke {
15069980d744SMichal Hocko 	struct page *page = NULL;
15077893d1d5SAdam Litke 
1508bae7f4aeSLuiz Capitulino 	if (hstate_is_gigantic(h))
1509aa888a74SAndi Kleen 		return NULL;
1510aa888a74SAndi Kleen 
1511d1c3fb1fSNishanth Aravamudan 	spin_lock(&hugetlb_lock);
15129980d744SMichal Hocko 	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
15139980d744SMichal Hocko 		goto out_unlock;
1514d1c3fb1fSNishanth Aravamudan 	spin_unlock(&hugetlb_lock);
1515d1c3fb1fSNishanth Aravamudan 
1516f60858f9SMike Kravetz 	page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
15179980d744SMichal Hocko 	if (!page)
15180c397daeSMichal Hocko 		return NULL;
1519d1c3fb1fSNishanth Aravamudan 
15207893d1d5SAdam Litke 	spin_lock(&hugetlb_lock);
15219980d744SMichal Hocko 	/*
15229980d744SMichal Hocko 	 * We could have raced with the pool size change.
15239980d744SMichal Hocko 	 * Double check that and simply deallocate the new page
15249980d744SMichal Hocko 	 * if we would end up overcommitting the surpluses. Abuse the
15259980d744SMichal Hocko 	 * temporary page flag to work around the nasty free_huge_page
15269980d744SMichal Hocko 	 * codeflow.
15279980d744SMichal Hocko 	 */
15289980d744SMichal Hocko 	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
15299980d744SMichal Hocko 		SetPageHugeTemporary(page);
15302bf753e6SKai Shen 		spin_unlock(&hugetlb_lock);
15319980d744SMichal Hocko 		put_page(page);
15322bf753e6SKai Shen 		return NULL;
15339980d744SMichal Hocko 	} else {
15349980d744SMichal Hocko 		h->surplus_huge_pages++;
15354704dea3SMichal Hocko 		h->surplus_huge_pages_node[page_to_nid(page)]++;
15367893d1d5SAdam Litke 	}
15379980d744SMichal Hocko 
15389980d744SMichal Hocko out_unlock:
1539d1c3fb1fSNishanth Aravamudan 	spin_unlock(&hugetlb_lock);
15407893d1d5SAdam Litke 
15417893d1d5SAdam Litke 	return page;
15427893d1d5SAdam Litke }
15437893d1d5SAdam Litke 
15449a4e9f3bSAneesh Kumar K.V struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
1545ab5ac90aSMichal Hocko 				     int nid, nodemask_t *nmask)
1546ab5ac90aSMichal Hocko {
1547ab5ac90aSMichal Hocko 	struct page *page;
1548ab5ac90aSMichal Hocko 
1549ab5ac90aSMichal Hocko 	if (hstate_is_gigantic(h))
1550ab5ac90aSMichal Hocko 		return NULL;
1551ab5ac90aSMichal Hocko 
1552f60858f9SMike Kravetz 	page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
1553ab5ac90aSMichal Hocko 	if (!page)
1554ab5ac90aSMichal Hocko 		return NULL;
1555ab5ac90aSMichal Hocko 
1556ab5ac90aSMichal Hocko 	/*
1557ab5ac90aSMichal Hocko 	 * We do not account these pages as surplus because they are only
1558ab5ac90aSMichal Hocko 	 * temporary and will be released properly on the last reference
1559ab5ac90aSMichal Hocko 	 * temporary and will be released properly on the last reference.
1560ab5ac90aSMichal Hocko 	SetPageHugeTemporary(page);
1561ab5ac90aSMichal Hocko 
1562ab5ac90aSMichal Hocko 	return page;
1563ab5ac90aSMichal Hocko }
1564ab5ac90aSMichal Hocko 
1565e4e574b7SAdam Litke /*
1566099730d6SDave Hansen  * Use the VMA's mpolicy to allocate a huge page from the buddy.
1567099730d6SDave Hansen  */
1568e0ec90eeSDave Hansen static
15690c397daeSMichal Hocko struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
1570099730d6SDave Hansen 		struct vm_area_struct *vma, unsigned long addr)
1571099730d6SDave Hansen {
1572aaf14e40SMichal Hocko 	struct page *page;
1573aaf14e40SMichal Hocko 	struct mempolicy *mpol;
1574aaf14e40SMichal Hocko 	gfp_t gfp_mask = htlb_alloc_mask(h);
1575aaf14e40SMichal Hocko 	int nid;
1576aaf14e40SMichal Hocko 	nodemask_t *nodemask;
1577aaf14e40SMichal Hocko 
1578aaf14e40SMichal Hocko 	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
15790c397daeSMichal Hocko 	page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
1580aaf14e40SMichal Hocko 	mpol_cond_put(mpol);
1581aaf14e40SMichal Hocko 
1582aaf14e40SMichal Hocko 	return page;
1583099730d6SDave Hansen }
1584099730d6SDave Hansen 
1585ab5ac90aSMichal Hocko /* page migration callback function */
1586bf50bab2SNaoya Horiguchi struct page *alloc_huge_page_node(struct hstate *h, int nid)
1587bf50bab2SNaoya Horiguchi {
1588aaf14e40SMichal Hocko 	gfp_t gfp_mask = htlb_alloc_mask(h);
15894ef91848SJoonsoo Kim 	struct page *page = NULL;
1590bf50bab2SNaoya Horiguchi 
1591aaf14e40SMichal Hocko 	if (nid != NUMA_NO_NODE)
1592aaf14e40SMichal Hocko 		gfp_mask |= __GFP_THISNODE;
1593aaf14e40SMichal Hocko 
1594bf50bab2SNaoya Horiguchi 	spin_lock(&hugetlb_lock);
15954ef91848SJoonsoo Kim 	if (h->free_huge_pages - h->resv_huge_pages > 0)
15963e59fcb0SMichal Hocko 		page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
1597bf50bab2SNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
1598bf50bab2SNaoya Horiguchi 
159994ae8ba7SAneesh Kumar K.V 	if (!page)
16000c397daeSMichal Hocko 		page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
1601bf50bab2SNaoya Horiguchi 
1602bf50bab2SNaoya Horiguchi 	return page;
1603bf50bab2SNaoya Horiguchi }
1604bf50bab2SNaoya Horiguchi 
1605ab5ac90aSMichal Hocko /* page migration callback function */
16063e59fcb0SMichal Hocko struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
16073e59fcb0SMichal Hocko 		nodemask_t *nmask)
16084db9b2efSMichal Hocko {
1609aaf14e40SMichal Hocko 	gfp_t gfp_mask = htlb_alloc_mask(h);
16104db9b2efSMichal Hocko 
16114db9b2efSMichal Hocko 	spin_lock(&hugetlb_lock);
16124db9b2efSMichal Hocko 	if (h->free_huge_pages - h->resv_huge_pages > 0) {
16133e59fcb0SMichal Hocko 		struct page *page;
16143e59fcb0SMichal Hocko 
16153e59fcb0SMichal Hocko 		page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
16163e59fcb0SMichal Hocko 		if (page) {
16173e59fcb0SMichal Hocko 			spin_unlock(&hugetlb_lock);
16183e59fcb0SMichal Hocko 			return page;
16194db9b2efSMichal Hocko 		}
16204db9b2efSMichal Hocko 	}
16214db9b2efSMichal Hocko 	spin_unlock(&hugetlb_lock);
16224db9b2efSMichal Hocko 
16230c397daeSMichal Hocko 	return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
16244db9b2efSMichal Hocko }
16254db9b2efSMichal Hocko 
1626ebd63723SMichal Hocko /* mempolicy aware migration callback */
1627389c8178SMichal Hocko struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
1628389c8178SMichal Hocko 		unsigned long address)
1629ebd63723SMichal Hocko {
1630ebd63723SMichal Hocko 	struct mempolicy *mpol;
1631ebd63723SMichal Hocko 	nodemask_t *nodemask;
1632ebd63723SMichal Hocko 	struct page *page;
1633ebd63723SMichal Hocko 	gfp_t gfp_mask;
1634ebd63723SMichal Hocko 	int node;
1635ebd63723SMichal Hocko 
1636ebd63723SMichal Hocko 	gfp_mask = htlb_alloc_mask(h);
1637ebd63723SMichal Hocko 	node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
1638ebd63723SMichal Hocko 	page = alloc_huge_page_nodemask(h, node, nodemask);
1639ebd63723SMichal Hocko 	mpol_cond_put(mpol);
1640ebd63723SMichal Hocko 
1641ebd63723SMichal Hocko 	return page;
1642ebd63723SMichal Hocko }
1643ebd63723SMichal Hocko 
1644bf50bab2SNaoya Horiguchi /*
164525985edcSLucas De Marchi  * Increase the hugetlb pool such that it can accommodate a reservation
1646e4e574b7SAdam Litke  * of size 'delta'.
1647e4e574b7SAdam Litke  */
1648a5516438SAndi Kleen static int gather_surplus_pages(struct hstate *h, int delta)
1649e4e574b7SAdam Litke {
1650e4e574b7SAdam Litke 	struct list_head surplus_list;
1651e4e574b7SAdam Litke 	struct page *page, *tmp;
1652e4e574b7SAdam Litke 	int ret, i;
1653e4e574b7SAdam Litke 	int needed, allocated;
165428073b02SHillf Danton 	bool alloc_ok = true;
1655e4e574b7SAdam Litke 
1656a5516438SAndi Kleen 	needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
1657ac09b3a1SAdam Litke 	if (needed <= 0) {
1658a5516438SAndi Kleen 		h->resv_huge_pages += delta;
1659e4e574b7SAdam Litke 		return 0;
1660ac09b3a1SAdam Litke 	}
1661e4e574b7SAdam Litke 
1662e4e574b7SAdam Litke 	allocated = 0;
1663e4e574b7SAdam Litke 	INIT_LIST_HEAD(&surplus_list);
1664e4e574b7SAdam Litke 
1665e4e574b7SAdam Litke 	ret = -ENOMEM;
1666e4e574b7SAdam Litke retry:
1667e4e574b7SAdam Litke 	spin_unlock(&hugetlb_lock);
1668e4e574b7SAdam Litke 	for (i = 0; i < needed; i++) {
16690c397daeSMichal Hocko 		page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
1670aaf14e40SMichal Hocko 				NUMA_NO_NODE, NULL);
167128073b02SHillf Danton 		if (!page) {
167228073b02SHillf Danton 			alloc_ok = false;
167328073b02SHillf Danton 			break;
167428073b02SHillf Danton 		}
1675e4e574b7SAdam Litke 		list_add(&page->lru, &surplus_list);
167669ed779aSDavid Rientjes 		cond_resched();
1677e4e574b7SAdam Litke 	}
167828073b02SHillf Danton 	allocated += i;
1679e4e574b7SAdam Litke 
1680e4e574b7SAdam Litke 	/*
1681e4e574b7SAdam Litke 	 * After retaking hugetlb_lock, we need to recalculate 'needed'
1682e4e574b7SAdam Litke 	 * because either resv_huge_pages or free_huge_pages may have changed.
1683e4e574b7SAdam Litke 	 */
1684e4e574b7SAdam Litke 	spin_lock(&hugetlb_lock);
1685a5516438SAndi Kleen 	needed = (h->resv_huge_pages + delta) -
1686a5516438SAndi Kleen 			(h->free_huge_pages + allocated);
168728073b02SHillf Danton 	if (needed > 0) {
168828073b02SHillf Danton 		if (alloc_ok)
1689e4e574b7SAdam Litke 			goto retry;
169028073b02SHillf Danton 		/*
169128073b02SHillf Danton 		 * We were not able to allocate enough pages to
169228073b02SHillf Danton 		 * satisfy the entire reservation so we free what
169328073b02SHillf Danton 		 * we've allocated so far.
169428073b02SHillf Danton 		 */
169528073b02SHillf Danton 		goto free;
169628073b02SHillf Danton 	}
1697e4e574b7SAdam Litke 	/*
1698e4e574b7SAdam Litke 	 * The surplus_list now contains _at_least_ the number of extra pages
169925985edcSLucas De Marchi 	 * needed to accommodate the reservation.  Add the appropriate number
1700e4e574b7SAdam Litke 	 * of pages to the hugetlb pool and free the extras back to the buddy
1701ac09b3a1SAdam Litke 	 * allocator.  Commit the entire reservation here to prevent another
1702ac09b3a1SAdam Litke 	 * process from stealing the pages as they are added to the pool but
1703ac09b3a1SAdam Litke 	 * before they are reserved.
1704e4e574b7SAdam Litke 	 */
1705e4e574b7SAdam Litke 	needed += allocated;
1706a5516438SAndi Kleen 	h->resv_huge_pages += delta;
1707e4e574b7SAdam Litke 	ret = 0;
1708a9869b83SNaoya Horiguchi 
170919fc3f0aSAdam Litke 	/* Free the needed pages to the hugetlb pool */
171019fc3f0aSAdam Litke 	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
171119fc3f0aSAdam Litke 		if ((--needed) < 0)
171219fc3f0aSAdam Litke 			break;
1713a9869b83SNaoya Horiguchi 		/*
1714a9869b83SNaoya Horiguchi 		 * This page is now managed by the hugetlb allocator and has
1715a9869b83SNaoya Horiguchi 		 * no users -- drop the buddy allocator's reference.
1716a9869b83SNaoya Horiguchi 		 */
1717a9869b83SNaoya Horiguchi 		put_page_testzero(page);
1718309381feSSasha Levin 		VM_BUG_ON_PAGE(page_count(page), page);
1719a5516438SAndi Kleen 		enqueue_huge_page(h, page);
172019fc3f0aSAdam Litke 	}
172128073b02SHillf Danton free:
1722b0365c8dSHillf Danton 	spin_unlock(&hugetlb_lock);
172319fc3f0aSAdam Litke 
172419fc3f0aSAdam Litke 	/* Free unnecessary surplus pages to the buddy allocator */
1725c0d934baSJoonsoo Kim 	list_for_each_entry_safe(page, tmp, &surplus_list, lru)
1726a9869b83SNaoya Horiguchi 		put_page(page);
172719fc3f0aSAdam Litke 	spin_lock(&hugetlb_lock);
1728e4e574b7SAdam Litke 
1729e4e574b7SAdam Litke 	return ret;
1730e4e574b7SAdam Litke }
1731e4e574b7SAdam Litke 
1732e4e574b7SAdam Litke /*
1733e5bbc8a6SMike Kravetz  * This routine has two main purposes:
1734e5bbc8a6SMike Kravetz  * 1) Decrement the reservation count (resv_huge_pages) by the value passed
1735e5bbc8a6SMike Kravetz  *    in unused_resv_pages.  This corresponds to the prior adjustments made
1736e5bbc8a6SMike Kravetz  *    to the associated reservation map.
1737e5bbc8a6SMike Kravetz  * 2) Free any unused surplus pages that may have been allocated to satisfy
1738e5bbc8a6SMike Kravetz  *    the reservation.  As many as unused_resv_pages may be freed.
1739e5bbc8a6SMike Kravetz  *
1740e5bbc8a6SMike Kravetz  * Called with hugetlb_lock held.  However, the lock could be dropped (and
1741e5bbc8a6SMike Kravetz  * reacquired) during calls to cond_resched_lock.  Whenever dropping the lock,
1742e5bbc8a6SMike Kravetz  * we must make sure nobody else can claim pages we are in the process of
1743e5bbc8a6SMike Kravetz  * freeing.  Do this by ensuring resv_huge_pages is always greater than the
1744e5bbc8a6SMike Kravetz  * number of huge pages we plan to free when dropping the lock.
1745e4e574b7SAdam Litke  */
1746a5516438SAndi Kleen static void return_unused_surplus_pages(struct hstate *h,
1747a5516438SAndi Kleen 					unsigned long unused_resv_pages)
1748e4e574b7SAdam Litke {
1749e4e574b7SAdam Litke 	unsigned long nr_pages;
1750e4e574b7SAdam Litke 
1751aa888a74SAndi Kleen 	/* Cannot return gigantic pages currently */
1752bae7f4aeSLuiz Capitulino 	if (hstate_is_gigantic(h))
1753e5bbc8a6SMike Kravetz 		goto out;
1754aa888a74SAndi Kleen 
1755e5bbc8a6SMike Kravetz 	/*
1756e5bbc8a6SMike Kravetz 	 * Part (or even all) of the reservation could have been backed
1757e5bbc8a6SMike Kravetz 	 * by pre-allocated pages. Only free surplus pages.
1758e5bbc8a6SMike Kravetz 	 */
1759a5516438SAndi Kleen 	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
1760e4e574b7SAdam Litke 
1761685f3457SLee Schermerhorn 	/*
1762685f3457SLee Schermerhorn 	 * We want to release as many surplus pages as possible, spread
17639b5e5d0fSLee Schermerhorn 	 * evenly across all nodes with memory. Iterate across these nodes
17649b5e5d0fSLee Schermerhorn 	 * until we can no longer free unreserved surplus pages. This occurs
17659b5e5d0fSLee Schermerhorn 	 * when the nodes with surplus pages have no free pages.
17669b5e5d0fSLee Schermerhorn 	 * free_pool_huge_page() will balance the freed pages across the
17679b5e5d0fSLee Schermerhorn 	 * on-line nodes with memory and will handle the hstate accounting.
1768e5bbc8a6SMike Kravetz 	 *
1769e5bbc8a6SMike Kravetz 	 * Note that we decrement resv_huge_pages as we free the pages.  If
1770e5bbc8a6SMike Kravetz 	 * we drop the lock, resv_huge_pages will still be sufficiently large
1771e5bbc8a6SMike Kravetz 	 * to cover subsequent pages we may free.
1772685f3457SLee Schermerhorn 	 */
1773685f3457SLee Schermerhorn 	while (nr_pages--) {
1774e5bbc8a6SMike Kravetz 		h->resv_huge_pages--;
1775e5bbc8a6SMike Kravetz 		unused_resv_pages--;
17768cebfcd0SLai Jiangshan 		if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
1777e5bbc8a6SMike Kravetz 			goto out;
17787848a4bfSMizuma, Masayoshi 		cond_resched_lock(&hugetlb_lock);
1779e4e574b7SAdam Litke 	}
1780e5bbc8a6SMike Kravetz 
1781e5bbc8a6SMike Kravetz out:
1782e5bbc8a6SMike Kravetz 	/* Fully uncommit the reservation */
1783e5bbc8a6SMike Kravetz 	h->resv_huge_pages -= unused_resv_pages;
1784e4e574b7SAdam Litke }
1785e4e574b7SAdam Litke 
17865e911373SMike Kravetz 
1787c37f9fb1SAndy Whitcroft /*
1788feba16e2SMike Kravetz  * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
17895e911373SMike Kravetz  * are used by the huge page allocation routines to manage reservations.
1790cf3ad20bSMike Kravetz  *
1791cf3ad20bSMike Kravetz  * vma_needs_reservation is called to determine if the huge page at addr
1792cf3ad20bSMike Kravetz  * within the vma has an associated reservation.  If a reservation is
1793cf3ad20bSMike Kravetz  * needed, the value 1 is returned.  The caller is then responsible for
1794cf3ad20bSMike Kravetz  * managing the global reservation and subpool usage counts.  After
1795cf3ad20bSMike Kravetz  * the huge page has been allocated, vma_commit_reservation is called
1796feba16e2SMike Kravetz  * to add the page to the reservation map.  If the page allocation fails,
1797feba16e2SMike Kravetz  * the reservation must be ended instead of committed.  vma_end_reservation
1798feba16e2SMike Kravetz  * is called in such cases.
1799cf3ad20bSMike Kravetz  *
1800cf3ad20bSMike Kravetz  * In the normal case, vma_commit_reservation returns the same value
1801cf3ad20bSMike Kravetz  * as the preceding vma_needs_reservation call.  The only time this
1802cf3ad20bSMike Kravetz  * is not the case is if a reserve map was changed between calls.  It
1803cf3ad20bSMike Kravetz  * is the responsibility of the caller to notice the difference and
1804cf3ad20bSMike Kravetz  * take appropriate action.
180596b96a96SMike Kravetz  *
180696b96a96SMike Kravetz  * vma_add_reservation is used in error paths where a reservation must
180796b96a96SMike Kravetz  * be restored when a newly allocated huge page must be freed.  It is
180896b96a96SMike Kravetz  * to be called after calling vma_needs_reservation to determine if a
180996b96a96SMike Kravetz  * reservation exists.
1810c37f9fb1SAndy Whitcroft  */
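
/*
 * Minimal call-sequence sketch of the API described above (illustrative
 * only; alloc_huge_page() below is the real consumer):
 *
 *	if (vma_needs_reservation(h, vma, addr) < 0)
 *		return ERR_PTR(-ENOMEM);
 *	page = ...allocate and charge...;
 *	if (!page) {
 *		vma_end_reservation(h, vma, addr);
 *		return ERR_PTR(-ENOSPC);
 *	}
 *	vma_commit_reservation(h, vma, addr);
 */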
18115e911373SMike Kravetz enum vma_resv_mode {
18125e911373SMike Kravetz 	VMA_NEEDS_RESV,
18135e911373SMike Kravetz 	VMA_COMMIT_RESV,
1814feba16e2SMike Kravetz 	VMA_END_RESV,
181596b96a96SMike Kravetz 	VMA_ADD_RESV,
18165e911373SMike Kravetz };
1817cf3ad20bSMike Kravetz static long __vma_reservation_common(struct hstate *h,
1818cf3ad20bSMike Kravetz 				struct vm_area_struct *vma, unsigned long addr,
18195e911373SMike Kravetz 				enum vma_resv_mode mode)
1820c37f9fb1SAndy Whitcroft {
18214e35f483SJoonsoo Kim 	struct resv_map *resv;
18224e35f483SJoonsoo Kim 	pgoff_t idx;
1823cf3ad20bSMike Kravetz 	long ret;
1824c37f9fb1SAndy Whitcroft 
18254e35f483SJoonsoo Kim 	resv = vma_resv_map(vma);
18264e35f483SJoonsoo Kim 	if (!resv)
1827c37f9fb1SAndy Whitcroft 		return 1;
1828c37f9fb1SAndy Whitcroft 
18294e35f483SJoonsoo Kim 	idx = vma_hugecache_offset(h, vma, addr);
18305e911373SMike Kravetz 	switch (mode) {
18315e911373SMike Kravetz 	case VMA_NEEDS_RESV:
1832cf3ad20bSMike Kravetz 		ret = region_chg(resv, idx, idx + 1);
18335e911373SMike Kravetz 		break;
18345e911373SMike Kravetz 	case VMA_COMMIT_RESV:
18355e911373SMike Kravetz 		ret = region_add(resv, idx, idx + 1);
18365e911373SMike Kravetz 		break;
1837feba16e2SMike Kravetz 	case VMA_END_RESV:
18385e911373SMike Kravetz 		region_abort(resv, idx, idx + 1);
18395e911373SMike Kravetz 		ret = 0;
18405e911373SMike Kravetz 		break;
184196b96a96SMike Kravetz 	case VMA_ADD_RESV:
184296b96a96SMike Kravetz 		if (vma->vm_flags & VM_MAYSHARE)
184396b96a96SMike Kravetz 			ret = region_add(resv, idx, idx + 1);
184496b96a96SMike Kravetz 		else {
184596b96a96SMike Kravetz 			region_abort(resv, idx, idx + 1);
184696b96a96SMike Kravetz 			ret = region_del(resv, idx, idx + 1);
184796b96a96SMike Kravetz 		}
184896b96a96SMike Kravetz 		break;
18495e911373SMike Kravetz 	default:
18505e911373SMike Kravetz 		BUG();
18515e911373SMike Kravetz 	}
185284afd99bSAndy Whitcroft 
18534e35f483SJoonsoo Kim 	if (vma->vm_flags & VM_MAYSHARE)
1854cf3ad20bSMike Kravetz 		return ret;
185567961f9dSMike Kravetz 	else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
185667961f9dSMike Kravetz 		/*
185767961f9dSMike Kravetz 		 * In most cases, reserves always exist for private mappings.
185867961f9dSMike Kravetz 		 * However, a file associated with the mapping could have been
185967961f9dSMike Kravetz 		 * hole punched or truncated after reserves were consumed.
186067961f9dSMike Kravetz 		 * A subsequent fault on such a range will then not use reserves.
186167961f9dSMike Kravetz 		 * Subtle - The reserve map for private mappings has the
186267961f9dSMike Kravetz 		 * opposite meaning than that of shared mappings.  If NO
186367961f9dSMike Kravetz 		 * entry is in the reserve map, it means a reservation exists.
186467961f9dSMike Kravetz 		 * If an entry exists in the reserve map, it means the
186567961f9dSMike Kravetz 		 * reservation has already been consumed.  As a result, the
186667961f9dSMike Kravetz 		 * return value of this routine is the opposite of the
186767961f9dSMike Kravetz 		 * value returned from reserve map manipulation routines above.
186867961f9dSMike Kravetz 		 */
186967961f9dSMike Kravetz 		if (ret)
187067961f9dSMike Kravetz 			return 0;
187167961f9dSMike Kravetz 		else
187267961f9dSMike Kravetz 			return 1;
187367961f9dSMike Kravetz 	}
18744e35f483SJoonsoo Kim 	else
1875cf3ad20bSMike Kravetz 		return ret < 0 ? ret : 0;
187684afd99bSAndy Whitcroft }
1877cf3ad20bSMike Kravetz 
1878cf3ad20bSMike Kravetz static long vma_needs_reservation(struct hstate *h,
1879a5516438SAndi Kleen 			struct vm_area_struct *vma, unsigned long addr)
1880c37f9fb1SAndy Whitcroft {
18815e911373SMike Kravetz 	return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
1882cf3ad20bSMike Kravetz }
1883c37f9fb1SAndy Whitcroft 
1884cf3ad20bSMike Kravetz static long vma_commit_reservation(struct hstate *h,
1885cf3ad20bSMike Kravetz 			struct vm_area_struct *vma, unsigned long addr)
1886cf3ad20bSMike Kravetz {
18875e911373SMike Kravetz 	return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
18885e911373SMike Kravetz }
18895e911373SMike Kravetz 
1890feba16e2SMike Kravetz static void vma_end_reservation(struct hstate *h,
18915e911373SMike Kravetz 			struct vm_area_struct *vma, unsigned long addr)
18925e911373SMike Kravetz {
1893feba16e2SMike Kravetz 	(void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
1894c37f9fb1SAndy Whitcroft }
1895c37f9fb1SAndy Whitcroft 
189696b96a96SMike Kravetz static long vma_add_reservation(struct hstate *h,
189796b96a96SMike Kravetz 			struct vm_area_struct *vma, unsigned long addr)
189896b96a96SMike Kravetz {
189996b96a96SMike Kravetz 	return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
190096b96a96SMike Kravetz }
190196b96a96SMike Kravetz 
190296b96a96SMike Kravetz /*
190396b96a96SMike Kravetz  * This routine is called to restore a reservation on error paths.  In the
190496b96a96SMike Kravetz  * specific error paths, a huge page was allocated (via alloc_huge_page)
190596b96a96SMike Kravetz  * and is about to be freed.  If a reservation for the page existed,
190696b96a96SMike Kravetz  * alloc_huge_page would have consumed the reservation and set PagePrivate
190796b96a96SMike Kravetz  * in the newly allocated page.  When the page is freed via free_huge_page,
190896b96a96SMike Kravetz  * the global reservation count will be incremented if PagePrivate is set.
190996b96a96SMike Kravetz  * However, free_huge_page can not adjust the reserve map.  Adjust the
191096b96a96SMike Kravetz  * reserve map here to be consistent with global reserve count adjustments
191196b96a96SMike Kravetz  * to be made by free_huge_page.
191296b96a96SMike Kravetz  */
191396b96a96SMike Kravetz static void restore_reserve_on_error(struct hstate *h,
191496b96a96SMike Kravetz 			struct vm_area_struct *vma, unsigned long address,
191596b96a96SMike Kravetz 			struct page *page)
191696b96a96SMike Kravetz {
191796b96a96SMike Kravetz 	if (unlikely(PagePrivate(page))) {
191896b96a96SMike Kravetz 		long rc = vma_needs_reservation(h, vma, address);
191996b96a96SMike Kravetz 
192096b96a96SMike Kravetz 		if (unlikely(rc < 0)) {
192196b96a96SMike Kravetz 			/*
192296b96a96SMike Kravetz 			 * Rare out of memory condition in reserve map
192396b96a96SMike Kravetz 			 * manipulation.  Clear PagePrivate so that
192496b96a96SMike Kravetz 			 * global reserve count will not be incremented
192596b96a96SMike Kravetz 			 * by free_huge_page.  This will make it appear
192696b96a96SMike Kravetz 			 * as though the reservation for this page was
192796b96a96SMike Kravetz 			 * consumed.  This may prevent the task from
192896b96a96SMike Kravetz 			 * faulting in the page at a later time.  This
192996b96a96SMike Kravetz 			 * is better than inconsistent global huge page
193096b96a96SMike Kravetz 			 * accounting of reserve counts.
193196b96a96SMike Kravetz 			 */
193296b96a96SMike Kravetz 			ClearPagePrivate(page);
193396b96a96SMike Kravetz 		} else if (rc) {
193496b96a96SMike Kravetz 			rc = vma_add_reservation(h, vma, address);
193596b96a96SMike Kravetz 			if (unlikely(rc < 0))
193696b96a96SMike Kravetz 				/*
193796b96a96SMike Kravetz 				 * See above comment about rare out of
193896b96a96SMike Kravetz 				 * memory condition.
193996b96a96SMike Kravetz 				 */
194096b96a96SMike Kravetz 				ClearPagePrivate(page);
194196b96a96SMike Kravetz 		} else
194296b96a96SMike Kravetz 			vma_end_reservation(h, vma, address);
194396b96a96SMike Kravetz 	}
194496b96a96SMike Kravetz }
194596b96a96SMike Kravetz 
194670c3547eSMike Kravetz struct page *alloc_huge_page(struct vm_area_struct *vma,
194704f2cbe3SMel Gorman 				    unsigned long addr, int avoid_reserve)
1948348ea204SAdam Litke {
194990481622SDavid Gibson 	struct hugepage_subpool *spool = subpool_vma(vma);
1950a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
1951348ea204SAdam Litke 	struct page *page;
1952d85f69b0SMike Kravetz 	long map_chg, map_commit;
1953d85f69b0SMike Kravetz 	long gbl_chg;
19546d76dcf4SAneesh Kumar K.V 	int ret, idx;
19556d76dcf4SAneesh Kumar K.V 	struct hugetlb_cgroup *h_cg;
19562fc39cecSAdam Litke 
19576d76dcf4SAneesh Kumar K.V 	idx = hstate_index(h);
1958a1e78772SMel Gorman 	/*
1959d85f69b0SMike Kravetz 	 * Examine the region/reserve map to determine if the process
1960d85f69b0SMike Kravetz 	 * has a reservation for the page to be allocated.  A return
1961d85f69b0SMike Kravetz 	 * code of zero indicates a reservation exists (no change).
1962a1e78772SMel Gorman 	 */
1963d85f69b0SMike Kravetz 	map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
1964d85f69b0SMike Kravetz 	if (map_chg < 0)
196576dcee75SAneesh Kumar K.V 		return ERR_PTR(-ENOMEM);
1966d85f69b0SMike Kravetz 
1967d85f69b0SMike Kravetz 	/*
1968d85f69b0SMike Kravetz 	 * Processes that did not create the mapping will have no
1969d85f69b0SMike Kravetz 	 * reserves as indicated by the region/reserve map. Check
1970d85f69b0SMike Kravetz 	 * that the allocation will not exceed the subpool limit.
1971d85f69b0SMike Kravetz 	 * Allocations for MAP_NORESERVE mappings also need to be
1972d85f69b0SMike Kravetz 	 * checked against any subpool limit.
1973d85f69b0SMike Kravetz 	 */
1974d85f69b0SMike Kravetz 	if (map_chg || avoid_reserve) {
1975d85f69b0SMike Kravetz 		gbl_chg = hugepage_subpool_get_pages(spool, 1);
1976d85f69b0SMike Kravetz 		if (gbl_chg < 0) {
1977feba16e2SMike Kravetz 			vma_end_reservation(h, vma, addr);
197876dcee75SAneesh Kumar K.V 			return ERR_PTR(-ENOSPC);
19795e911373SMike Kravetz 		}
198090d8b7e6SAdam Litke 
1981d85f69b0SMike Kravetz 		/*
1982d85f69b0SMike Kravetz 		 * Even though there was no reservation in the region/reserve
1983d85f69b0SMike Kravetz 		 * map, there could be reservations associated with the
1984d85f69b0SMike Kravetz 		 * subpool that can be used.  This would be indicated if the
1985d85f69b0SMike Kravetz 		 * return value of hugepage_subpool_get_pages() is zero.
1986d85f69b0SMike Kravetz 		 * However, if avoid_reserve is specified we still avoid even
1987d85f69b0SMike Kravetz 		 * the subpool reservations.
1988d85f69b0SMike Kravetz 		 */
1989d85f69b0SMike Kravetz 		if (avoid_reserve)
1990d85f69b0SMike Kravetz 			gbl_chg = 1;
1991d85f69b0SMike Kravetz 	}
1992d85f69b0SMike Kravetz 
19936d76dcf4SAneesh Kumar K.V 	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
19948f34af6fSJianyu Zhan 	if (ret)
19958f34af6fSJianyu Zhan 		goto out_subpool_put;
19968f34af6fSJianyu Zhan 
1997a1e78772SMel Gorman 	spin_lock(&hugetlb_lock);
1998d85f69b0SMike Kravetz 	/*
1999d85f69b0SMike Kravetz 	 * gbl_chg is passed to indicate whether or not a page must be taken
2000d85f69b0SMike Kravetz 	 * from the global free pool (global change).  gbl_chg == 0 indicates
2001d85f69b0SMike Kravetz 	 * a reservation exists for the allocation.
2002d85f69b0SMike Kravetz 	 */
2003d85f69b0SMike Kravetz 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
200481a6fcaeSJoonsoo Kim 	if (!page) {
200594ae8ba7SAneesh Kumar K.V 		spin_unlock(&hugetlb_lock);
20060c397daeSMichal Hocko 		page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
20078f34af6fSJianyu Zhan 		if (!page)
20088f34af6fSJianyu Zhan 			goto out_uncharge_cgroup;
2009a88c7695SNaoya Horiguchi 		if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
2010a88c7695SNaoya Horiguchi 			SetPagePrivate(page);
2011a88c7695SNaoya Horiguchi 			h->resv_huge_pages--;
2012a88c7695SNaoya Horiguchi 		}
201379dbb236SAneesh Kumar K.V 		spin_lock(&hugetlb_lock);
201479dbb236SAneesh Kumar K.V 		list_move(&page->lru, &h->hugepage_activelist);
201581a6fcaeSJoonsoo Kim 		/* Fall through */
2016a1e78772SMel Gorman 	}
201781a6fcaeSJoonsoo Kim 	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
201881a6fcaeSJoonsoo Kim 	spin_unlock(&hugetlb_lock);
2019a1e78772SMel Gorman 
202090481622SDavid Gibson 	set_page_private(page, (unsigned long)spool);
2021a1e78772SMel Gorman 
2022d85f69b0SMike Kravetz 	map_commit = vma_commit_reservation(h, vma, addr);
2023d85f69b0SMike Kravetz 	if (unlikely(map_chg > map_commit)) {
202433039678SMike Kravetz 		/*
202533039678SMike Kravetz 		 * The page was added to the reservation map between
202633039678SMike Kravetz 		 * vma_needs_reservation and vma_commit_reservation.
202733039678SMike Kravetz 		 * This indicates a race with hugetlb_reserve_pages.
202833039678SMike Kravetz 		 * Adjust for the subpool count incremented above AND
202933039678SMike Kravetz 		 * in hugetlb_reserve_pages for the same page.  Also,
203033039678SMike Kravetz 		 * the reservation count added in hugetlb_reserve_pages
203133039678SMike Kravetz 		 * no longer applies.
203233039678SMike Kravetz 		 */
203333039678SMike Kravetz 		long rsv_adjust;
203433039678SMike Kravetz 
203533039678SMike Kravetz 		rsv_adjust = hugepage_subpool_put_pages(spool, 1);
203633039678SMike Kravetz 		hugetlb_acct_memory(h, -rsv_adjust);
203733039678SMike Kravetz 	}
20387893d1d5SAdam Litke 	return page;
20398f34af6fSJianyu Zhan 
20408f34af6fSJianyu Zhan out_uncharge_cgroup:
20418f34af6fSJianyu Zhan 	hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
20428f34af6fSJianyu Zhan out_subpool_put:
2043d85f69b0SMike Kravetz 	if (map_chg || avoid_reserve)
20448f34af6fSJianyu Zhan 		hugepage_subpool_put_pages(spool, 1);
2045feba16e2SMike Kravetz 	vma_end_reservation(h, vma, addr);
20468f34af6fSJianyu Zhan 	return ERR_PTR(-ENOSPC);
2047b45b5bd6SDavid Gibson }
2048b45b5bd6SDavid Gibson 
2049e24a1307SAneesh Kumar K.V int alloc_bootmem_huge_page(struct hstate *h)
2050e24a1307SAneesh Kumar K.V 	__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
2051e24a1307SAneesh Kumar K.V int __alloc_bootmem_huge_page(struct hstate *h)
2052aa888a74SAndi Kleen {
2053aa888a74SAndi Kleen 	struct huge_bootmem_page *m;
2054b2261026SJoonsoo Kim 	int nr_nodes, node;
2055aa888a74SAndi Kleen 
2056b2261026SJoonsoo Kim 	for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
2057aa888a74SAndi Kleen 		void *addr;
2058aa888a74SAndi Kleen 
2059eb31d559SMike Rapoport 		addr = memblock_alloc_try_nid_raw(
20608b89a116SGrygorii Strashko 				huge_page_size(h), huge_page_size(h),
206197ad1087SMike Rapoport 				0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
2062aa888a74SAndi Kleen 		if (addr) {
2063aa888a74SAndi Kleen 			/*
2064aa888a74SAndi Kleen 			 * Use the beginning of the huge page to store the
2065aa888a74SAndi Kleen 			 * huge_bootmem_page struct (until gather_bootmem
2066aa888a74SAndi Kleen 			 * puts them into the mem_map).
2067aa888a74SAndi Kleen 			 */
2068aa888a74SAndi Kleen 			m = addr;
2069aa888a74SAndi Kleen 			goto found;
2070aa888a74SAndi Kleen 		}
2071aa888a74SAndi Kleen 	}
2072aa888a74SAndi Kleen 	return 0;
2073aa888a74SAndi Kleen 
2074aa888a74SAndi Kleen found:
2075df994eadSLuiz Capitulino 	BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
2076aa888a74SAndi Kleen 	/* Put them into a private list first because mem_map is not up yet */
2077330d6e48SCannon Matthews 	INIT_LIST_HEAD(&m->list);
2078aa888a74SAndi Kleen 	list_add(&m->list, &huge_boot_pages);
2079aa888a74SAndi Kleen 	m->hstate = h;
2080aa888a74SAndi Kleen 	return 1;
2081aa888a74SAndi Kleen }
2082aa888a74SAndi Kleen 
2083d00181b9SKirill A. Shutemov static void __init prep_compound_huge_page(struct page *page,
2084d00181b9SKirill A. Shutemov 		unsigned int order)
208518229df5SAndy Whitcroft {
208618229df5SAndy Whitcroft 	if (unlikely(order > (MAX_ORDER - 1)))
208718229df5SAndy Whitcroft 		prep_compound_gigantic_page(page, order);
208818229df5SAndy Whitcroft 	else
208918229df5SAndy Whitcroft 		prep_compound_page(page, order);
209018229df5SAndy Whitcroft }
209118229df5SAndy Whitcroft 
2092aa888a74SAndi Kleen /* Put bootmem huge pages into the standard lists after mem_map is up */
2093aa888a74SAndi Kleen static void __init gather_bootmem_prealloc(void)
2094aa888a74SAndi Kleen {
2095aa888a74SAndi Kleen 	struct huge_bootmem_page *m;
2096aa888a74SAndi Kleen 
2097aa888a74SAndi Kleen 	list_for_each_entry(m, &huge_boot_pages, list) {
209840d18ebfSMike Kravetz 		struct page *page = virt_to_page(m);
2099aa888a74SAndi Kleen 		struct hstate *h = m->hstate;
2100ee8f248dSBecky Bruce 
2101aa888a74SAndi Kleen 		WARN_ON(page_count(page) != 1);
210218229df5SAndy Whitcroft 		prep_compound_huge_page(page, h->order);
2103ef5a22beSAndrea Arcangeli 		WARN_ON(PageReserved(page));
2104aa888a74SAndi Kleen 		prep_new_huge_page(h, page, page_to_nid(page));
2105af0fb9dfSMichal Hocko 		put_page(page); /* free it into the hugepage allocator */
2106af0fb9dfSMichal Hocko 
2107b0320c7bSRafael Aquini 		/*
2108b0320c7bSRafael Aquini 		 * If we had gigantic hugepages allocated at boot time, we need
2109b0320c7bSRafael Aquini 		 * to restore the 'stolen' pages to totalram_pages in order to
2110b0320c7bSRafael Aquini 		 * fix confusing memory reports from free(1) and other
2111b0320c7bSRafael Aquini 		 * side effects, such as CommitLimit going negative.
2112b0320c7bSRafael Aquini 		 */
2113bae7f4aeSLuiz Capitulino 		if (hstate_is_gigantic(h))
21143dcc0571SJiang Liu 			adjust_managed_page_count(page, 1 << h->order);
2115520495feSCannon Matthews 		cond_resched();
2116aa888a74SAndi Kleen 	}
2117aa888a74SAndi Kleen }
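
/*
 * Worked example for the adjust_managed_page_count() call above: with
 * 4 KB base pages, a single 1 GB gigantic page has order 18, so
 * 1 << h->order == 262144 base pages are returned to totalram_pages.
 */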
2118aa888a74SAndi Kleen 
21198faa8b07SAndi Kleen static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
21201da177e4SLinus Torvalds {
21211da177e4SLinus Torvalds 	unsigned long i;
2122f60858f9SMike Kravetz 	nodemask_t *node_alloc_noretry;
2123f60858f9SMike Kravetz 
2124f60858f9SMike Kravetz 	if (!hstate_is_gigantic(h)) {
2125f60858f9SMike Kravetz 		/*
2126f60858f9SMike Kravetz 		 * Bit mask controlling how hard we retry per-node allocations.
2127f60858f9SMike Kravetz 		 * Ignore errors as lower level routines can deal with
2128f60858f9SMike Kravetz 		 * node_alloc_noretry == NULL.  If this kmalloc fails at boot
2129f60858f9SMike Kravetz 		 * time, we are likely in bigger trouble.
2130f60858f9SMike Kravetz 		 */
2131f60858f9SMike Kravetz 		node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
2132f60858f9SMike Kravetz 						GFP_KERNEL);
2133f60858f9SMike Kravetz 	} else {
2134f60858f9SMike Kravetz 		/* allocations done at boot time */
2135f60858f9SMike Kravetz 		node_alloc_noretry = NULL;
2136f60858f9SMike Kravetz 	}
2137f60858f9SMike Kravetz 
2138f60858f9SMike Kravetz 	/* bit mask controlling how hard we retry per-node allocations */
2139f60858f9SMike Kravetz 	if (node_alloc_noretry)
2140f60858f9SMike Kravetz 		nodes_clear(*node_alloc_noretry);
21411da177e4SLinus Torvalds 
2142e5ff2159SAndi Kleen 	for (i = 0; i < h->max_huge_pages; ++i) {
2143bae7f4aeSLuiz Capitulino 		if (hstate_is_gigantic(h)) {
2144aa888a74SAndi Kleen 			if (!alloc_bootmem_huge_page(h))
2145aa888a74SAndi Kleen 				break;
21460c397daeSMichal Hocko 		} else if (!alloc_pool_huge_page(h,
2147f60858f9SMike Kravetz 					 &node_states[N_MEMORY],
2148f60858f9SMike Kravetz 					 node_alloc_noretry))
21491da177e4SLinus Torvalds 			break;
215069ed779aSDavid Rientjes 		cond_resched();
21511da177e4SLinus Torvalds 	}
2152d715cf80SLiam R. Howlett 	if (i < h->max_huge_pages) {
2153d715cf80SLiam R. Howlett 		char buf[32];
2154d715cf80SLiam R. Howlett 
2155c6247f72SMatthew Wilcox 		string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
2156d715cf80SLiam R. Howlett 		pr_warn("HugeTLB: allocating %lu of page size %s failed.  Only allocated %lu hugepages.\n",
2157d715cf80SLiam R. Howlett 			h->max_huge_pages, buf, i);
21588faa8b07SAndi Kleen 		h->max_huge_pages = i;
2159e5ff2159SAndi Kleen 	}
2160f60858f9SMike Kravetz 
2161f60858f9SMike Kravetz 	kfree(node_alloc_noretry);
2162d715cf80SLiam R. Howlett }
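
/*
 * If the pool cannot be fully populated, the warning above might read
 * roughly as follows (the size string comes from string_get_size(), the
 * numbers are illustrative):
 *
 *	HugeTLB: allocating 512 of page size 2.00 MiB failed.  Only allocated 384 hugepages.
 */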
2163e5ff2159SAndi Kleen 
2164e5ff2159SAndi Kleen static void __init hugetlb_init_hstates(void)
2165e5ff2159SAndi Kleen {
2166e5ff2159SAndi Kleen 	struct hstate *h;
2167e5ff2159SAndi Kleen 
2168e5ff2159SAndi Kleen 	for_each_hstate(h) {
2169641844f5SNaoya Horiguchi 		if (minimum_order > huge_page_order(h))
2170641844f5SNaoya Horiguchi 			minimum_order = huge_page_order(h);
2171641844f5SNaoya Horiguchi 
21728faa8b07SAndi Kleen 		/* oversize hugepages were init'ed in early boot */
2173bae7f4aeSLuiz Capitulino 		if (!hstate_is_gigantic(h))
21748faa8b07SAndi Kleen 			hugetlb_hstate_alloc_pages(h);
2175e5ff2159SAndi Kleen 	}
2176641844f5SNaoya Horiguchi 	VM_BUG_ON(minimum_order == UINT_MAX);
2177e5ff2159SAndi Kleen }
2178e5ff2159SAndi Kleen 
2179e5ff2159SAndi Kleen static void __init report_hugepages(void)
2180e5ff2159SAndi Kleen {
2181e5ff2159SAndi Kleen 	struct hstate *h;
2182e5ff2159SAndi Kleen 
2183e5ff2159SAndi Kleen 	for_each_hstate(h) {
21844abd32dbSAndi Kleen 		char buf[32];
2185c6247f72SMatthew Wilcox 
2186c6247f72SMatthew Wilcox 		string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
2187ffb22af5SAndrew Morton 		pr_info("HugeTLB registered %s page size, pre-allocated %lu pages\n",
2188c6247f72SMatthew Wilcox 			buf, h->free_huge_pages);
2189e5ff2159SAndi Kleen 	}
2190e5ff2159SAndi Kleen }
2191e5ff2159SAndi Kleen 
21921da177e4SLinus Torvalds #ifdef CONFIG_HIGHMEM
21936ae11b27SLee Schermerhorn static void try_to_free_low(struct hstate *h, unsigned long count,
21946ae11b27SLee Schermerhorn 						nodemask_t *nodes_allowed)
21951da177e4SLinus Torvalds {
21964415cc8dSChristoph Lameter 	int i;
21974415cc8dSChristoph Lameter 
2198bae7f4aeSLuiz Capitulino 	if (hstate_is_gigantic(h))
2199aa888a74SAndi Kleen 		return;
2200aa888a74SAndi Kleen 
22016ae11b27SLee Schermerhorn 	for_each_node_mask(i, *nodes_allowed) {
22021da177e4SLinus Torvalds 		struct page *page, *next;
2203a5516438SAndi Kleen 		struct list_head *freel = &h->hugepage_freelists[i];
2204a5516438SAndi Kleen 		list_for_each_entry_safe(page, next, freel, lru) {
2205a5516438SAndi Kleen 			if (count >= h->nr_huge_pages)
22066b0c880dSAdam Litke 				return;
22071da177e4SLinus Torvalds 			if (PageHighMem(page))
22081da177e4SLinus Torvalds 				continue;
22091da177e4SLinus Torvalds 			list_del(&page->lru);
2210e5ff2159SAndi Kleen 			update_and_free_page(h, page);
2211a5516438SAndi Kleen 			h->free_huge_pages--;
2212a5516438SAndi Kleen 			h->free_huge_pages_node[page_to_nid(page)]--;
22131da177e4SLinus Torvalds 		}
22141da177e4SLinus Torvalds 	}
22151da177e4SLinus Torvalds }
22161da177e4SLinus Torvalds #else
22176ae11b27SLee Schermerhorn static inline void try_to_free_low(struct hstate *h, unsigned long count,
22186ae11b27SLee Schermerhorn 						nodemask_t *nodes_allowed)
22191da177e4SLinus Torvalds {
22201da177e4SLinus Torvalds }
22211da177e4SLinus Torvalds #endif
22221da177e4SLinus Torvalds 
222320a0307cSWu Fengguang /*
222420a0307cSWu Fengguang  * Increment or decrement surplus_huge_pages.  Keep node-specific counters
222520a0307cSWu Fengguang  * balanced by operating on them in a round-robin fashion.
222620a0307cSWu Fengguang  * Returns 1 if an adjustment was made.
222720a0307cSWu Fengguang  */
22286ae11b27SLee Schermerhorn static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
22296ae11b27SLee Schermerhorn 				int delta)
223020a0307cSWu Fengguang {
2231b2261026SJoonsoo Kim 	int nr_nodes, node;
223220a0307cSWu Fengguang 
223320a0307cSWu Fengguang 	VM_BUG_ON(delta != -1 && delta != 1);
223420a0307cSWu Fengguang 
2235e8c5c824SLee Schermerhorn 	if (delta < 0) {
2236b2261026SJoonsoo Kim 		for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
2237b2261026SJoonsoo Kim 			if (h->surplus_huge_pages_node[node])
2238b2261026SJoonsoo Kim 				goto found;
2239b2261026SJoonsoo Kim 		}
2240b2261026SJoonsoo Kim 	} else {
2241b2261026SJoonsoo Kim 		for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
2242b2261026SJoonsoo Kim 			if (h->surplus_huge_pages_node[node] <
2243b2261026SJoonsoo Kim 					h->nr_huge_pages_node[node])
2244b2261026SJoonsoo Kim 				goto found;
2245e8c5c824SLee Schermerhorn 		}
22469a76db09SLee Schermerhorn 	}
2247b2261026SJoonsoo Kim 	return 0;
224820a0307cSWu Fengguang 
2249b2261026SJoonsoo Kim found:
225020a0307cSWu Fengguang 	h->surplus_huge_pages += delta;
2251b2261026SJoonsoo Kim 	h->surplus_huge_pages_node[node] += delta;
2252b2261026SJoonsoo Kim 	return 1;
225320a0307cSWu Fengguang }
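
/*
 * For example, with delta == -1 and per-node surplus counts of {2, 0, 1}
 * on three allowed nodes, the first node visited by
 * for_each_node_mask_to_alloc() that still has surplus pages is
 * decremented; because the iterator advances its starting node on each
 * call, repeated adjustments are spread round-robin across the nodes.
 */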
225420a0307cSWu Fengguang 
2255a5516438SAndi Kleen #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
2256fd875dcaSMike Kravetz static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
22576ae11b27SLee Schermerhorn 			      nodemask_t *nodes_allowed)
22581da177e4SLinus Torvalds {
22597893d1d5SAdam Litke 	unsigned long min_count, ret;
2260f60858f9SMike Kravetz 	NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
2261f60858f9SMike Kravetz 
2262f60858f9SMike Kravetz 	/*
2263f60858f9SMike Kravetz 	 * Bit mask controlling how hard we retry per-node allocations.
2264f60858f9SMike Kravetz 	 * If we cannot allocate the bit mask, do not attempt to allocate
2265f60858f9SMike Kravetz 	 * the requested huge pages.
2266f60858f9SMike Kravetz 	 */
2267f60858f9SMike Kravetz 	if (node_alloc_noretry)
2268f60858f9SMike Kravetz 		nodes_clear(*node_alloc_noretry);
2269f60858f9SMike Kravetz 	else
2270f60858f9SMike Kravetz 		return -ENOMEM;
22711da177e4SLinus Torvalds 
22724eb0716eSAlexandre Ghiti 	spin_lock(&hugetlb_lock);
22734eb0716eSAlexandre Ghiti 
22744eb0716eSAlexandre Ghiti 	/*
2275fd875dcaSMike Kravetz 	 * Check for a node specific request.
2276fd875dcaSMike Kravetz 	 * Changing node specific huge page count may require a corresponding
2277fd875dcaSMike Kravetz 	 * change to the global count.  In any case, the passed node mask
2278fd875dcaSMike Kravetz 	 * (nodes_allowed) will restrict alloc/free to the specified node.
2279fd875dcaSMike Kravetz 	 */
2280fd875dcaSMike Kravetz 	if (nid != NUMA_NO_NODE) {
2281fd875dcaSMike Kravetz 		unsigned long old_count = count;
2282fd875dcaSMike Kravetz 
2283fd875dcaSMike Kravetz 		count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
2284fd875dcaSMike Kravetz 		/*
2285fd875dcaSMike Kravetz 		 * User may have specified a large count value which caused the
2286fd875dcaSMike Kravetz 		 * above calculation to overflow.  In this case, they wanted
2287fd875dcaSMike Kravetz 		 * to allocate as many huge pages as possible.  Set count to
2288fd875dcaSMike Kravetz 		 * largest possible value to align with their intention.
2289fd875dcaSMike Kravetz 		 */
2290fd875dcaSMike Kravetz 		if (count < old_count)
2291fd875dcaSMike Kravetz 			count = ULONG_MAX;
2292fd875dcaSMike Kravetz 	}
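
	/*
	 * Worked example with hypothetical numbers: with 10 huge pages
	 * globally, 4 of them on node 1, a request of 6 for node 1 becomes
	 * count = 6 + (10 - 4) = 12 globally; nodes_allowed then confines
	 * the 2 additional allocations to node 1.
	 */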
2293fd875dcaSMike Kravetz 
2294fd875dcaSMike Kravetz 	/*
22954eb0716eSAlexandre Ghiti 	 * Runtime allocation of gigantic pages depends on the capability to
22964eb0716eSAlexandre Ghiti 	 * allocate large, contiguous page ranges.
22974eb0716eSAlexandre Ghiti 	 * If the system does not provide this capability, return an error when
22984eb0716eSAlexandre Ghiti 	 * the user tries to allocate gigantic pages, but still let the user
22994eb0716eSAlexandre Ghiti 	 * free gigantic pages that were allocated at boot time.
23004eb0716eSAlexandre Ghiti 	 */
23014eb0716eSAlexandre Ghiti 	if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
23024eb0716eSAlexandre Ghiti 		if (count > persistent_huge_pages(h)) {
23034eb0716eSAlexandre Ghiti 			spin_unlock(&hugetlb_lock);
2304f60858f9SMike Kravetz 			NODEMASK_FREE(node_alloc_noretry);
23054eb0716eSAlexandre Ghiti 			return -EINVAL;
23064eb0716eSAlexandre Ghiti 		}
23074eb0716eSAlexandre Ghiti 		/* Fall through to decrease pool */
23084eb0716eSAlexandre Ghiti 	}
2309aa888a74SAndi Kleen 
23107893d1d5SAdam Litke 	/*
23117893d1d5SAdam Litke 	 * Increase the pool size
23127893d1d5SAdam Litke 	 * First take pages out of surplus state.  Then make up the
23137893d1d5SAdam Litke 	 * remaining difference by allocating fresh huge pages.
2314d1c3fb1fSNishanth Aravamudan 	 *
23150c397daeSMichal Hocko 	 * We might race with alloc_surplus_huge_page() here and be unable
2316d1c3fb1fSNishanth Aravamudan 	 * to convert a surplus huge page to a normal huge page. That is
2317d1c3fb1fSNishanth Aravamudan 	 * not critical, though, it just means the overall size of the
2318d1c3fb1fSNishanth Aravamudan 	 * pool might be one hugepage larger than it needs to be, but
2319d1c3fb1fSNishanth Aravamudan 	 * within all the constraints specified by the sysctls.
23207893d1d5SAdam Litke 	 */
2321a5516438SAndi Kleen 	while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
23226ae11b27SLee Schermerhorn 		if (!adjust_pool_surplus(h, nodes_allowed, -1))
23237893d1d5SAdam Litke 			break;
23247893d1d5SAdam Litke 	}
23257893d1d5SAdam Litke 
2326a5516438SAndi Kleen 	while (count > persistent_huge_pages(h)) {
23277893d1d5SAdam Litke 		/*
23287893d1d5SAdam Litke 		 * If this allocation races such that we no longer need the
23297893d1d5SAdam Litke 		 * page, free_huge_page will handle it by freeing the page
23307893d1d5SAdam Litke 		 * and reducing the surplus.
23317893d1d5SAdam Litke 		 */
23327893d1d5SAdam Litke 		spin_unlock(&hugetlb_lock);
2333649920c6SJia He 
2334649920c6SJia He 		/* yield cpu to avoid soft lockup */
2335649920c6SJia He 		cond_resched();
2336649920c6SJia He 
2337f60858f9SMike Kravetz 		ret = alloc_pool_huge_page(h, nodes_allowed,
2338f60858f9SMike Kravetz 						node_alloc_noretry);
23397893d1d5SAdam Litke 		spin_lock(&hugetlb_lock);
23407893d1d5SAdam Litke 		if (!ret)
23417893d1d5SAdam Litke 			goto out;
23427893d1d5SAdam Litke 
2343536240f2SMel Gorman 		/* Bail for signals. Probably ctrl-c from user */
2344536240f2SMel Gorman 		if (signal_pending(current))
2345536240f2SMel Gorman 			goto out;
23467893d1d5SAdam Litke 	}
23477893d1d5SAdam Litke 
23487893d1d5SAdam Litke 	/*
23497893d1d5SAdam Litke 	 * Decrease the pool size
23507893d1d5SAdam Litke 	 * First return free pages to the buddy allocator (being careful
23517893d1d5SAdam Litke 	 * to keep enough around to satisfy reservations).  Then place
23527893d1d5SAdam Litke 	 * pages into surplus state as needed so the pool will shrink
23537893d1d5SAdam Litke 	 * to the desired size as pages become free.
2354d1c3fb1fSNishanth Aravamudan 	 *
2355d1c3fb1fSNishanth Aravamudan 	 * By placing pages into the surplus state independent of the
2356d1c3fb1fSNishanth Aravamudan 	 * overcommit value, we are allowing the surplus pool size to
2357d1c3fb1fSNishanth Aravamudan 	 * exceed overcommit. There are few sane options here. Since
23580c397daeSMichal Hocko 	 * alloc_surplus_huge_page() is checking the global counter,
2359d1c3fb1fSNishanth Aravamudan 	 * though, we'll note that we're not allowed to exceed surplus
2360d1c3fb1fSNishanth Aravamudan 	 * and won't grow the pool anywhere else. Not until one of the
2361d1c3fb1fSNishanth Aravamudan 	 * sysctls are changed, or the surplus pages go out of use.
23627893d1d5SAdam Litke 	 */
2363a5516438SAndi Kleen 	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
23646b0c880dSAdam Litke 	min_count = max(count, min_count);
23656ae11b27SLee Schermerhorn 	try_to_free_low(h, min_count, nodes_allowed);
2366a5516438SAndi Kleen 	while (min_count < persistent_huge_pages(h)) {
23676ae11b27SLee Schermerhorn 		if (!free_pool_huge_page(h, nodes_allowed, 0))
23681da177e4SLinus Torvalds 			break;
236955f67141SMizuma, Masayoshi 		cond_resched_lock(&hugetlb_lock);
23701da177e4SLinus Torvalds 	}
2371a5516438SAndi Kleen 	while (count < persistent_huge_pages(h)) {
23726ae11b27SLee Schermerhorn 		if (!adjust_pool_surplus(h, nodes_allowed, 1))
23737893d1d5SAdam Litke 			break;
23747893d1d5SAdam Litke 	}
23757893d1d5SAdam Litke out:
23764eb0716eSAlexandre Ghiti 	h->max_huge_pages = persistent_huge_pages(h);
23771da177e4SLinus Torvalds 	spin_unlock(&hugetlb_lock);
23784eb0716eSAlexandre Ghiti 
2379f60858f9SMike Kravetz 	NODEMASK_FREE(node_alloc_noretry);
2380f60858f9SMike Kravetz 
23814eb0716eSAlexandre Ghiti 	return 0;
23821da177e4SLinus Torvalds }
23831da177e4SLinus Torvalds 
2384a3437870SNishanth Aravamudan #define HSTATE_ATTR_RO(_name) \
2385a3437870SNishanth Aravamudan 	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
2386a3437870SNishanth Aravamudan 
2387a3437870SNishanth Aravamudan #define HSTATE_ATTR(_name) \
2388a3437870SNishanth Aravamudan 	static struct kobj_attribute _name##_attr = \
2389a3437870SNishanth Aravamudan 		__ATTR(_name, 0644, _name##_show, _name##_store)
2390a3437870SNishanth Aravamudan 
2391a3437870SNishanth Aravamudan static struct kobject *hugepages_kobj;
2392a3437870SNishanth Aravamudan static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
2393a3437870SNishanth Aravamudan 
23949a305230SLee Schermerhorn static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
23959a305230SLee Schermerhorn 
23969a305230SLee Schermerhorn static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
2397a3437870SNishanth Aravamudan {
2398a3437870SNishanth Aravamudan 	int i;
23999a305230SLee Schermerhorn 
2400a3437870SNishanth Aravamudan 	for (i = 0; i < HUGE_MAX_HSTATE; i++)
24019a305230SLee Schermerhorn 		if (hstate_kobjs[i] == kobj) {
24029a305230SLee Schermerhorn 			if (nidp)
24039a305230SLee Schermerhorn 				*nidp = NUMA_NO_NODE;
2404a3437870SNishanth Aravamudan 			return &hstates[i];
24059a305230SLee Schermerhorn 		}
24069a305230SLee Schermerhorn 
24079a305230SLee Schermerhorn 	return kobj_to_node_hstate(kobj, nidp);
2408a3437870SNishanth Aravamudan }
2409a3437870SNishanth Aravamudan 
241006808b08SLee Schermerhorn static ssize_t nr_hugepages_show_common(struct kobject *kobj,
2411a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
2412a3437870SNishanth Aravamudan {
24139a305230SLee Schermerhorn 	struct hstate *h;
24149a305230SLee Schermerhorn 	unsigned long nr_huge_pages;
24159a305230SLee Schermerhorn 	int nid;
24169a305230SLee Schermerhorn 
24179a305230SLee Schermerhorn 	h = kobj_to_hstate(kobj, &nid);
24189a305230SLee Schermerhorn 	if (nid == NUMA_NO_NODE)
24199a305230SLee Schermerhorn 		nr_huge_pages = h->nr_huge_pages;
24209a305230SLee Schermerhorn 	else
24219a305230SLee Schermerhorn 		nr_huge_pages = h->nr_huge_pages_node[nid];
24229a305230SLee Schermerhorn 
24239a305230SLee Schermerhorn 	return sprintf(buf, "%lu\n", nr_huge_pages);
2424a3437870SNishanth Aravamudan }
2425adbe8726SEric B Munson 
2426238d3c13SDavid Rientjes static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
2427238d3c13SDavid Rientjes 					   struct hstate *h, int nid,
2428238d3c13SDavid Rientjes 					   unsigned long count, size_t len)
2429a3437870SNishanth Aravamudan {
2430a3437870SNishanth Aravamudan 	int err;
24312d0adf7eSOscar Salvador 	nodemask_t nodes_allowed, *n_mask;
2432a3437870SNishanth Aravamudan 
24332d0adf7eSOscar Salvador 	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
24342d0adf7eSOscar Salvador 		return -EINVAL;
2435adbe8726SEric B Munson 
24369a305230SLee Schermerhorn 	if (nid == NUMA_NO_NODE) {
24379a305230SLee Schermerhorn 		/*
24389a305230SLee Schermerhorn 		 * global hstate attribute
24399a305230SLee Schermerhorn 		 */
24409a305230SLee Schermerhorn 		if (!(obey_mempolicy &&
24412d0adf7eSOscar Salvador 				init_nodemask_of_mempolicy(&nodes_allowed)))
24422d0adf7eSOscar Salvador 			n_mask = &node_states[N_MEMORY];
24432d0adf7eSOscar Salvador 		else
24442d0adf7eSOscar Salvador 			n_mask = &nodes_allowed;
24452d0adf7eSOscar Salvador 	} else {
24469a305230SLee Schermerhorn 		/*
2447fd875dcaSMike Kravetz 		 * Node specific request.  count adjustment happens in
2448fd875dcaSMike Kravetz 		 * set_max_huge_pages() after acquiring hugetlb_lock.
24499a305230SLee Schermerhorn 		 */
24502d0adf7eSOscar Salvador 		init_nodemask_of_node(&nodes_allowed, nid);
24512d0adf7eSOscar Salvador 		n_mask = &nodes_allowed;
2452fd875dcaSMike Kravetz 	}
24539a305230SLee Schermerhorn 
24542d0adf7eSOscar Salvador 	err = set_max_huge_pages(h, count, nid, n_mask);
245506808b08SLee Schermerhorn 
24564eb0716eSAlexandre Ghiti 	return err ? err : len;
245706808b08SLee Schermerhorn }
245806808b08SLee Schermerhorn 
2459238d3c13SDavid Rientjes static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
2460238d3c13SDavid Rientjes 					 struct kobject *kobj, const char *buf,
2461238d3c13SDavid Rientjes 					 size_t len)
2462238d3c13SDavid Rientjes {
2463238d3c13SDavid Rientjes 	struct hstate *h;
2464238d3c13SDavid Rientjes 	unsigned long count;
2465238d3c13SDavid Rientjes 	int nid;
2466238d3c13SDavid Rientjes 	int err;
2467238d3c13SDavid Rientjes 
2468238d3c13SDavid Rientjes 	err = kstrtoul(buf, 10, &count);
2469238d3c13SDavid Rientjes 	if (err)
2470238d3c13SDavid Rientjes 		return err;
2471238d3c13SDavid Rientjes 
2472238d3c13SDavid Rientjes 	h = kobj_to_hstate(kobj, &nid);
2473238d3c13SDavid Rientjes 	return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
2474238d3c13SDavid Rientjes }
2475238d3c13SDavid Rientjes 
247606808b08SLee Schermerhorn static ssize_t nr_hugepages_show(struct kobject *kobj,
247706808b08SLee Schermerhorn 				       struct kobj_attribute *attr, char *buf)
247806808b08SLee Schermerhorn {
247906808b08SLee Schermerhorn 	return nr_hugepages_show_common(kobj, attr, buf);
248006808b08SLee Schermerhorn }
248106808b08SLee Schermerhorn 
248206808b08SLee Schermerhorn static ssize_t nr_hugepages_store(struct kobject *kobj,
248306808b08SLee Schermerhorn 	       struct kobj_attribute *attr, const char *buf, size_t len)
248406808b08SLee Schermerhorn {
2485238d3c13SDavid Rientjes 	return nr_hugepages_store_common(false, kobj, buf, len);
2486a3437870SNishanth Aravamudan }
2487a3437870SNishanth Aravamudan HSTATE_ATTR(nr_hugepages);
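
/*
 * nr_hugepages defined above is exposed under /sys/kernel/mm/hugepages/
 * once hugetlb_sysfs_init() runs.  Illustrative usage for a 2 MB hstate:
 *
 *	echo 64 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
 *	cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
 */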
2488a3437870SNishanth Aravamudan 
248906808b08SLee Schermerhorn #ifdef CONFIG_NUMA
249006808b08SLee Schermerhorn 
249106808b08SLee Schermerhorn /*
249206808b08SLee Schermerhorn  * hstate attribute for an optional mempolicy-based constraint on persistent
249306808b08SLee Schermerhorn  * huge page alloc/free.
249406808b08SLee Schermerhorn  */
249506808b08SLee Schermerhorn static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
249606808b08SLee Schermerhorn 				       struct kobj_attribute *attr, char *buf)
249706808b08SLee Schermerhorn {
249806808b08SLee Schermerhorn 	return nr_hugepages_show_common(kobj, attr, buf);
249906808b08SLee Schermerhorn }
250006808b08SLee Schermerhorn 
250106808b08SLee Schermerhorn static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
250206808b08SLee Schermerhorn 	       struct kobj_attribute *attr, const char *buf, size_t len)
250306808b08SLee Schermerhorn {
2504238d3c13SDavid Rientjes 	return nr_hugepages_store_common(true, kobj, buf, len);
250506808b08SLee Schermerhorn }
250606808b08SLee Schermerhorn HSTATE_ATTR(nr_hugepages_mempolicy);
250706808b08SLee Schermerhorn #endif
250806808b08SLee Schermerhorn 
250906808b08SLee Schermerhorn 
2510a3437870SNishanth Aravamudan static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
2511a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
2512a3437870SNishanth Aravamudan {
25139a305230SLee Schermerhorn 	struct hstate *h = kobj_to_hstate(kobj, NULL);
2514a3437870SNishanth Aravamudan 	return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
2515a3437870SNishanth Aravamudan }
2516adbe8726SEric B Munson 
2517a3437870SNishanth Aravamudan static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
2518a3437870SNishanth Aravamudan 		struct kobj_attribute *attr, const char *buf, size_t count)
2519a3437870SNishanth Aravamudan {
2520a3437870SNishanth Aravamudan 	int err;
2521a3437870SNishanth Aravamudan 	unsigned long input;
25229a305230SLee Schermerhorn 	struct hstate *h = kobj_to_hstate(kobj, NULL);
2523a3437870SNishanth Aravamudan 
2524bae7f4aeSLuiz Capitulino 	if (hstate_is_gigantic(h))
2525adbe8726SEric B Munson 		return -EINVAL;
2526adbe8726SEric B Munson 
25273dbb95f7SJingoo Han 	err = kstrtoul(buf, 10, &input);
2528a3437870SNishanth Aravamudan 	if (err)
252973ae31e5SEric B Munson 		return err;
2530a3437870SNishanth Aravamudan 
2531a3437870SNishanth Aravamudan 	spin_lock(&hugetlb_lock);
2532a3437870SNishanth Aravamudan 	h->nr_overcommit_huge_pages = input;
2533a3437870SNishanth Aravamudan 	spin_unlock(&hugetlb_lock);
2534a3437870SNishanth Aravamudan 
2535a3437870SNishanth Aravamudan 	return count;
2536a3437870SNishanth Aravamudan }
2537a3437870SNishanth Aravamudan HSTATE_ATTR(nr_overcommit_hugepages);
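
/*
 * The attribute above appears as, e.g. for a 2 MB hstate,
 * /sys/kernel/mm/hugepages/hugepages-2048kB/nr_overcommit_hugepages.
 * The equivalent sysctl for the default hstate is
 * /proc/sys/vm/nr_overcommit_hugepages, handled by
 * hugetlb_overcommit_handler() further below.
 */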
2538a3437870SNishanth Aravamudan 
2539a3437870SNishanth Aravamudan static ssize_t free_hugepages_show(struct kobject *kobj,
2540a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
2541a3437870SNishanth Aravamudan {
25429a305230SLee Schermerhorn 	struct hstate *h;
25439a305230SLee Schermerhorn 	unsigned long free_huge_pages;
25449a305230SLee Schermerhorn 	int nid;
25459a305230SLee Schermerhorn 
25469a305230SLee Schermerhorn 	h = kobj_to_hstate(kobj, &nid);
25479a305230SLee Schermerhorn 	if (nid == NUMA_NO_NODE)
25489a305230SLee Schermerhorn 		free_huge_pages = h->free_huge_pages;
25499a305230SLee Schermerhorn 	else
25509a305230SLee Schermerhorn 		free_huge_pages = h->free_huge_pages_node[nid];
25519a305230SLee Schermerhorn 
25529a305230SLee Schermerhorn 	return sprintf(buf, "%lu\n", free_huge_pages);
2553a3437870SNishanth Aravamudan }
2554a3437870SNishanth Aravamudan HSTATE_ATTR_RO(free_hugepages);
2555a3437870SNishanth Aravamudan 
2556a3437870SNishanth Aravamudan static ssize_t resv_hugepages_show(struct kobject *kobj,
2557a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
2558a3437870SNishanth Aravamudan {
25599a305230SLee Schermerhorn 	struct hstate *h = kobj_to_hstate(kobj, NULL);
2560a3437870SNishanth Aravamudan 	return sprintf(buf, "%lu\n", h->resv_huge_pages);
2561a3437870SNishanth Aravamudan }
2562a3437870SNishanth Aravamudan HSTATE_ATTR_RO(resv_hugepages);
2563a3437870SNishanth Aravamudan 
2564a3437870SNishanth Aravamudan static ssize_t surplus_hugepages_show(struct kobject *kobj,
2565a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
2566a3437870SNishanth Aravamudan {
25679a305230SLee Schermerhorn 	struct hstate *h;
25689a305230SLee Schermerhorn 	unsigned long surplus_huge_pages;
25699a305230SLee Schermerhorn 	int nid;
25709a305230SLee Schermerhorn 
25719a305230SLee Schermerhorn 	h = kobj_to_hstate(kobj, &nid);
25729a305230SLee Schermerhorn 	if (nid == NUMA_NO_NODE)
25739a305230SLee Schermerhorn 		surplus_huge_pages = h->surplus_huge_pages;
25749a305230SLee Schermerhorn 	else
25759a305230SLee Schermerhorn 		surplus_huge_pages = h->surplus_huge_pages_node[nid];
25769a305230SLee Schermerhorn 
25779a305230SLee Schermerhorn 	return sprintf(buf, "%lu\n", surplus_huge_pages);
2578a3437870SNishanth Aravamudan }
2579a3437870SNishanth Aravamudan HSTATE_ATTR_RO(surplus_hugepages);
2580a3437870SNishanth Aravamudan 
2581a3437870SNishanth Aravamudan static struct attribute *hstate_attrs[] = {
2582a3437870SNishanth Aravamudan 	&nr_hugepages_attr.attr,
2583a3437870SNishanth Aravamudan 	&nr_overcommit_hugepages_attr.attr,
2584a3437870SNishanth Aravamudan 	&free_hugepages_attr.attr,
2585a3437870SNishanth Aravamudan 	&resv_hugepages_attr.attr,
2586a3437870SNishanth Aravamudan 	&surplus_hugepages_attr.attr,
258706808b08SLee Schermerhorn #ifdef CONFIG_NUMA
258806808b08SLee Schermerhorn 	&nr_hugepages_mempolicy_attr.attr,
258906808b08SLee Schermerhorn #endif
2590a3437870SNishanth Aravamudan 	NULL,
2591a3437870SNishanth Aravamudan };
2592a3437870SNishanth Aravamudan 
259367e5ed96SArvind Yadav static const struct attribute_group hstate_attr_group = {
2594a3437870SNishanth Aravamudan 	.attrs = hstate_attrs,
2595a3437870SNishanth Aravamudan };
2596a3437870SNishanth Aravamudan 
2597094e9539SJeff Mahoney static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
25989a305230SLee Schermerhorn 				    struct kobject **hstate_kobjs,
259967e5ed96SArvind Yadav 				    const struct attribute_group *hstate_attr_group)
2600a3437870SNishanth Aravamudan {
2601a3437870SNishanth Aravamudan 	int retval;
2602972dc4deSAneesh Kumar K.V 	int hi = hstate_index(h);
2603a3437870SNishanth Aravamudan 
26049a305230SLee Schermerhorn 	hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
26059a305230SLee Schermerhorn 	if (!hstate_kobjs[hi])
2606a3437870SNishanth Aravamudan 		return -ENOMEM;
2607a3437870SNishanth Aravamudan 
26089a305230SLee Schermerhorn 	retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
2609a3437870SNishanth Aravamudan 	if (retval)
26109a305230SLee Schermerhorn 		kobject_put(hstate_kobjs[hi]);
2611a3437870SNishanth Aravamudan 
2612a3437870SNishanth Aravamudan 	return retval;
2613a3437870SNishanth Aravamudan }
2614a3437870SNishanth Aravamudan 
2615a3437870SNishanth Aravamudan static void __init hugetlb_sysfs_init(void)
2616a3437870SNishanth Aravamudan {
2617a3437870SNishanth Aravamudan 	struct hstate *h;
2618a3437870SNishanth Aravamudan 	int err;
2619a3437870SNishanth Aravamudan 
2620a3437870SNishanth Aravamudan 	hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
2621a3437870SNishanth Aravamudan 	if (!hugepages_kobj)
2622a3437870SNishanth Aravamudan 		return;
2623a3437870SNishanth Aravamudan 
2624a3437870SNishanth Aravamudan 	for_each_hstate(h) {
26259a305230SLee Schermerhorn 		err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
26269a305230SLee Schermerhorn 					 hstate_kobjs, &hstate_attr_group);
2627a3437870SNishanth Aravamudan 		if (err)
2628ffb22af5SAndrew Morton 			pr_err("Hugetlb: Unable to add hstate %s\n", h->name);
2629a3437870SNishanth Aravamudan 	}
2630a3437870SNishanth Aravamudan }
2631a3437870SNishanth Aravamudan 
26329a305230SLee Schermerhorn #ifdef CONFIG_NUMA
26339a305230SLee Schermerhorn 
26349a305230SLee Schermerhorn /*
26359a305230SLee Schermerhorn  * node_hstate/s - associate per node hstate attributes, via their kobjects,
263610fbcf4cSKay Sievers  * with node devices in node_devices[] using a parallel array.  The array
263710fbcf4cSKay Sievers  * index of a node device (and of its node_hstate) is the node id.
263810fbcf4cSKay Sievers  * This is here to avoid any static dependency of the node device driver, in
26399a305230SLee Schermerhorn  * the base kernel, on the hugetlb module.
26409a305230SLee Schermerhorn  */
26419a305230SLee Schermerhorn struct node_hstate {
26429a305230SLee Schermerhorn 	struct kobject		*hugepages_kobj;
26439a305230SLee Schermerhorn 	struct kobject		*hstate_kobjs[HUGE_MAX_HSTATE];
26449a305230SLee Schermerhorn };
2645b4e289a6SAlexander Kuleshov static struct node_hstate node_hstates[MAX_NUMNODES];
26469a305230SLee Schermerhorn 
26479a305230SLee Schermerhorn /*
264810fbcf4cSKay Sievers  * A subset of global hstate attributes for node devices
26499a305230SLee Schermerhorn  */
26509a305230SLee Schermerhorn static struct attribute *per_node_hstate_attrs[] = {
26519a305230SLee Schermerhorn 	&nr_hugepages_attr.attr,
26529a305230SLee Schermerhorn 	&free_hugepages_attr.attr,
26539a305230SLee Schermerhorn 	&surplus_hugepages_attr.attr,
26549a305230SLee Schermerhorn 	NULL,
26559a305230SLee Schermerhorn };
26569a305230SLee Schermerhorn 
265767e5ed96SArvind Yadav static const struct attribute_group per_node_hstate_attr_group = {
26589a305230SLee Schermerhorn 	.attrs = per_node_hstate_attrs,
26599a305230SLee Schermerhorn };
26609a305230SLee Schermerhorn 
26619a305230SLee Schermerhorn /*
266210fbcf4cSKay Sievers  * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
26639a305230SLee Schermerhorn  * Returns node id via non-NULL nidp.
26649a305230SLee Schermerhorn  */
26659a305230SLee Schermerhorn static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
26669a305230SLee Schermerhorn {
26679a305230SLee Schermerhorn 	int nid;
26689a305230SLee Schermerhorn 
26699a305230SLee Schermerhorn 	for (nid = 0; nid < nr_node_ids; nid++) {
26709a305230SLee Schermerhorn 		struct node_hstate *nhs = &node_hstates[nid];
26719a305230SLee Schermerhorn 		int i;
26729a305230SLee Schermerhorn 		for (i = 0; i < HUGE_MAX_HSTATE; i++)
26739a305230SLee Schermerhorn 			if (nhs->hstate_kobjs[i] == kobj) {
26749a305230SLee Schermerhorn 				if (nidp)
26759a305230SLee Schermerhorn 					*nidp = nid;
26769a305230SLee Schermerhorn 				return &hstates[i];
26779a305230SLee Schermerhorn 			}
26789a305230SLee Schermerhorn 	}
26799a305230SLee Schermerhorn 
26809a305230SLee Schermerhorn 	BUG();
26819a305230SLee Schermerhorn 	return NULL;
26829a305230SLee Schermerhorn }
26839a305230SLee Schermerhorn 
26849a305230SLee Schermerhorn /*
268510fbcf4cSKay Sievers  * Unregister hstate attributes from a single node device.
26869a305230SLee Schermerhorn  * No-op if no hstate attributes attached.
26879a305230SLee Schermerhorn  */
26883cd8b44fSClaudiu Ghioc static void hugetlb_unregister_node(struct node *node)
26899a305230SLee Schermerhorn {
26909a305230SLee Schermerhorn 	struct hstate *h;
269110fbcf4cSKay Sievers 	struct node_hstate *nhs = &node_hstates[node->dev.id];
26929a305230SLee Schermerhorn 
26939a305230SLee Schermerhorn 	if (!nhs->hugepages_kobj)
26949b5e5d0fSLee Schermerhorn 		return;		/* no hstate attributes */
26959a305230SLee Schermerhorn 
2696972dc4deSAneesh Kumar K.V 	for_each_hstate(h) {
2697972dc4deSAneesh Kumar K.V 		int idx = hstate_index(h);
2698972dc4deSAneesh Kumar K.V 		if (nhs->hstate_kobjs[idx]) {
2699972dc4deSAneesh Kumar K.V 			kobject_put(nhs->hstate_kobjs[idx]);
2700972dc4deSAneesh Kumar K.V 			nhs->hstate_kobjs[idx] = NULL;
2701972dc4deSAneesh Kumar K.V 		}
27029a305230SLee Schermerhorn 	}
27039a305230SLee Schermerhorn 
27049a305230SLee Schermerhorn 	kobject_put(nhs->hugepages_kobj);
27059a305230SLee Schermerhorn 	nhs->hugepages_kobj = NULL;
27069a305230SLee Schermerhorn }
27079a305230SLee Schermerhorn 
27089a305230SLee Schermerhorn 
27099a305230SLee Schermerhorn /*
271010fbcf4cSKay Sievers  * Register hstate attributes for a single node device.
27119a305230SLee Schermerhorn  * No-op if attributes already registered.
27129a305230SLee Schermerhorn  */
27133cd8b44fSClaudiu Ghioc static void hugetlb_register_node(struct node *node)
27149a305230SLee Schermerhorn {
27159a305230SLee Schermerhorn 	struct hstate *h;
271610fbcf4cSKay Sievers 	struct node_hstate *nhs = &node_hstates[node->dev.id];
27179a305230SLee Schermerhorn 	int err;
27189a305230SLee Schermerhorn 
27199a305230SLee Schermerhorn 	if (nhs->hugepages_kobj)
27209a305230SLee Schermerhorn 		return;		/* already allocated */
27219a305230SLee Schermerhorn 
27229a305230SLee Schermerhorn 	nhs->hugepages_kobj = kobject_create_and_add("hugepages",
272310fbcf4cSKay Sievers 							&node->dev.kobj);
27249a305230SLee Schermerhorn 	if (!nhs->hugepages_kobj)
27259a305230SLee Schermerhorn 		return;
27269a305230SLee Schermerhorn 
27279a305230SLee Schermerhorn 	for_each_hstate(h) {
27289a305230SLee Schermerhorn 		err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
27299a305230SLee Schermerhorn 						nhs->hstate_kobjs,
27309a305230SLee Schermerhorn 						&per_node_hstate_attr_group);
27319a305230SLee Schermerhorn 		if (err) {
2732ffb22af5SAndrew Morton 			pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
273310fbcf4cSKay Sievers 				h->name, node->dev.id);
27349a305230SLee Schermerhorn 			hugetlb_unregister_node(node);
27359a305230SLee Schermerhorn 			break;
27369a305230SLee Schermerhorn 		}
27379a305230SLee Schermerhorn 	}
27389a305230SLee Schermerhorn }
27399a305230SLee Schermerhorn 
27409a305230SLee Schermerhorn /*
27419b5e5d0fSLee Schermerhorn  * hugetlb init time:  register hstate attributes for all registered node
274210fbcf4cSKay Sievers  * devices of nodes that have memory.  All on-line nodes should have
274310fbcf4cSKay Sievers  * registered their associated device by this time.
27449a305230SLee Schermerhorn  */
27457d9ca000SLuiz Capitulino static void __init hugetlb_register_all_nodes(void)
27469a305230SLee Schermerhorn {
27479a305230SLee Schermerhorn 	int nid;
27489a305230SLee Schermerhorn 
27498cebfcd0SLai Jiangshan 	for_each_node_state(nid, N_MEMORY) {
27508732794bSWen Congyang 		struct node *node = node_devices[nid];
275110fbcf4cSKay Sievers 		if (node->dev.id == nid)
27529a305230SLee Schermerhorn 			hugetlb_register_node(node);
27539a305230SLee Schermerhorn 	}
27549a305230SLee Schermerhorn 
27559a305230SLee Schermerhorn 	/*
275610fbcf4cSKay Sievers 	 * Let the node device driver know we're here so it can
27579a305230SLee Schermerhorn 	 * [un]register hstate attributes on node hotplug.
27589a305230SLee Schermerhorn 	 */
27599a305230SLee Schermerhorn 	register_hugetlbfs_with_node(hugetlb_register_node,
27609a305230SLee Schermerhorn 				     hugetlb_unregister_node);
27619a305230SLee Schermerhorn }
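
/*
 * With CONFIG_NUMA, the per-node attributes registered above appear as,
 * for example (2 MB hstate, node 1):
 *
 *	/sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages
 */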
27629a305230SLee Schermerhorn #else	/* !CONFIG_NUMA */
27639a305230SLee Schermerhorn 
27649a305230SLee Schermerhorn static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
27659a305230SLee Schermerhorn {
27669a305230SLee Schermerhorn 	BUG();
27679a305230SLee Schermerhorn 	if (nidp)
27689a305230SLee Schermerhorn 		*nidp = -1;
27699a305230SLee Schermerhorn 	return NULL;
27709a305230SLee Schermerhorn }
27719a305230SLee Schermerhorn 
27729a305230SLee Schermerhorn static void hugetlb_register_all_nodes(void) { }
27739a305230SLee Schermerhorn 
27749a305230SLee Schermerhorn #endif
27759a305230SLee Schermerhorn 
2776a3437870SNishanth Aravamudan static int __init hugetlb_init(void)
2777a3437870SNishanth Aravamudan {
27788382d914SDavidlohr Bueso 	int i;
27798382d914SDavidlohr Bueso 
2780457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
27810ef89d25SBenjamin Herrenschmidt 		return 0;
2782a3437870SNishanth Aravamudan 
2783e11bfbfcSNick Piggin 	if (!size_to_hstate(default_hstate_size)) {
2784d715cf80SLiam R. Howlett 		if (default_hstate_size != 0) {
2785d715cf80SLiam R. Howlett 			pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n",
2786d715cf80SLiam R. Howlett 			       default_hstate_size, HPAGE_SIZE);
2787d715cf80SLiam R. Howlett 		}
2788d715cf80SLiam R. Howlett 
2789e11bfbfcSNick Piggin 		default_hstate_size = HPAGE_SIZE;
2790e11bfbfcSNick Piggin 		if (!size_to_hstate(default_hstate_size))
2791a3437870SNishanth Aravamudan 			hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
2792a3437870SNishanth Aravamudan 	}
2793972dc4deSAneesh Kumar K.V 	default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
2794f8b74815SVaishali Thakkar 	if (default_hstate_max_huge_pages) {
2795f8b74815SVaishali Thakkar 		if (!default_hstate.max_huge_pages)
2796e11bfbfcSNick Piggin 			default_hstate.max_huge_pages = default_hstate_max_huge_pages;
2797f8b74815SVaishali Thakkar 	}
2798a3437870SNishanth Aravamudan 
2799a3437870SNishanth Aravamudan 	hugetlb_init_hstates();
2800aa888a74SAndi Kleen 	gather_bootmem_prealloc();
2801a3437870SNishanth Aravamudan 	report_hugepages();
2802a3437870SNishanth Aravamudan 
2803a3437870SNishanth Aravamudan 	hugetlb_sysfs_init();
28049a305230SLee Schermerhorn 	hugetlb_register_all_nodes();
28057179e7bfSJianguo Wu 	hugetlb_cgroup_file_init();
28069a305230SLee Schermerhorn 
28078382d914SDavidlohr Bueso #ifdef CONFIG_SMP
28088382d914SDavidlohr Bueso 	num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
28098382d914SDavidlohr Bueso #else
28108382d914SDavidlohr Bueso 	num_fault_mutexes = 1;
28118382d914SDavidlohr Bueso #endif
2812c672c7f2SMike Kravetz 	hugetlb_fault_mutex_table =
28136da2ec56SKees Cook 		kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
28146da2ec56SKees Cook 			      GFP_KERNEL);
2815c672c7f2SMike Kravetz 	BUG_ON(!hugetlb_fault_mutex_table);
28168382d914SDavidlohr Bueso 
28178382d914SDavidlohr Bueso 	for (i = 0; i < num_fault_mutexes; i++)
2818c672c7f2SMike Kravetz 		mutex_init(&hugetlb_fault_mutex_table[i]);
2819a3437870SNishanth Aravamudan 	return 0;
2820a3437870SNishanth Aravamudan }
28213e89e1c5SPaul Gortmaker subsys_initcall(hugetlb_init);
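
/*
 * Example of the fault mutex table sizing in hugetlb_init(): on an 8-CPU
 * machine, 8 * num_possible_cpus() == 64 is already a power of two, so 64
 * mutexes are allocated; on a 6-CPU machine 48 rounds up to 64 as well.
 */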
2822a3437870SNishanth Aravamudan 
2823a3437870SNishanth Aravamudan /* Should be called on processing a hugepagesz=... option */
28249fee021dSVaishali Thakkar void __init hugetlb_bad_size(void)
28259fee021dSVaishali Thakkar {
28269fee021dSVaishali Thakkar 	parsed_valid_hugepagesz = false;
28279fee021dSVaishali Thakkar }
28289fee021dSVaishali Thakkar 
2829d00181b9SKirill A. Shutemov void __init hugetlb_add_hstate(unsigned int order)
2830a3437870SNishanth Aravamudan {
2831a3437870SNishanth Aravamudan 	struct hstate *h;
28328faa8b07SAndi Kleen 	unsigned long i;
28338faa8b07SAndi Kleen 
2834a3437870SNishanth Aravamudan 	if (size_to_hstate(PAGE_SIZE << order)) {
2835598d8091SJoe Perches 		pr_warn("hugepagesz= specified twice, ignoring\n");
2836a3437870SNishanth Aravamudan 		return;
2837a3437870SNishanth Aravamudan 	}
283847d38344SAneesh Kumar K.V 	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
2839a3437870SNishanth Aravamudan 	BUG_ON(order == 0);
284047d38344SAneesh Kumar K.V 	h = &hstates[hugetlb_max_hstate++];
2841a3437870SNishanth Aravamudan 	h->order = order;
2842a3437870SNishanth Aravamudan 	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
28438faa8b07SAndi Kleen 	h->nr_huge_pages = 0;
28448faa8b07SAndi Kleen 	h->free_huge_pages = 0;
28458faa8b07SAndi Kleen 	for (i = 0; i < MAX_NUMNODES; ++i)
28468faa8b07SAndi Kleen 		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
28470edaecfaSAneesh Kumar K.V 	INIT_LIST_HEAD(&h->hugepage_activelist);
284854f18d35SAndrew Morton 	h->next_nid_to_alloc = first_memory_node;
284954f18d35SAndrew Morton 	h->next_nid_to_free = first_memory_node;
2850a3437870SNishanth Aravamudan 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
2851a3437870SNishanth Aravamudan 					huge_page_size(h)/1024);
28528faa8b07SAndi Kleen 
2853a3437870SNishanth Aravamudan 	parsed_hstate = h;
2854a3437870SNishanth Aravamudan }
2855a3437870SNishanth Aravamudan 
2856e11bfbfcSNick Piggin static int __init hugetlb_nrpages_setup(char *s)
2857a3437870SNishanth Aravamudan {
2858a3437870SNishanth Aravamudan 	unsigned long *mhp;
28598faa8b07SAndi Kleen 	static unsigned long *last_mhp;
2860a3437870SNishanth Aravamudan 
28619fee021dSVaishali Thakkar 	if (!parsed_valid_hugepagesz) {
28629fee021dSVaishali Thakkar 		pr_warn("hugepages = %s preceded by an unsupported hugepagesz, ignoring\n",
28639fee021dSVaishali Thakkar 			s);
28649fee021dSVaishali Thakkar 		parsed_valid_hugepagesz = true;
28659fee021dSVaishali Thakkar 		return 1;
28669fee021dSVaishali Thakkar 	}
2867a3437870SNishanth Aravamudan 	/*
286847d38344SAneesh Kumar K.V 	 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
2869a3437870SNishanth Aravamudan 	 * so this hugepages= parameter goes to the "default hstate".
2870a3437870SNishanth Aravamudan 	 */
28719fee021dSVaishali Thakkar 	else if (!hugetlb_max_hstate)
2872a3437870SNishanth Aravamudan 		mhp = &default_hstate_max_huge_pages;
2873a3437870SNishanth Aravamudan 	else
2874a3437870SNishanth Aravamudan 		mhp = &parsed_hstate->max_huge_pages;
2875a3437870SNishanth Aravamudan 
28768faa8b07SAndi Kleen 	if (mhp == last_mhp) {
2877598d8091SJoe Perches 		pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n");
28788faa8b07SAndi Kleen 		return 1;
28798faa8b07SAndi Kleen 	}
28808faa8b07SAndi Kleen 
2881a3437870SNishanth Aravamudan 	if (sscanf(s, "%lu", mhp) <= 0)
2882a3437870SNishanth Aravamudan 		*mhp = 0;
2883a3437870SNishanth Aravamudan 
28848faa8b07SAndi Kleen 	/*
28858faa8b07SAndi Kleen 	 * Global state is always initialized later in hugetlb_init.
28868faa8b07SAndi Kleen 	 * But for gigantic hstates (order >= MAX_ORDER) the pages must be
28878faa8b07SAndi Kleen 	 * allocated here, early enough to still use the bootmem allocator.
28888faa8b07SAndi Kleen 	 */
288947d38344SAneesh Kumar K.V 	if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
28908faa8b07SAndi Kleen 		hugetlb_hstate_alloc_pages(parsed_hstate);
28918faa8b07SAndi Kleen 
28928faa8b07SAndi Kleen 	last_mhp = mhp;
28938faa8b07SAndi Kleen 
2894a3437870SNishanth Aravamudan 	return 1;
2895a3437870SNishanth Aravamudan }
2896e11bfbfcSNick Piggin __setup("hugepages=", hugetlb_nrpages_setup);
2897e11bfbfcSNick Piggin 
2898e11bfbfcSNick Piggin static int __init hugetlb_default_setup(char *s)
2899e11bfbfcSNick Piggin {
2900e11bfbfcSNick Piggin 	default_hstate_size = memparse(s, &s);
2901e11bfbfcSNick Piggin 	return 1;
2902e11bfbfcSNick Piggin }
2903e11bfbfcSNick Piggin __setup("default_hugepagesz=", hugetlb_default_setup);
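
/*
 * Illustrative boot command line combining the parameters parsed above:
 *
 *	default_hugepagesz=1G hugepagesz=1G hugepages=4 hugepagesz=2M hugepages=512
 *
 * Each "hugepages=" applies to the most recently seen "hugepagesz=", or to
 * the default hstate if no "hugepagesz=" has been parsed yet.
 */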
2904a3437870SNishanth Aravamudan 
29058a213460SNishanth Aravamudan static unsigned int cpuset_mems_nr(unsigned int *array)
29068a213460SNishanth Aravamudan {
29078a213460SNishanth Aravamudan 	int node;
29088a213460SNishanth Aravamudan 	unsigned int nr = 0;
29098a213460SNishanth Aravamudan 
29108a213460SNishanth Aravamudan 	for_each_node_mask(node, cpuset_current_mems_allowed)
29118a213460SNishanth Aravamudan 		nr += array[node];
29128a213460SNishanth Aravamudan 
29138a213460SNishanth Aravamudan 	return nr;
29148a213460SNishanth Aravamudan }
29158a213460SNishanth Aravamudan 
29168a213460SNishanth Aravamudan #ifdef CONFIG_SYSCTL
291706808b08SLee Schermerhorn static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
291806808b08SLee Schermerhorn 			 struct ctl_table *table, int write,
291906808b08SLee Schermerhorn 			 void __user *buffer, size_t *length, loff_t *ppos)
29201da177e4SLinus Torvalds {
2921e5ff2159SAndi Kleen 	struct hstate *h = &default_hstate;
2922238d3c13SDavid Rientjes 	unsigned long tmp = h->max_huge_pages;
292308d4a246SMichal Hocko 	int ret;
2924e5ff2159SAndi Kleen 
2925457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
292686613628SJan Stancek 		return -EOPNOTSUPP;
2927457c1b27SNishanth Aravamudan 
2928e5ff2159SAndi Kleen 	table->data = &tmp;
2929e5ff2159SAndi Kleen 	table->maxlen = sizeof(unsigned long);
293008d4a246SMichal Hocko 	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
293108d4a246SMichal Hocko 	if (ret)
293208d4a246SMichal Hocko 		goto out;
2933e5ff2159SAndi Kleen 
2934238d3c13SDavid Rientjes 	if (write)
2935238d3c13SDavid Rientjes 		ret = __nr_hugepages_store_common(obey_mempolicy, h,
2936238d3c13SDavid Rientjes 						  NUMA_NO_NODE, tmp, *length);
293708d4a246SMichal Hocko out:
293808d4a246SMichal Hocko 	return ret;
29391da177e4SLinus Torvalds }
2940396faf03SMel Gorman 
294106808b08SLee Schermerhorn int hugetlb_sysctl_handler(struct ctl_table *table, int write,
294206808b08SLee Schermerhorn 			  void __user *buffer, size_t *length, loff_t *ppos)
294306808b08SLee Schermerhorn {
294406808b08SLee Schermerhorn 
294506808b08SLee Schermerhorn 	return hugetlb_sysctl_handler_common(false, table, write,
294606808b08SLee Schermerhorn 							buffer, length, ppos);
294706808b08SLee Schermerhorn }
294806808b08SLee Schermerhorn 
294906808b08SLee Schermerhorn #ifdef CONFIG_NUMA
295006808b08SLee Schermerhorn int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
295106808b08SLee Schermerhorn 			  void __user *buffer, size_t *length, loff_t *ppos)
295206808b08SLee Schermerhorn {
295306808b08SLee Schermerhorn 	return hugetlb_sysctl_handler_common(true, table, write,
295406808b08SLee Schermerhorn 							buffer, length, ppos);
295506808b08SLee Schermerhorn }
295606808b08SLee Schermerhorn #endif /* CONFIG_NUMA */
295706808b08SLee Schermerhorn 
2958a3d0c6aaSNishanth Aravamudan int hugetlb_overcommit_handler(struct ctl_table *table, int write,
29598d65af78SAlexey Dobriyan 			void __user *buffer,
2960a3d0c6aaSNishanth Aravamudan 			size_t *length, loff_t *ppos)
2961a3d0c6aaSNishanth Aravamudan {
2962a5516438SAndi Kleen 	struct hstate *h = &default_hstate;
2963e5ff2159SAndi Kleen 	unsigned long tmp;
296408d4a246SMichal Hocko 	int ret;
2965e5ff2159SAndi Kleen 
2966457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
296786613628SJan Stancek 		return -EOPNOTSUPP;
2968457c1b27SNishanth Aravamudan 
2969e5ff2159SAndi Kleen 	tmp = h->nr_overcommit_huge_pages;
2970e5ff2159SAndi Kleen 
2971bae7f4aeSLuiz Capitulino 	if (write && hstate_is_gigantic(h))
2972adbe8726SEric B Munson 		return -EINVAL;
2973adbe8726SEric B Munson 
2974e5ff2159SAndi Kleen 	table->data = &tmp;
2975e5ff2159SAndi Kleen 	table->maxlen = sizeof(unsigned long);
297608d4a246SMichal Hocko 	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
297708d4a246SMichal Hocko 	if (ret)
297808d4a246SMichal Hocko 		goto out;
2979e5ff2159SAndi Kleen 
2980e5ff2159SAndi Kleen 	if (write) {
2981064d9efeSNishanth Aravamudan 		spin_lock(&hugetlb_lock);
2982e5ff2159SAndi Kleen 		h->nr_overcommit_huge_pages = tmp;
2983a3d0c6aaSNishanth Aravamudan 		spin_unlock(&hugetlb_lock);
2984e5ff2159SAndi Kleen 	}
298508d4a246SMichal Hocko out:
298608d4a246SMichal Hocko 	return ret;
2987a3d0c6aaSNishanth Aravamudan }
2988a3d0c6aaSNishanth Aravamudan 
29891da177e4SLinus Torvalds #endif /* CONFIG_SYSCTL */
29901da177e4SLinus Torvalds 
2991e1759c21SAlexey Dobriyan void hugetlb_report_meminfo(struct seq_file *m)
29921da177e4SLinus Torvalds {
2993fcb2b0c5SRoman Gushchin 	struct hstate *h;
2994fcb2b0c5SRoman Gushchin 	unsigned long total = 0;
2995fcb2b0c5SRoman Gushchin 
2996457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
2997457c1b27SNishanth Aravamudan 		return;
2998fcb2b0c5SRoman Gushchin 
2999fcb2b0c5SRoman Gushchin 	for_each_hstate(h) {
3000fcb2b0c5SRoman Gushchin 		unsigned long count = h->nr_huge_pages;
3001fcb2b0c5SRoman Gushchin 
3002fcb2b0c5SRoman Gushchin 		total += (PAGE_SIZE << huge_page_order(h)) * count;
3003fcb2b0c5SRoman Gushchin 
3004fcb2b0c5SRoman Gushchin 		if (h == &default_hstate)
3005e1759c21SAlexey Dobriyan 			seq_printf(m,
30061da177e4SLinus Torvalds 				   "HugePages_Total:   %5lu\n"
30071da177e4SLinus Torvalds 				   "HugePages_Free:    %5lu\n"
3008b45b5bd6SDavid Gibson 				   "HugePages_Rsvd:    %5lu\n"
30097893d1d5SAdam Litke 				   "HugePages_Surp:    %5lu\n"
30104f98a2feSRik van Riel 				   "Hugepagesize:   %8lu kB\n",
3011fcb2b0c5SRoman Gushchin 				   count,
3012a5516438SAndi Kleen 				   h->free_huge_pages,
3013a5516438SAndi Kleen 				   h->resv_huge_pages,
3014a5516438SAndi Kleen 				   h->surplus_huge_pages,
3015fcb2b0c5SRoman Gushchin 				   (PAGE_SIZE << huge_page_order(h)) / 1024);
3016fcb2b0c5SRoman Gushchin 	}
3017fcb2b0c5SRoman Gushchin 
3018fcb2b0c5SRoman Gushchin 	seq_printf(m, "Hugetlb:        %8lu kB\n", total / 1024);
30191da177e4SLinus Torvalds }
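
/*
 * Illustrative /proc/meminfo excerpt produced above (values are examples;
 * 2 MB default hstate, 64 pages total):
 *
 *	HugePages_Total:      64
 *	HugePages_Free:       60
 *	HugePages_Rsvd:        4
 *	HugePages_Surp:        0
 *	Hugepagesize:       2048 kB
 *	Hugetlb:          131072 kB
 */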
30201da177e4SLinus Torvalds 
30211da177e4SLinus Torvalds int hugetlb_report_node_meminfo(int nid, char *buf)
30221da177e4SLinus Torvalds {
3023a5516438SAndi Kleen 	struct hstate *h = &default_hstate;
3024457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
3025457c1b27SNishanth Aravamudan 		return 0;
30261da177e4SLinus Torvalds 	return sprintf(buf,
30271da177e4SLinus Torvalds 		"Node %d HugePages_Total: %5u\n"
3028a1de0919SNishanth Aravamudan 		"Node %d HugePages_Free:  %5u\n"
3029a1de0919SNishanth Aravamudan 		"Node %d HugePages_Surp:  %5u\n",
3030a5516438SAndi Kleen 		nid, h->nr_huge_pages_node[nid],
3031a5516438SAndi Kleen 		nid, h->free_huge_pages_node[nid],
3032a5516438SAndi Kleen 		nid, h->surplus_huge_pages_node[nid]);
30331da177e4SLinus Torvalds }
30341da177e4SLinus Torvalds 
3035949f7ec5SDavid Rientjes void hugetlb_show_meminfo(void)
3036949f7ec5SDavid Rientjes {
3037949f7ec5SDavid Rientjes 	struct hstate *h;
3038949f7ec5SDavid Rientjes 	int nid;
3039949f7ec5SDavid Rientjes 
3040457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
3041457c1b27SNishanth Aravamudan 		return;
3042457c1b27SNishanth Aravamudan 
3043949f7ec5SDavid Rientjes 	for_each_node_state(nid, N_MEMORY)
3044949f7ec5SDavid Rientjes 		for_each_hstate(h)
3045949f7ec5SDavid Rientjes 			pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
3046949f7ec5SDavid Rientjes 				nid,
3047949f7ec5SDavid Rientjes 				h->nr_huge_pages_node[nid],
3048949f7ec5SDavid Rientjes 				h->free_huge_pages_node[nid],
3049949f7ec5SDavid Rientjes 				h->surplus_huge_pages_node[nid],
3050949f7ec5SDavid Rientjes 				1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
3051949f7ec5SDavid Rientjes }
3052949f7ec5SDavid Rientjes 
30535d317b2bSNaoya Horiguchi void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
30545d317b2bSNaoya Horiguchi {
30555d317b2bSNaoya Horiguchi 	seq_printf(m, "HugetlbPages:\t%8lu kB\n",
30565d317b2bSNaoya Horiguchi 		   atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
30575d317b2bSNaoya Horiguchi }
30585d317b2bSNaoya Horiguchi 
30591da177e4SLinus Torvalds /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
30601da177e4SLinus Torvalds unsigned long hugetlb_total_pages(void)
30611da177e4SLinus Torvalds {
3062d0028588SWanpeng Li 	struct hstate *h;
3063d0028588SWanpeng Li 	unsigned long nr_total_pages = 0;
3064d0028588SWanpeng Li 
3065d0028588SWanpeng Li 	for_each_hstate(h)
3066d0028588SWanpeng Li 		nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
3067d0028588SWanpeng Li 	return nr_total_pages;
30681da177e4SLinus Torvalds }
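
/*
 * For example, 64 huge pages of 2 MB each contribute 64 * 512 == 32768
 * PAGE_SIZE units (with 4 KB base pages) to the total returned above.
 */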
30691da177e4SLinus Torvalds 
3070a5516438SAndi Kleen static int hugetlb_acct_memory(struct hstate *h, long delta)
3071fc1b8a73SMel Gorman {
3072fc1b8a73SMel Gorman 	int ret = -ENOMEM;
3073fc1b8a73SMel Gorman 
3074fc1b8a73SMel Gorman 	spin_lock(&hugetlb_lock);
3075fc1b8a73SMel Gorman 	/*
3076fc1b8a73SMel Gorman 	 * When cpuset is configured, it breaks the strict hugetlb page
3077fc1b8a73SMel Gorman 	 * reservation as the accounting is done on a global variable. Such
3078fc1b8a73SMel Gorman 	 * reservation is completely rubbish in the presence of cpuset because
3079fc1b8a73SMel Gorman 	 * the reservation is not checked against page availability for the
3080fc1b8a73SMel Gorman 	 * current cpuset. An application can still be OOM'ed by the kernel
3081fc1b8a73SMel Gorman 	 * if its cpuset lacks free hugetlb pages.
3082fc1b8a73SMel Gorman 	 * Attempting to enforce strict accounting with cpusets is almost
3083fc1b8a73SMel Gorman 	 * impossible (or too ugly) because cpusets are so fluid that a
3084fc1b8a73SMel Gorman 	 * task or memory node can be dynamically moved between them.
3085fc1b8a73SMel Gorman 	 *
3086fc1b8a73SMel Gorman 	 * The change of semantics for shared hugetlb mapping with cpuset is
3087fc1b8a73SMel Gorman 	 * undesirable. However, in order to preserve some of the semantics,
3088fc1b8a73SMel Gorman 	 * we fall back to check against current free page availability as
3089fc1b8a73SMel Gorman 	 * a best attempt and hopefully to minimize the impact of changing
3090fc1b8a73SMel Gorman 	 * semantics that cpuset has.
3091fc1b8a73SMel Gorman 	 */
3092fc1b8a73SMel Gorman 	if (delta > 0) {
3093a5516438SAndi Kleen 		if (gather_surplus_pages(h, delta) < 0)
3094fc1b8a73SMel Gorman 			goto out;
3095fc1b8a73SMel Gorman 
3096a5516438SAndi Kleen 		if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
3097a5516438SAndi Kleen 			return_unused_surplus_pages(h, delta);
3098fc1b8a73SMel Gorman 			goto out;
3099fc1b8a73SMel Gorman 		}
3100fc1b8a73SMel Gorman 	}
3101fc1b8a73SMel Gorman 
3102fc1b8a73SMel Gorman 	ret = 0;
3103fc1b8a73SMel Gorman 	if (delta < 0)
3104a5516438SAndi Kleen 		return_unused_surplus_pages(h, (unsigned long) -delta);
3105fc1b8a73SMel Gorman 
3106fc1b8a73SMel Gorman out:
3107fc1b8a73SMel Gorman 	spin_unlock(&hugetlb_lock);
3108fc1b8a73SMel Gorman 	return ret;
3109fc1b8a73SMel Gorman }
3110fc1b8a73SMel Gorman 
311184afd99bSAndy Whitcroft static void hugetlb_vm_op_open(struct vm_area_struct *vma)
311284afd99bSAndy Whitcroft {
3113f522c3acSJoonsoo Kim 	struct resv_map *resv = vma_resv_map(vma);
311484afd99bSAndy Whitcroft 
311584afd99bSAndy Whitcroft 	/*
311684afd99bSAndy Whitcroft 	 * This new VMA should share its siblings reservation map if present.
311784afd99bSAndy Whitcroft 	 * The VMA will only ever have a valid reservation map pointer where
311884afd99bSAndy Whitcroft 	 * it is being copied for another still existing VMA.  As that VMA
311925985edcSLucas De Marchi 	 * has a reference to the reservation map it cannot disappear until
312084afd99bSAndy Whitcroft 	 * after this open call completes.  It is therefore safe to take a
312184afd99bSAndy Whitcroft 	 * new reference here without additional locking.
312284afd99bSAndy Whitcroft 	 */
31234e35f483SJoonsoo Kim 	if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
3124f522c3acSJoonsoo Kim 		kref_get(&resv->refs);
312584afd99bSAndy Whitcroft }
312684afd99bSAndy Whitcroft 
3127a1e78772SMel Gorman static void hugetlb_vm_op_close(struct vm_area_struct *vma)
3128a1e78772SMel Gorman {
3129a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
3130f522c3acSJoonsoo Kim 	struct resv_map *resv = vma_resv_map(vma);
313190481622SDavid Gibson 	struct hugepage_subpool *spool = subpool_vma(vma);
31324e35f483SJoonsoo Kim 	unsigned long reserve, start, end;
31331c5ecae3SMike Kravetz 	long gbl_reserve;
313484afd99bSAndy Whitcroft 
31354e35f483SJoonsoo Kim 	if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
31364e35f483SJoonsoo Kim 		return;
31374e35f483SJoonsoo Kim 
3138a5516438SAndi Kleen 	start = vma_hugecache_offset(h, vma, vma->vm_start);
3139a5516438SAndi Kleen 	end = vma_hugecache_offset(h, vma, vma->vm_end);
314084afd99bSAndy Whitcroft 
31414e35f483SJoonsoo Kim 	reserve = (end - start) - region_count(resv, start, end);
314284afd99bSAndy Whitcroft 
3143f031dd27SJoonsoo Kim 	kref_put(&resv->refs, resv_map_release);
314484afd99bSAndy Whitcroft 
31457251ff78SAdam Litke 	if (reserve) {
31461c5ecae3SMike Kravetz 		/*
31471c5ecae3SMike Kravetz 		 * Decrement reserve counts.  The global reserve count may be
31481c5ecae3SMike Kravetz 		 * adjusted if the subpool has a minimum size.
31491c5ecae3SMike Kravetz 		 */
31501c5ecae3SMike Kravetz 		gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
31511c5ecae3SMike Kravetz 		hugetlb_acct_memory(h, -gbl_reserve);
31527251ff78SAdam Litke 	}
3153a1e78772SMel Gorman }
3154a1e78772SMel Gorman 
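/*
 * Disallow splitting a hugetlb VMA at an address that is not aligned to the
 * huge page size of the backing hstate.
 */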
315531383c68SDan Williams static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
315631383c68SDan Williams {
315731383c68SDan Williams 	if (addr & ~(huge_page_mask(hstate_vma(vma))))
315831383c68SDan Williams 		return -EINVAL;
315931383c68SDan Williams 	return 0;
316031383c68SDan Williams }
316131383c68SDan Williams 
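/* Report the size, in bytes, of the huge pages backing this VMA. */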
316205ea8860SDan Williams static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
316305ea8860SDan Williams {
316405ea8860SDan Williams 	struct hstate *hstate = hstate_vma(vma);
316505ea8860SDan Williams 
316605ea8860SDan Williams 	return 1UL << huge_page_shift(hstate);
316705ea8860SDan Williams }
316805ea8860SDan Williams 
31691da177e4SLinus Torvalds /*
31701da177e4SLinus Torvalds  * We cannot handle pagefaults against hugetlb pages at all.  They cause
31711da177e4SLinus Torvalds  * handle_mm_fault() to try to instantiate regular-sized pages in the
31721da177e4SLinus Torvalds  * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
31731da177e4SLinus Torvalds  * this far.
31741da177e4SLinus Torvalds  */
3175b3ec9f33SSouptick Joarder static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
31761da177e4SLinus Torvalds {
31771da177e4SLinus Torvalds 	BUG();
3178d0217ac0SNick Piggin 	return 0;
31791da177e4SLinus Torvalds }
31801da177e4SLinus Torvalds 
3181eec3636aSJane Chu /*
3182eec3636aSJane Chu  * When a new function is introduced to vm_operations_struct and added
3183eec3636aSJane Chu  * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
3184eec3636aSJane Chu  * This is because, under the System V memory model, mappings created via
3185eec3636aSJane Chu  * shmget/shmat with "huge page" specified are backed by hugetlbfs files,
3186eec3636aSJane Chu  * but their original vm_ops are overwritten with shm_vm_ops.
3187eec3636aSJane Chu  */
3188f0f37e2fSAlexey Dobriyan const struct vm_operations_struct hugetlb_vm_ops = {
3189d0217ac0SNick Piggin 	.fault = hugetlb_vm_op_fault,
319084afd99bSAndy Whitcroft 	.open = hugetlb_vm_op_open,
3191a1e78772SMel Gorman 	.close = hugetlb_vm_op_close,
319231383c68SDan Williams 	.split = hugetlb_vm_op_split,
319305ea8860SDan Williams 	.pagesize = hugetlb_vm_op_pagesize,
31941da177e4SLinus Torvalds };
31951da177e4SLinus Torvalds 
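/*
 * Build a huge pte for @page using the VMA's protection bits: writable
 * mappings get a dirty, writable entry, others a write-protected one.  The
 * entry is marked young and huge, with a final architecture-specific fixup
 * via arch_make_huge_pte().
 */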
31961e8f889bSDavid Gibson static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
31971e8f889bSDavid Gibson 				int writable)
319863551ae0SDavid Gibson {
319963551ae0SDavid Gibson 	pte_t entry;
320063551ae0SDavid Gibson 
32011e8f889bSDavid Gibson 	if (writable) {
3202106c992aSGerald Schaefer 		entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
3203106c992aSGerald Schaefer 					 vma->vm_page_prot)));
320463551ae0SDavid Gibson 	} else {
3205106c992aSGerald Schaefer 		entry = huge_pte_wrprotect(mk_huge_pte(page,
3206106c992aSGerald Schaefer 					   vma->vm_page_prot));
320763551ae0SDavid Gibson 	}
320863551ae0SDavid Gibson 	entry = pte_mkyoung(entry);
320963551ae0SDavid Gibson 	entry = pte_mkhuge(entry);
3210d9ed9faaSChris Metcalf 	entry = arch_make_huge_pte(entry, vma, page, writable);
321163551ae0SDavid Gibson 
321263551ae0SDavid Gibson 	return entry;
321363551ae0SDavid Gibson }
321463551ae0SDavid Gibson 
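/*
 * Make an existing huge pte writable and dirty, updating the MMU cache if
 * the entry actually changed.
 */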
32151e8f889bSDavid Gibson static void set_huge_ptep_writable(struct vm_area_struct *vma,
32161e8f889bSDavid Gibson 				   unsigned long address, pte_t *ptep)
32171e8f889bSDavid Gibson {
32181e8f889bSDavid Gibson 	pte_t entry;
32191e8f889bSDavid Gibson 
3220106c992aSGerald Schaefer 	entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
322132f84528SChris Forbes 	if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
32224b3073e1SRussell King 		update_mmu_cache(vma, address, ptep);
32231e8f889bSDavid Gibson }
32241e8f889bSDavid Gibson 
3225d5ed7444SAneesh Kumar K.V bool is_hugetlb_entry_migration(pte_t pte)
32264a705fefSNaoya Horiguchi {
32274a705fefSNaoya Horiguchi 	swp_entry_t swp;
32284a705fefSNaoya Horiguchi 
32294a705fefSNaoya Horiguchi 	if (huge_pte_none(pte) || pte_present(pte))
3230d5ed7444SAneesh Kumar K.V 		return false;
32314a705fefSNaoya Horiguchi 	swp = pte_to_swp_entry(pte);
32324a705fefSNaoya Horiguchi 	if (non_swap_entry(swp) && is_migration_entry(swp))
3233d5ed7444SAneesh Kumar K.V 		return true;
32344a705fefSNaoya Horiguchi 	else
3235d5ed7444SAneesh Kumar K.V 		return false;
32364a705fefSNaoya Horiguchi }
32374a705fefSNaoya Horiguchi 
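/* Like is_hugetlb_entry_migration(), but checks for a hwpoison swap entry. */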
32384a705fefSNaoya Horiguchi static int is_hugetlb_entry_hwpoisoned(pte_t pte)
32394a705fefSNaoya Horiguchi {
32404a705fefSNaoya Horiguchi 	swp_entry_t swp;
32414a705fefSNaoya Horiguchi 
32424a705fefSNaoya Horiguchi 	if (huge_pte_none(pte) || pte_present(pte))
32434a705fefSNaoya Horiguchi 		return 0;
32444a705fefSNaoya Horiguchi 	swp = pte_to_swp_entry(pte);
32454a705fefSNaoya Horiguchi 	if (non_swap_entry(swp) && is_hwpoison_entry(swp))
32464a705fefSNaoya Horiguchi 		return 1;
32474a705fefSNaoya Horiguchi 	else
32484a705fefSNaoya Horiguchi 		return 0;
32494a705fefSNaoya Horiguchi }
32501e8f889bSDavid Gibson 
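/*
 * Copy the hugetlb page table entries covering @vma from @src to @dst at
 * fork time.  Shared page tables are skipped, migration/hwpoison swap
 * entries are copied as-is (write migration entries are downgraded to read
 * for COW mappings), and present entries take an extra page reference and
 * rmap for the child.
 */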
325163551ae0SDavid Gibson int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
325263551ae0SDavid Gibson 			    struct vm_area_struct *vma)
325363551ae0SDavid Gibson {
32545e41540cSMike Kravetz 	pte_t *src_pte, *dst_pte, entry, dst_entry;
325563551ae0SDavid Gibson 	struct page *ptepage;
32561c59827dSHugh Dickins 	unsigned long addr;
32571e8f889bSDavid Gibson 	int cow;
3258a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
3259a5516438SAndi Kleen 	unsigned long sz = huge_page_size(h);
3260ac46d4f3SJérôme Glisse 	struct mmu_notifier_range range;
3261e8569dd2SAndreas Sandberg 	int ret = 0;
32621e8f889bSDavid Gibson 
32631e8f889bSDavid Gibson 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
326463551ae0SDavid Gibson 
3265ac46d4f3SJérôme Glisse 	if (cow) {
32667269f999SJérôme Glisse 		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
32676f4f13e8SJérôme Glisse 					vma->vm_start,
3268ac46d4f3SJérôme Glisse 					vma->vm_end);
3269ac46d4f3SJérôme Glisse 		mmu_notifier_invalidate_range_start(&range);
3270ac46d4f3SJérôme Glisse 	}
3271e8569dd2SAndreas Sandberg 
3272a5516438SAndi Kleen 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
3273cb900f41SKirill A. Shutemov 		spinlock_t *src_ptl, *dst_ptl;
32747868a208SPunit Agrawal 		src_pte = huge_pte_offset(src, addr, sz);
3275c74df32cSHugh Dickins 		if (!src_pte)
3276c74df32cSHugh Dickins 			continue;
3277a5516438SAndi Kleen 		dst_pte = huge_pte_alloc(dst, addr, sz);
3278e8569dd2SAndreas Sandberg 		if (!dst_pte) {
3279e8569dd2SAndreas Sandberg 			ret = -ENOMEM;
3280e8569dd2SAndreas Sandberg 			break;
3281e8569dd2SAndreas Sandberg 		}
3282c5c99429SLarry Woodman 
32835e41540cSMike Kravetz 		/*
32845e41540cSMike Kravetz 		 * If the pagetables are shared don't copy or take references.
32855e41540cSMike Kravetz 		 * dst_pte == src_pte is the common case of src/dest sharing.
32865e41540cSMike Kravetz 		 *
32875e41540cSMike Kravetz 		 * However, src could have 'unshared' and dst shares with
32885e41540cSMike Kravetz 		 * another vma.  If dst_pte !none, this implies sharing.
32895e41540cSMike Kravetz 		 * Check here before taking page table lock, and once again
32905e41540cSMike Kravetz 		 * after taking the lock below.
32915e41540cSMike Kravetz 		 */
32925e41540cSMike Kravetz 		dst_entry = huge_ptep_get(dst_pte);
32935e41540cSMike Kravetz 		if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
3294c5c99429SLarry Woodman 			continue;
3295c5c99429SLarry Woodman 
3296cb900f41SKirill A. Shutemov 		dst_ptl = huge_pte_lock(h, dst, dst_pte);
3297cb900f41SKirill A. Shutemov 		src_ptl = huge_pte_lockptr(h, src, src_pte);
3298cb900f41SKirill A. Shutemov 		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
32994a705fefSNaoya Horiguchi 		entry = huge_ptep_get(src_pte);
33005e41540cSMike Kravetz 		dst_entry = huge_ptep_get(dst_pte);
33015e41540cSMike Kravetz 		if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
33025e41540cSMike Kravetz 			/*
33035e41540cSMike Kravetz 			 * Skip if src entry none.  Also, skip in the
33045e41540cSMike Kravetz 			 * unlikely case dst entry !none as this implies
33055e41540cSMike Kravetz 			 * sharing with another vma.
33065e41540cSMike Kravetz 			 */
33074a705fefSNaoya Horiguchi 			;
33084a705fefSNaoya Horiguchi 		} else if (unlikely(is_hugetlb_entry_migration(entry) ||
33094a705fefSNaoya Horiguchi 				    is_hugetlb_entry_hwpoisoned(entry))) {
33104a705fefSNaoya Horiguchi 			swp_entry_t swp_entry = pte_to_swp_entry(entry);
33114a705fefSNaoya Horiguchi 
33124a705fefSNaoya Horiguchi 			if (is_write_migration_entry(swp_entry) && cow) {
33134a705fefSNaoya Horiguchi 				/*
33144a705fefSNaoya Horiguchi 				 * COW mappings require pages in both
33154a705fefSNaoya Horiguchi 				 * parent and child to be set to read.
33164a705fefSNaoya Horiguchi 				 */
33174a705fefSNaoya Horiguchi 				make_migration_entry_read(&swp_entry);
33184a705fefSNaoya Horiguchi 				entry = swp_entry_to_pte(swp_entry);
3319e5251fd4SPunit Agrawal 				set_huge_swap_pte_at(src, addr, src_pte,
3320e5251fd4SPunit Agrawal 						     entry, sz);
33214a705fefSNaoya Horiguchi 			}
3322e5251fd4SPunit Agrawal 			set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
33234a705fefSNaoya Horiguchi 		} else {
332434ee645eSJoerg Roedel 			if (cow) {
33250f10851eSJérôme Glisse 				/*
33260f10851eSJérôme Glisse 				 * No need to notify as we are downgrading page
33270f10851eSJérôme Glisse 				 * table protection not changing it to point
33280f10851eSJérôme Glisse 				 * to a new page.
33290f10851eSJérôme Glisse 				 *
3330ad56b738SMike Rapoport 				 * See Documentation/vm/mmu_notifier.rst
33310f10851eSJérôme Glisse 				 */
33327f2e9525SGerald Schaefer 				huge_ptep_set_wrprotect(src, addr, src_pte);
333334ee645eSJoerg Roedel 			}
33340253d634SNaoya Horiguchi 			entry = huge_ptep_get(src_pte);
333563551ae0SDavid Gibson 			ptepage = pte_page(entry);
333663551ae0SDavid Gibson 			get_page(ptepage);
333753f9263bSKirill A. Shutemov 			page_dup_rmap(ptepage, true);
333863551ae0SDavid Gibson 			set_huge_pte_at(dst, addr, dst_pte, entry);
33395d317b2bSNaoya Horiguchi 			hugetlb_count_add(pages_per_huge_page(h), dst);
33401c59827dSHugh Dickins 		}
3341cb900f41SKirill A. Shutemov 		spin_unlock(src_ptl);
3342cb900f41SKirill A. Shutemov 		spin_unlock(dst_ptl);
334363551ae0SDavid Gibson 	}
334463551ae0SDavid Gibson 
3345e8569dd2SAndreas Sandberg 	if (cow)
3346ac46d4f3SJérôme Glisse 		mmu_notifier_invalidate_range_end(&range);
3347e8569dd2SAndreas Sandberg 
3348e8569dd2SAndreas Sandberg 	return ret;
334963551ae0SDavid Gibson }
335063551ae0SDavid Gibson 
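/*
 * Tear down the hugetlb mappings of @vma in [start, end), gathering the
 * pages into @tlb for freeing.  If @ref_page is non-NULL, only that page is
 * unmapped and the walk stops as soon as it has been dealt with.
 */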
335124669e58SAneesh Kumar K.V void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
335224669e58SAneesh Kumar K.V 			    unsigned long start, unsigned long end,
335324669e58SAneesh Kumar K.V 			    struct page *ref_page)
335463551ae0SDavid Gibson {
335563551ae0SDavid Gibson 	struct mm_struct *mm = vma->vm_mm;
335663551ae0SDavid Gibson 	unsigned long address;
3357c7546f8fSDavid Gibson 	pte_t *ptep;
335863551ae0SDavid Gibson 	pte_t pte;
3359cb900f41SKirill A. Shutemov 	spinlock_t *ptl;
336063551ae0SDavid Gibson 	struct page *page;
3361a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
3362a5516438SAndi Kleen 	unsigned long sz = huge_page_size(h);
3363ac46d4f3SJérôme Glisse 	struct mmu_notifier_range range;
3364a5516438SAndi Kleen 
336563551ae0SDavid Gibson 	WARN_ON(!is_vm_hugetlb_page(vma));
3366a5516438SAndi Kleen 	BUG_ON(start & ~huge_page_mask(h));
3367a5516438SAndi Kleen 	BUG_ON(end & ~huge_page_mask(h));
336863551ae0SDavid Gibson 
336907e32661SAneesh Kumar K.V 	/*
337007e32661SAneesh Kumar K.V 	 * This is a hugetlb vma, all the pte entries should point
337107e32661SAneesh Kumar K.V 	 * to huge page.
337207e32661SAneesh Kumar K.V 	 */
3373ed6a7935SPeter Zijlstra 	tlb_change_page_size(tlb, sz);
337424669e58SAneesh Kumar K.V 	tlb_start_vma(tlb, vma);
3375dff11abeSMike Kravetz 
3376dff11abeSMike Kravetz 	/*
3377dff11abeSMike Kravetz 	 * If sharing possible, alert mmu notifiers of worst case.
3378dff11abeSMike Kravetz 	 */
33796f4f13e8SJérôme Glisse 	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
33806f4f13e8SJérôme Glisse 				end);
3381ac46d4f3SJérôme Glisse 	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
3382ac46d4f3SJérôme Glisse 	mmu_notifier_invalidate_range_start(&range);
3383569f48b8SHillf Danton 	address = start;
3384569f48b8SHillf Danton 	for (; address < end; address += sz) {
33857868a208SPunit Agrawal 		ptep = huge_pte_offset(mm, address, sz);
3386c7546f8fSDavid Gibson 		if (!ptep)
3387c7546f8fSDavid Gibson 			continue;
3388c7546f8fSDavid Gibson 
3389cb900f41SKirill A. Shutemov 		ptl = huge_pte_lock(h, mm, ptep);
339031d49da5SAneesh Kumar K.V 		if (huge_pmd_unshare(mm, &address, ptep)) {
339131d49da5SAneesh Kumar K.V 			spin_unlock(ptl);
3392dff11abeSMike Kravetz 			/*
3393dff11abeSMike Kravetz 			 * We just unmapped a page of PMDs by clearing a PUD.
3394dff11abeSMike Kravetz 			 * The caller's TLB flush range should cover this area.
3395dff11abeSMike Kravetz 			 */
339631d49da5SAneesh Kumar K.V 			continue;
339731d49da5SAneesh Kumar K.V 		}
339839dde65cSChen, Kenneth W 
33996629326bSHillf Danton 		pte = huge_ptep_get(ptep);
340031d49da5SAneesh Kumar K.V 		if (huge_pte_none(pte)) {
340131d49da5SAneesh Kumar K.V 			spin_unlock(ptl);
340231d49da5SAneesh Kumar K.V 			continue;
340331d49da5SAneesh Kumar K.V 		}
34046629326bSHillf Danton 
34056629326bSHillf Danton 		/*
34069fbc1f63SNaoya Horiguchi 		 * A migrating or HWPoisoned hugepage is already unmapped and
34079fbc1f63SNaoya Horiguchi 		 * its refcount is dropped, so just clear the pte here.
34086629326bSHillf Danton 		 */
34099fbc1f63SNaoya Horiguchi 		if (unlikely(!pte_present(pte))) {
34109386fac3SPunit Agrawal 			huge_pte_clear(mm, address, ptep, sz);
341131d49da5SAneesh Kumar K.V 			spin_unlock(ptl);
341231d49da5SAneesh Kumar K.V 			continue;
34138c4894c6SNaoya Horiguchi 		}
34146629326bSHillf Danton 
34156629326bSHillf Danton 		page = pte_page(pte);
341604f2cbe3SMel Gorman 		/*
341704f2cbe3SMel Gorman 		 * If a reference page is supplied, it is because a specific
341804f2cbe3SMel Gorman 		 * page is being unmapped, not a range. Ensure the page we
341904f2cbe3SMel Gorman 		 * are about to unmap is the actual page of interest.
342004f2cbe3SMel Gorman 		 */
342104f2cbe3SMel Gorman 		if (ref_page) {
342231d49da5SAneesh Kumar K.V 			if (page != ref_page) {
342331d49da5SAneesh Kumar K.V 				spin_unlock(ptl);
342431d49da5SAneesh Kumar K.V 				continue;
342531d49da5SAneesh Kumar K.V 			}
342604f2cbe3SMel Gorman 			/*
342704f2cbe3SMel Gorman 			 * Mark the VMA as having unmapped its page so that
342804f2cbe3SMel Gorman 			 * future faults in this VMA will fail rather than
342904f2cbe3SMel Gorman 			 * looking like data was lost
343004f2cbe3SMel Gorman 			 */
343104f2cbe3SMel Gorman 			set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
343204f2cbe3SMel Gorman 		}
343304f2cbe3SMel Gorman 
3434c7546f8fSDavid Gibson 		pte = huge_ptep_get_and_clear(mm, address, ptep);
3435b528e4b6SAneesh Kumar K.V 		tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
3436106c992aSGerald Schaefer 		if (huge_pte_dirty(pte))
34376649a386SKen Chen 			set_page_dirty(page);
34389e81130bSHillf Danton 
34395d317b2bSNaoya Horiguchi 		hugetlb_count_sub(pages_per_huge_page(h), mm);
3440d281ee61SKirill A. Shutemov 		page_remove_rmap(page, true);
344131d49da5SAneesh Kumar K.V 
3442cb900f41SKirill A. Shutemov 		spin_unlock(ptl);
3443e77b0852SAneesh Kumar K.V 		tlb_remove_page_size(tlb, page, huge_page_size(h));
344424669e58SAneesh Kumar K.V 		/*
344531d49da5SAneesh Kumar K.V 		 * Bail out after unmapping reference page if supplied
344624669e58SAneesh Kumar K.V 		 */
344731d49da5SAneesh Kumar K.V 		if (ref_page)
344831d49da5SAneesh Kumar K.V 			break;
3449fe1668aeSChen, Kenneth W 	}
3450ac46d4f3SJérôme Glisse 	mmu_notifier_invalidate_range_end(&range);
345124669e58SAneesh Kumar K.V 	tlb_end_vma(tlb, vma);
34521da177e4SLinus Torvalds }
345363551ae0SDavid Gibson 
3454d833352aSMel Gorman void __unmap_hugepage_range_final(struct mmu_gather *tlb,
3455d833352aSMel Gorman 			  struct vm_area_struct *vma, unsigned long start,
3456d833352aSMel Gorman 			  unsigned long end, struct page *ref_page)
3457d833352aSMel Gorman {
3458d833352aSMel Gorman 	__unmap_hugepage_range(tlb, vma, start, end, ref_page);
3459d833352aSMel Gorman 
3460d833352aSMel Gorman 	/*
3461d833352aSMel Gorman 	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
3462d833352aSMel Gorman 	 * test will fail on a vma being torn down, and not grab a page table
3463d833352aSMel Gorman 	 * on its way out.  We're lucky that the flag has such an appropriate
3464d833352aSMel Gorman 	 * name, and can in fact be safely cleared here. We could clear it
3465d833352aSMel Gorman 	 * before the __unmap_hugepage_range above, but all that's necessary
3466c8c06efaSDavidlohr Bueso 	 * is to clear it before releasing the i_mmap_rwsem. This works
3467d833352aSMel Gorman 	 * because in the context this is called, the VMA is about to be
3468c8c06efaSDavidlohr Bueso 	 * destroyed and the i_mmap_rwsem is held.
3469d833352aSMel Gorman 	 */
3470d833352aSMel Gorman 	vma->vm_flags &= ~VM_MAYSHARE;
3471d833352aSMel Gorman }
3472d833352aSMel Gorman 
3473502717f4SChen, Kenneth W void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
347404f2cbe3SMel Gorman 			  unsigned long end, struct page *ref_page)
3475502717f4SChen, Kenneth W {
347624669e58SAneesh Kumar K.V 	struct mm_struct *mm;
347724669e58SAneesh Kumar K.V 	struct mmu_gather tlb;
3478dff11abeSMike Kravetz 	unsigned long tlb_start = start;
3479dff11abeSMike Kravetz 	unsigned long tlb_end = end;
3480dff11abeSMike Kravetz 
3481dff11abeSMike Kravetz 	/*
3482dff11abeSMike Kravetz 	 * If shared PMDs were possibly used within this vma range, adjust
3483dff11abeSMike Kravetz 	 * start/end for worst case tlb flushing.
3484dff11abeSMike Kravetz 	 * Note that we cannot be sure if PMDs are shared until we try to
3485dff11abeSMike Kravetz 	 * unmap pages.  However, we want to make sure TLB flushing covers
3486dff11abeSMike Kravetz 	 * the largest possible range.
3487dff11abeSMike Kravetz 	 */
3488dff11abeSMike Kravetz 	adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end);
348924669e58SAneesh Kumar K.V 
349024669e58SAneesh Kumar K.V 	mm = vma->vm_mm;
349124669e58SAneesh Kumar K.V 
3492dff11abeSMike Kravetz 	tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end);
349324669e58SAneesh Kumar K.V 	__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
3494dff11abeSMike Kravetz 	tlb_finish_mmu(&tlb, tlb_start, tlb_end);
3495502717f4SChen, Kenneth W }
3496502717f4SChen, Kenneth W 
349704f2cbe3SMel Gorman /*
349804f2cbe3SMel Gorman  * This is called when the original mapper is failing to COW a MAP_PRIVATE
349904f2cbe3SMel Gorman  * mapping it owns the reserve page for. The intention is to unmap the page
350004f2cbe3SMel Gorman  * from other VMAs and let the children be SIGKILLed if they are faulting the
350104f2cbe3SMel Gorman  * same region.
350204f2cbe3SMel Gorman  */
35032f4612afSDavidlohr Bueso static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
35042a4b3dedSHarvey Harrison 			      struct page *page, unsigned long address)
350504f2cbe3SMel Gorman {
35067526674dSAdam Litke 	struct hstate *h = hstate_vma(vma);
350704f2cbe3SMel Gorman 	struct vm_area_struct *iter_vma;
350804f2cbe3SMel Gorman 	struct address_space *mapping;
350904f2cbe3SMel Gorman 	pgoff_t pgoff;
351004f2cbe3SMel Gorman 
351104f2cbe3SMel Gorman 	/*
351204f2cbe3SMel Gorman 	 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
351304f2cbe3SMel Gorman 	 * from page cache lookup which is in HPAGE_SIZE units.
351404f2cbe3SMel Gorman 	 */
35157526674dSAdam Litke 	address = address & huge_page_mask(h);
351636e4f20aSMichal Hocko 	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
351736e4f20aSMichal Hocko 			vma->vm_pgoff;
351893c76a3dSAl Viro 	mapping = vma->vm_file->f_mapping;
351904f2cbe3SMel Gorman 
35204eb2b1dcSMel Gorman 	/*
35214eb2b1dcSMel Gorman 	 * Take the mapping lock for the duration of the table walk. As
35224eb2b1dcSMel Gorman 	 * this mapping should be shared between all the VMAs,
35234eb2b1dcSMel Gorman 	 * unmap_hugepage_range() is called on each VMA below with the lock held.
35244eb2b1dcSMel Gorman 	 */
352583cde9e8SDavidlohr Bueso 	i_mmap_lock_write(mapping);
35266b2dbba8SMichel Lespinasse 	vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
352704f2cbe3SMel Gorman 		/* Do not unmap the current VMA */
352804f2cbe3SMel Gorman 		if (iter_vma == vma)
352904f2cbe3SMel Gorman 			continue;
353004f2cbe3SMel Gorman 
353104f2cbe3SMel Gorman 		/*
35322f84a899SMel Gorman 		 * Shared VMAs have their own reserves and do not affect
35332f84a899SMel Gorman 		 * MAP_PRIVATE accounting but it is possible that a shared
35342f84a899SMel Gorman 		 * VMA is using the same page so check and skip such VMAs.
35352f84a899SMel Gorman 		 */
35362f84a899SMel Gorman 		if (iter_vma->vm_flags & VM_MAYSHARE)
35372f84a899SMel Gorman 			continue;
35382f84a899SMel Gorman 
35392f84a899SMel Gorman 		/*
354004f2cbe3SMel Gorman 		 * Unmap the page from other VMAs without their own reserves.
354104f2cbe3SMel Gorman 		 * They get marked to be SIGKILLed if they fault in these
354204f2cbe3SMel Gorman 		 * areas. This is because a future no-page fault on this VMA
354304f2cbe3SMel Gorman 		 * could insert a zeroed page instead of the data existing
354404f2cbe3SMel Gorman 		 * from the time of fork. This would look like data corruption
354504f2cbe3SMel Gorman 		 */
354604f2cbe3SMel Gorman 		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
354724669e58SAneesh Kumar K.V 			unmap_hugepage_range(iter_vma, address,
354824669e58SAneesh Kumar K.V 					     address + huge_page_size(h), page);
354904f2cbe3SMel Gorman 	}
355083cde9e8SDavidlohr Bueso 	i_mmap_unlock_write(mapping);
355104f2cbe3SMel Gorman }
355204f2cbe3SMel Gorman 
35530fe6e20bSNaoya Horiguchi /*
35540fe6e20bSNaoya Horiguchi  * hugetlb_cow() should be called with the page lock of the original hugepage held.
3555ef009b25SMichal Hocko  * Called with hugetlb_instantiation_mutex held and pte_page locked so we
3556ef009b25SMichal Hocko  * cannot race with other handlers or page migration.
3557ef009b25SMichal Hocko  * Keep the pte_same checks anyway to make transition from the mutex easier.
35580fe6e20bSNaoya Horiguchi  */
35592b740303SSouptick Joarder static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
3560974e6d66SHuang Ying 		       unsigned long address, pte_t *ptep,
3561cb900f41SKirill A. Shutemov 		       struct page *pagecache_page, spinlock_t *ptl)
35621e8f889bSDavid Gibson {
35633999f52eSAneesh Kumar K.V 	pte_t pte;
3564a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
35651e8f889bSDavid Gibson 	struct page *old_page, *new_page;
35662b740303SSouptick Joarder 	int outside_reserve = 0;
35672b740303SSouptick Joarder 	vm_fault_t ret = 0;
3568974e6d66SHuang Ying 	unsigned long haddr = address & huge_page_mask(h);
3569ac46d4f3SJérôme Glisse 	struct mmu_notifier_range range;
35701e8f889bSDavid Gibson 
35713999f52eSAneesh Kumar K.V 	pte = huge_ptep_get(ptep);
35721e8f889bSDavid Gibson 	old_page = pte_page(pte);
35731e8f889bSDavid Gibson 
357404f2cbe3SMel Gorman retry_avoidcopy:
35751e8f889bSDavid Gibson 	/* If no-one else is actually using this page, avoid the copy
35761e8f889bSDavid Gibson 	 * and just make the page writable */
357737a2140dSJoonsoo Kim 	if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
35785a49973dSHugh Dickins 		page_move_anon_rmap(old_page, vma);
35795b7a1d40SHuang Ying 		set_huge_ptep_writable(vma, haddr, ptep);
358083c54070SNick Piggin 		return 0;
35811e8f889bSDavid Gibson 	}
35821e8f889bSDavid Gibson 
358304f2cbe3SMel Gorman 	/*
358404f2cbe3SMel Gorman 	 * If the process that created a MAP_PRIVATE mapping is about to
358504f2cbe3SMel Gorman 	 * perform a COW due to a shared page count, attempt to satisfy
358604f2cbe3SMel Gorman 	 * the allocation without using the existing reserves. The pagecache
358704f2cbe3SMel Gorman 	 * page is used to determine if the reserve at this address was
358804f2cbe3SMel Gorman 	 * consumed or not. If reserves were used, a partial faulted mapping
358904f2cbe3SMel Gorman 	 * at the time of fork() could consume its reserves on COW instead
359004f2cbe3SMel Gorman 	 * of the full address range.
359104f2cbe3SMel Gorman 	 */
35925944d011SJoonsoo Kim 	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
359304f2cbe3SMel Gorman 			old_page != pagecache_page)
359404f2cbe3SMel Gorman 		outside_reserve = 1;
359504f2cbe3SMel Gorman 
359609cbfeafSKirill A. Shutemov 	get_page(old_page);
3597b76c8cfbSLarry Woodman 
3598ad4404a2SDavidlohr Bueso 	/*
3599ad4404a2SDavidlohr Bueso 	 * Drop page table lock as buddy allocator may be called. It will
3600ad4404a2SDavidlohr Bueso 	 * be acquired again before returning to the caller, as expected.
3601ad4404a2SDavidlohr Bueso 	 */
3602cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
36035b7a1d40SHuang Ying 	new_page = alloc_huge_page(vma, haddr, outside_reserve);
36041e8f889bSDavid Gibson 
36052fc39cecSAdam Litke 	if (IS_ERR(new_page)) {
360604f2cbe3SMel Gorman 		/*
360704f2cbe3SMel Gorman 		 * If a process owning a MAP_PRIVATE mapping fails to COW,
360804f2cbe3SMel Gorman 		 * it is due to references held by a child and an insufficient
360904f2cbe3SMel Gorman 		 * huge page pool. To guarantee the original mapper's
361004f2cbe3SMel Gorman 		 * reliability, unmap the page from child processes. The child
361104f2cbe3SMel Gorman 		 * may get SIGKILLed if it later faults.
361204f2cbe3SMel Gorman 		 */
361304f2cbe3SMel Gorman 		if (outside_reserve) {
361409cbfeafSKirill A. Shutemov 			put_page(old_page);
361504f2cbe3SMel Gorman 			BUG_ON(huge_pte_none(pte));
36165b7a1d40SHuang Ying 			unmap_ref_private(mm, vma, old_page, haddr);
361704f2cbe3SMel Gorman 			BUG_ON(huge_pte_none(pte));
3618cb900f41SKirill A. Shutemov 			spin_lock(ptl);
36195b7a1d40SHuang Ying 			ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
3620a9af0c5dSNaoya Horiguchi 			if (likely(ptep &&
3621a9af0c5dSNaoya Horiguchi 				   pte_same(huge_ptep_get(ptep), pte)))
362204f2cbe3SMel Gorman 				goto retry_avoidcopy;
3623a734bcc8SHillf Danton 			/*
3624cb900f41SKirill A. Shutemov 			 * A race occurred while re-acquiring the page
3625cb900f41SKirill A. Shutemov 			 * table lock, and our job is done.
3626a734bcc8SHillf Danton 			 */
3627a734bcc8SHillf Danton 			return 0;
362804f2cbe3SMel Gorman 		}
362904f2cbe3SMel Gorman 
36302b740303SSouptick Joarder 		ret = vmf_error(PTR_ERR(new_page));
3631ad4404a2SDavidlohr Bueso 		goto out_release_old;
36321e8f889bSDavid Gibson 	}
36331e8f889bSDavid Gibson 
36340fe6e20bSNaoya Horiguchi 	/*
36350fe6e20bSNaoya Horiguchi 	 * When the original hugepage is shared one, it does not have
36360fe6e20bSNaoya Horiguchi 	 * anon_vma prepared.
36370fe6e20bSNaoya Horiguchi 	 */
363844e2aa93SDean Nelson 	if (unlikely(anon_vma_prepare(vma))) {
3639ad4404a2SDavidlohr Bueso 		ret = VM_FAULT_OOM;
3640ad4404a2SDavidlohr Bueso 		goto out_release_all;
364144e2aa93SDean Nelson 	}
36420fe6e20bSNaoya Horiguchi 
3643974e6d66SHuang Ying 	copy_user_huge_page(new_page, old_page, address, vma,
364447ad8475SAndrea Arcangeli 			    pages_per_huge_page(h));
36450ed361deSNick Piggin 	__SetPageUptodate(new_page);
36461e8f889bSDavid Gibson 
36477269f999SJérôme Glisse 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
36486f4f13e8SJérôme Glisse 				haddr + huge_page_size(h));
3649ac46d4f3SJérôme Glisse 	mmu_notifier_invalidate_range_start(&range);
3650ad4404a2SDavidlohr Bueso 
3651b76c8cfbSLarry Woodman 	/*
3652cb900f41SKirill A. Shutemov 	 * Retake the page table lock to check for racing updates
3653b76c8cfbSLarry Woodman 	 * before the page tables are altered
3654b76c8cfbSLarry Woodman 	 */
3655cb900f41SKirill A. Shutemov 	spin_lock(ptl);
36565b7a1d40SHuang Ying 	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
3657a9af0c5dSNaoya Horiguchi 	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
365807443a85SJoonsoo Kim 		ClearPagePrivate(new_page);
365907443a85SJoonsoo Kim 
36601e8f889bSDavid Gibson 		/* Break COW */
36615b7a1d40SHuang Ying 		huge_ptep_clear_flush(vma, haddr, ptep);
3662ac46d4f3SJérôme Glisse 		mmu_notifier_invalidate_range(mm, range.start, range.end);
36635b7a1d40SHuang Ying 		set_huge_pte_at(mm, haddr, ptep,
36641e8f889bSDavid Gibson 				make_huge_pte(vma, new_page, 1));
3665d281ee61SKirill A. Shutemov 		page_remove_rmap(old_page, true);
36665b7a1d40SHuang Ying 		hugepage_add_new_anon_rmap(new_page, vma, haddr);
3667cb6acd01SMike Kravetz 		set_page_huge_active(new_page);
36681e8f889bSDavid Gibson 		/* Make the old page be freed below */
36691e8f889bSDavid Gibson 		new_page = old_page;
36701e8f889bSDavid Gibson 	}
3671cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
3672ac46d4f3SJérôme Glisse 	mmu_notifier_invalidate_range_end(&range);
3673ad4404a2SDavidlohr Bueso out_release_all:
36745b7a1d40SHuang Ying 	restore_reserve_on_error(h, vma, haddr, new_page);
367509cbfeafSKirill A. Shutemov 	put_page(new_page);
3676ad4404a2SDavidlohr Bueso out_release_old:
367709cbfeafSKirill A. Shutemov 	put_page(old_page);
36788312034fSJoonsoo Kim 
3679ad4404a2SDavidlohr Bueso 	spin_lock(ptl); /* Caller expects lock to be held */
3680ad4404a2SDavidlohr Bueso 	return ret;
36811e8f889bSDavid Gibson }
36821e8f889bSDavid Gibson 
368304f2cbe3SMel Gorman /* Return the pagecache page at a given address within a VMA */
3684a5516438SAndi Kleen static struct page *hugetlbfs_pagecache_page(struct hstate *h,
3685a5516438SAndi Kleen 			struct vm_area_struct *vma, unsigned long address)
368604f2cbe3SMel Gorman {
368704f2cbe3SMel Gorman 	struct address_space *mapping;
3688e7c4b0bfSAndy Whitcroft 	pgoff_t idx;
368904f2cbe3SMel Gorman 
369004f2cbe3SMel Gorman 	mapping = vma->vm_file->f_mapping;
3691a5516438SAndi Kleen 	idx = vma_hugecache_offset(h, vma, address);
369204f2cbe3SMel Gorman 
369304f2cbe3SMel Gorman 	return find_lock_page(mapping, idx);
369404f2cbe3SMel Gorman }
369504f2cbe3SMel Gorman 
36963ae77f43SHugh Dickins /*
36973ae77f43SHugh Dickins  * Return whether there is a pagecache page to back given address within VMA.
36983ae77f43SHugh Dickins  * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
36993ae77f43SHugh Dickins  */
37003ae77f43SHugh Dickins static bool hugetlbfs_pagecache_present(struct hstate *h,
37012a15efc9SHugh Dickins 			struct vm_area_struct *vma, unsigned long address)
37022a15efc9SHugh Dickins {
37032a15efc9SHugh Dickins 	struct address_space *mapping;
37042a15efc9SHugh Dickins 	pgoff_t idx;
37052a15efc9SHugh Dickins 	struct page *page;
37062a15efc9SHugh Dickins 
37072a15efc9SHugh Dickins 	mapping = vma->vm_file->f_mapping;
37082a15efc9SHugh Dickins 	idx = vma_hugecache_offset(h, vma, address);
37092a15efc9SHugh Dickins 
37102a15efc9SHugh Dickins 	page = find_get_page(mapping, idx);
37112a15efc9SHugh Dickins 	if (page)
37122a15efc9SHugh Dickins 		put_page(page);
37132a15efc9SHugh Dickins 	return page != NULL;
37142a15efc9SHugh Dickins }
37152a15efc9SHugh Dickins 
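/*
 * Insert @page into the hugetlbfs inode's page cache at index @idx, mark it
 * dirty so generic code will not drop it, and charge the inode's block count.
 */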
3716ab76ad54SMike Kravetz int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
3717ab76ad54SMike Kravetz 			   pgoff_t idx)
3718ab76ad54SMike Kravetz {
3719ab76ad54SMike Kravetz 	struct inode *inode = mapping->host;
3720ab76ad54SMike Kravetz 	struct hstate *h = hstate_inode(inode);
3721ab76ad54SMike Kravetz 	int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
3722ab76ad54SMike Kravetz 
3723ab76ad54SMike Kravetz 	if (err)
3724ab76ad54SMike Kravetz 		return err;
3725ab76ad54SMike Kravetz 	ClearPagePrivate(page);
3726ab76ad54SMike Kravetz 
372722146c3cSMike Kravetz 	/*
372822146c3cSMike Kravetz 	 * set page dirty so that it will not be removed from cache/file
372922146c3cSMike Kravetz 	 * by non-hugetlbfs specific code paths.
373022146c3cSMike Kravetz 	 */
373122146c3cSMike Kravetz 	set_page_dirty(page);
373222146c3cSMike Kravetz 
3733ab76ad54SMike Kravetz 	spin_lock(&inode->i_lock);
3734ab76ad54SMike Kravetz 	inode->i_blocks += blocks_per_huge_page(h);
3735ab76ad54SMike Kravetz 	spin_unlock(&inode->i_lock);
3736ab76ad54SMike Kravetz 	return 0;
3737ab76ad54SMike Kravetz }
3738ab76ad54SMike Kravetz 
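/*
 * Handle a hugetlb fault for which no pte is present: look the page up in
 * the page cache or allocate a fresh one, hand missing pages in
 * userfaultfd-registered ranges over to userspace, and install the new pte,
 * doing an early COW for private write faults.
 */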
37392b740303SSouptick Joarder static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
37402b740303SSouptick Joarder 			struct vm_area_struct *vma,
37418382d914SDavidlohr Bueso 			struct address_space *mapping, pgoff_t idx,
3742788c7df4SHugh Dickins 			unsigned long address, pte_t *ptep, unsigned int flags)
3743ac9b9c66SHugh Dickins {
3744a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
37452b740303SSouptick Joarder 	vm_fault_t ret = VM_FAULT_SIGBUS;
3746409eb8c2SHillf Danton 	int anon_rmap = 0;
37474c887265SAdam Litke 	unsigned long size;
37484c887265SAdam Litke 	struct page *page;
37491e8f889bSDavid Gibson 	pte_t new_pte;
3750cb900f41SKirill A. Shutemov 	spinlock_t *ptl;
3751285b8dcaSHuang Ying 	unsigned long haddr = address & huge_page_mask(h);
3752cb6acd01SMike Kravetz 	bool new_page = false;
37534c887265SAdam Litke 
375404f2cbe3SMel Gorman 	/*
375504f2cbe3SMel Gorman 	 * Currently, we are forced to kill the process in the event the
375604f2cbe3SMel Gorman 	 * original mapper has unmapped pages from the child due to a failed
375725985edcSLucas De Marchi 	 * COW. Warn that such a situation has occurred as it may not be obvious
375804f2cbe3SMel Gorman 	 */
375904f2cbe3SMel Gorman 	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
3760910154d5SGeoffrey Thomas 		pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
376104f2cbe3SMel Gorman 			   current->pid);
376204f2cbe3SMel Gorman 		return ret;
376304f2cbe3SMel Gorman 	}
376404f2cbe3SMel Gorman 
37654c887265SAdam Litke 	/*
3766e7c58097SMike Kravetz 	 * Use page lock to guard against racing truncation
3767e7c58097SMike Kravetz 	 * before we get page_table_lock.
37684c887265SAdam Litke 	 */
3769e7c58097SMike Kravetz retry:
3770e7c58097SMike Kravetz 	page = find_lock_page(mapping, idx);
3771e7c58097SMike Kravetz 	if (!page) {
3772a5516438SAndi Kleen 		size = i_size_read(mapping->host) >> huge_page_shift(h);
3773ebed4bfcSHugh Dickins 		if (idx >= size)
3774ebed4bfcSHugh Dickins 			goto out;
37751a1aad8aSMike Kravetz 
37761a1aad8aSMike Kravetz 		/*
37771a1aad8aSMike Kravetz 		 * Check for page in userfault range
37781a1aad8aSMike Kravetz 		 */
37791a1aad8aSMike Kravetz 		if (userfaultfd_missing(vma)) {
37801a1aad8aSMike Kravetz 			u32 hash;
37811a1aad8aSMike Kravetz 			struct vm_fault vmf = {
37821a1aad8aSMike Kravetz 				.vma = vma,
3783285b8dcaSHuang Ying 				.address = haddr,
37841a1aad8aSMike Kravetz 				.flags = flags,
37851a1aad8aSMike Kravetz 				/*
37861a1aad8aSMike Kravetz 				 * Hard to debug if it ends up being
37871a1aad8aSMike Kravetz 				 * used by a callee that assumes
37881a1aad8aSMike Kravetz 				 * something about the other
37891a1aad8aSMike Kravetz 				 * uninitialized fields... same as in
37901a1aad8aSMike Kravetz 				 * memory.c
37911a1aad8aSMike Kravetz 				 */
37921a1aad8aSMike Kravetz 			};
37931a1aad8aSMike Kravetz 
37941a1aad8aSMike Kravetz 			/*
3795ddeaab32SMike Kravetz 			 * hugetlb_fault_mutex must be dropped before
3796ddeaab32SMike Kravetz 			 * handling userfault.  Reacquire after handling
3797ddeaab32SMike Kravetz 			 * fault to make calling code simpler.
37981a1aad8aSMike Kravetz 			 */
379955254636SMike Kravetz 			hash = hugetlb_fault_mutex_hash(h, mapping, idx);
38001a1aad8aSMike Kravetz 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
38011a1aad8aSMike Kravetz 			ret = handle_userfault(&vmf, VM_UFFD_MISSING);
38021a1aad8aSMike Kravetz 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
38031a1aad8aSMike Kravetz 			goto out;
38041a1aad8aSMike Kravetz 		}
38051a1aad8aSMike Kravetz 
3806285b8dcaSHuang Ying 		page = alloc_huge_page(vma, haddr, 0);
38072fc39cecSAdam Litke 		if (IS_ERR(page)) {
38084643d67eSMike Kravetz 			/*
38094643d67eSMike Kravetz 			 * Returning error will result in faulting task being
38104643d67eSMike Kravetz 			 * sent SIGBUS.  The hugetlb fault mutex prevents two
38114643d67eSMike Kravetz 			 * tasks from racing to fault in the same page which
38124643d67eSMike Kravetz 			 * could result in false unable to allocate errors.
38134643d67eSMike Kravetz 			 * Page migration does not take the fault mutex, but
38144643d67eSMike Kravetz 			 * does a clear then write of pte's under page table
38154643d67eSMike Kravetz 			 * lock.  Page fault code could race with migration,
38164643d67eSMike Kravetz 			 * notice the clear pte and try to allocate a page
38174643d67eSMike Kravetz 			 * here.  Before returning error, get ptl and make
38184643d67eSMike Kravetz 			 * sure there really is no pte entry.
38194643d67eSMike Kravetz 			 */
38204643d67eSMike Kravetz 			ptl = huge_pte_lock(h, mm, ptep);
38214643d67eSMike Kravetz 			if (!huge_pte_none(huge_ptep_get(ptep))) {
38224643d67eSMike Kravetz 				ret = 0;
38234643d67eSMike Kravetz 				spin_unlock(ptl);
38244643d67eSMike Kravetz 				goto out;
38254643d67eSMike Kravetz 			}
38264643d67eSMike Kravetz 			spin_unlock(ptl);
38272b740303SSouptick Joarder 			ret = vmf_error(PTR_ERR(page));
38286bda666aSChristoph Lameter 			goto out;
38296bda666aSChristoph Lameter 		}
383047ad8475SAndrea Arcangeli 		clear_huge_page(page, address, pages_per_huge_page(h));
38310ed361deSNick Piggin 		__SetPageUptodate(page);
3832cb6acd01SMike Kravetz 		new_page = true;
3833ac9b9c66SHugh Dickins 
3834f83a275dSMel Gorman 		if (vma->vm_flags & VM_MAYSHARE) {
3835ab76ad54SMike Kravetz 			int err = huge_add_to_page_cache(page, mapping, idx);
38366bda666aSChristoph Lameter 			if (err) {
38376bda666aSChristoph Lameter 				put_page(page);
38386bda666aSChristoph Lameter 				if (err == -EEXIST)
38396bda666aSChristoph Lameter 					goto retry;
38406bda666aSChristoph Lameter 				goto out;
38416bda666aSChristoph Lameter 			}
384223be7468SMel Gorman 		} else {
38436bda666aSChristoph Lameter 			lock_page(page);
38440fe6e20bSNaoya Horiguchi 			if (unlikely(anon_vma_prepare(vma))) {
38450fe6e20bSNaoya Horiguchi 				ret = VM_FAULT_OOM;
38460fe6e20bSNaoya Horiguchi 				goto backout_unlocked;
384723be7468SMel Gorman 			}
3848409eb8c2SHillf Danton 			anon_rmap = 1;
38490fe6e20bSNaoya Horiguchi 		}
38500fe6e20bSNaoya Horiguchi 	} else {
385157303d80SAndy Whitcroft 		/*
3852998b4382SNaoya Horiguchi 		 * If memory error occurs between mmap() and fault, some process
3853998b4382SNaoya Horiguchi 		 * don't have hwpoisoned swap entry for errored virtual address.
3854998b4382SNaoya Horiguchi 		 * So we need to block hugepage fault by PG_hwpoison bit check.
3855fd6a03edSNaoya Horiguchi 		 */
3856fd6a03edSNaoya Horiguchi 		if (unlikely(PageHWPoison(page))) {
3857aa50d3a7SAndi Kleen 			ret = VM_FAULT_HWPOISON |
3858972dc4deSAneesh Kumar K.V 				VM_FAULT_SET_HINDEX(hstate_index(h));
3859fd6a03edSNaoya Horiguchi 			goto backout_unlocked;
38606bda666aSChristoph Lameter 		}
3861998b4382SNaoya Horiguchi 	}
38621e8f889bSDavid Gibson 
386357303d80SAndy Whitcroft 	/*
386457303d80SAndy Whitcroft 	 * If we are going to COW a private mapping later, we examine the
386557303d80SAndy Whitcroft 	 * pending reservations for this page now. This will ensure that
386657303d80SAndy Whitcroft 	 * any allocations necessary to record that reservation occur outside
386757303d80SAndy Whitcroft 	 * the spinlock.
386857303d80SAndy Whitcroft 	 */
38695e911373SMike Kravetz 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3870285b8dcaSHuang Ying 		if (vma_needs_reservation(h, vma, haddr) < 0) {
38712b26736cSAndy Whitcroft 			ret = VM_FAULT_OOM;
38722b26736cSAndy Whitcroft 			goto backout_unlocked;
38732b26736cSAndy Whitcroft 		}
38745e911373SMike Kravetz 		/* Just decrements count, does not deallocate */
3875285b8dcaSHuang Ying 		vma_end_reservation(h, vma, haddr);
38765e911373SMike Kravetz 	}
387757303d80SAndy Whitcroft 
38788bea8052SAneesh Kumar K.V 	ptl = huge_pte_lock(h, mm, ptep);
3879e7c58097SMike Kravetz 	size = i_size_read(mapping->host) >> huge_page_shift(h);
3880e7c58097SMike Kravetz 	if (idx >= size)
3881e7c58097SMike Kravetz 		goto backout;
38824c887265SAdam Litke 
388383c54070SNick Piggin 	ret = 0;
38847f2e9525SGerald Schaefer 	if (!huge_pte_none(huge_ptep_get(ptep)))
38854c887265SAdam Litke 		goto backout;
38864c887265SAdam Litke 
388707443a85SJoonsoo Kim 	if (anon_rmap) {
388807443a85SJoonsoo Kim 		ClearPagePrivate(page);
3889285b8dcaSHuang Ying 		hugepage_add_new_anon_rmap(page, vma, haddr);
3890ac714904SChoi Gi-yong 	} else
389153f9263bSKirill A. Shutemov 		page_dup_rmap(page, true);
38921e8f889bSDavid Gibson 	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
38931e8f889bSDavid Gibson 				&& (vma->vm_flags & VM_SHARED)));
3894285b8dcaSHuang Ying 	set_huge_pte_at(mm, haddr, ptep, new_pte);
38951e8f889bSDavid Gibson 
38965d317b2bSNaoya Horiguchi 	hugetlb_count_add(pages_per_huge_page(h), mm);
3897788c7df4SHugh Dickins 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
38981e8f889bSDavid Gibson 		/* Optimization, do the COW without a second fault */
3899974e6d66SHuang Ying 		ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
39001e8f889bSDavid Gibson 	}
39011e8f889bSDavid Gibson 
3902cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
3903cb6acd01SMike Kravetz 
3904cb6acd01SMike Kravetz 	/*
3905cb6acd01SMike Kravetz 	 * Only make newly allocated pages active.  Existing pages found
3906cb6acd01SMike Kravetz 	 * in the pagecache could be !page_huge_active() if they have been
3907cb6acd01SMike Kravetz 	 * isolated for migration.
3908cb6acd01SMike Kravetz 	 */
3909cb6acd01SMike Kravetz 	if (new_page)
3910cb6acd01SMike Kravetz 		set_page_huge_active(page);
3911cb6acd01SMike Kravetz 
39124c887265SAdam Litke 	unlock_page(page);
39134c887265SAdam Litke out:
3914ac9b9c66SHugh Dickins 	return ret;
39154c887265SAdam Litke 
39164c887265SAdam Litke backout:
3917cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
39182b26736cSAndy Whitcroft backout_unlocked:
39194c887265SAdam Litke 	unlock_page(page);
3920285b8dcaSHuang Ying 	restore_reserve_on_error(h, vma, haddr, page);
39214c887265SAdam Litke 	put_page(page);
39224c887265SAdam Litke 	goto out;
3923ac9b9c66SHugh Dickins }
3924ac9b9c66SHugh Dickins 
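/*
 * Callers pair this hash with the fault mutex table roughly as follows (see
 * hugetlb_no_page() above and hugetlb_fault() below):
 *
 *	hash = hugetlb_fault_mutex_hash(h, mapping, idx);
 *	mutex_lock(&hugetlb_fault_mutex_table[hash]);
 *	... handle the fault ...
 *	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 */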
39258382d914SDavidlohr Bueso #ifdef CONFIG_SMP
39261b426bacSMike Kravetz u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
392755254636SMike Kravetz 			    pgoff_t idx)
39288382d914SDavidlohr Bueso {
39298382d914SDavidlohr Bueso 	unsigned long key[2];
39308382d914SDavidlohr Bueso 	u32 hash;
39318382d914SDavidlohr Bueso 
39328382d914SDavidlohr Bueso 	key[0] = (unsigned long) mapping;
39338382d914SDavidlohr Bueso 	key[1] = idx;
39348382d914SDavidlohr Bueso 
393555254636SMike Kravetz 	hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
39368382d914SDavidlohr Bueso 
39378382d914SDavidlohr Bueso 	return hash & (num_fault_mutexes - 1);
39388382d914SDavidlohr Bueso }
39398382d914SDavidlohr Bueso #else
39408382d914SDavidlohr Bueso /*
39418382d914SDavidlohr Bueso  * For uniprocessor systems we always use a single mutex, so just
39428382d914SDavidlohr Bueso  * return 0 and avoid the hashing overhead.
39438382d914SDavidlohr Bueso  */
39441b426bacSMike Kravetz u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
394555254636SMike Kravetz 			    pgoff_t idx)
39468382d914SDavidlohr Bueso {
39478382d914SDavidlohr Bueso 	return 0;
39488382d914SDavidlohr Bueso }
39498382d914SDavidlohr Bueso #endif
39508382d914SDavidlohr Bueso 
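/*
 * Top-level hugetlb page fault handler: look up (or allocate) the huge pte,
 * take the per-page fault mutex, then either fill a missing entry via
 * hugetlb_no_page() or handle a write to a read-only entry via
 * hugetlb_cow().
 */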
39512b740303SSouptick Joarder vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3952788c7df4SHugh Dickins 			unsigned long address, unsigned int flags)
395386e5216fSAdam Litke {
39548382d914SDavidlohr Bueso 	pte_t *ptep, entry;
3955cb900f41SKirill A. Shutemov 	spinlock_t *ptl;
39562b740303SSouptick Joarder 	vm_fault_t ret;
39578382d914SDavidlohr Bueso 	u32 hash;
39588382d914SDavidlohr Bueso 	pgoff_t idx;
39590fe6e20bSNaoya Horiguchi 	struct page *page = NULL;
396057303d80SAndy Whitcroft 	struct page *pagecache_page = NULL;
3961a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
39628382d914SDavidlohr Bueso 	struct address_space *mapping;
39630f792cf9SNaoya Horiguchi 	int need_wait_lock = 0;
3964285b8dcaSHuang Ying 	unsigned long haddr = address & huge_page_mask(h);
396586e5216fSAdam Litke 
3966285b8dcaSHuang Ying 	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
3967fd6a03edSNaoya Horiguchi 	if (ptep) {
3968fd6a03edSNaoya Horiguchi 		entry = huge_ptep_get(ptep);
3969290408d4SNaoya Horiguchi 		if (unlikely(is_hugetlb_entry_migration(entry))) {
3970cb900f41SKirill A. Shutemov 			migration_entry_wait_huge(vma, mm, ptep);
3971290408d4SNaoya Horiguchi 			return 0;
3972290408d4SNaoya Horiguchi 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
3973aa50d3a7SAndi Kleen 			return VM_FAULT_HWPOISON_LARGE |
3974972dc4deSAneesh Kumar K.V 				VM_FAULT_SET_HINDEX(hstate_index(h));
3975ddeaab32SMike Kravetz 	} else {
3976b43a9990SMike Kravetz 		ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
3977ddeaab32SMike Kravetz 		if (!ptep)
3978b43a9990SMike Kravetz 			return VM_FAULT_OOM;
3979b43a9990SMike Kravetz 	}
39808382d914SDavidlohr Bueso 
3981ddeaab32SMike Kravetz 	mapping = vma->vm_file->f_mapping;
3982ddeaab32SMike Kravetz 	idx = vma_hugecache_offset(h, vma, haddr);
3983ddeaab32SMike Kravetz 
39843935baa9SDavid Gibson 	/*
39853935baa9SDavid Gibson 	 * Serialize hugepage allocation and instantiation, so that we don't
39863935baa9SDavid Gibson 	 * get spurious allocation failures if two CPUs race to instantiate
39873935baa9SDavid Gibson 	 * the same page in the page cache.
39883935baa9SDavid Gibson 	 */
398955254636SMike Kravetz 	hash = hugetlb_fault_mutex_hash(h, mapping, idx);
3990c672c7f2SMike Kravetz 	mutex_lock(&hugetlb_fault_mutex_table[hash]);
39918382d914SDavidlohr Bueso 
39927f2e9525SGerald Schaefer 	entry = huge_ptep_get(ptep);
39937f2e9525SGerald Schaefer 	if (huge_pte_none(entry)) {
39948382d914SDavidlohr Bueso 		ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
3995b4d1d99fSDavid Gibson 		goto out_mutex;
39963935baa9SDavid Gibson 	}
399786e5216fSAdam Litke 
399883c54070SNick Piggin 	ret = 0;
39991e8f889bSDavid Gibson 
400057303d80SAndy Whitcroft 	/*
40010f792cf9SNaoya Horiguchi 	 * entry could be a migration/hwpoison entry at this point, so this
40020f792cf9SNaoya Horiguchi 	 * check prevents the kernel from going below assuming that we have
40030f792cf9SNaoya Horiguchi 	 * an active hugepage in pagecache. This goto expects the 2nd page fault,
40040f792cf9SNaoya Horiguchi 	 * and is_hugetlb_entry_(migration|hwpoisoned) check will properly
40050f792cf9SNaoya Horiguchi 	 * handle it.
40060f792cf9SNaoya Horiguchi 	 */
40070f792cf9SNaoya Horiguchi 	if (!pte_present(entry))
40080f792cf9SNaoya Horiguchi 		goto out_mutex;
40090f792cf9SNaoya Horiguchi 
40100f792cf9SNaoya Horiguchi 	/*
401157303d80SAndy Whitcroft 	 * If we are going to COW the mapping later, we examine the pending
401257303d80SAndy Whitcroft 	 * reservations for this page now. This will ensure that any
401357303d80SAndy Whitcroft 	 * allocations necessary to record that reservation occur outside the
401457303d80SAndy Whitcroft 	 * spinlock. For private mappings, we also lookup the pagecache
401557303d80SAndy Whitcroft 	 * page now as it is used to determine if a reservation has been
401657303d80SAndy Whitcroft 	 * consumed.
401757303d80SAndy Whitcroft 	 */
4018106c992aSGerald Schaefer 	if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
4019285b8dcaSHuang Ying 		if (vma_needs_reservation(h, vma, haddr) < 0) {
40202b26736cSAndy Whitcroft 			ret = VM_FAULT_OOM;
4021b4d1d99fSDavid Gibson 			goto out_mutex;
40222b26736cSAndy Whitcroft 		}
40235e911373SMike Kravetz 		/* Just decrements count, does not deallocate */
4024285b8dcaSHuang Ying 		vma_end_reservation(h, vma, haddr);
402557303d80SAndy Whitcroft 
4026f83a275dSMel Gorman 		if (!(vma->vm_flags & VM_MAYSHARE))
402757303d80SAndy Whitcroft 			pagecache_page = hugetlbfs_pagecache_page(h,
4028285b8dcaSHuang Ying 								vma, haddr);
402957303d80SAndy Whitcroft 	}
403057303d80SAndy Whitcroft 
40310f792cf9SNaoya Horiguchi 	ptl = huge_pte_lock(h, mm, ptep);
40320fe6e20bSNaoya Horiguchi 
40331e8f889bSDavid Gibson 	/* Check for a racing update before calling hugetlb_cow */
4034b4d1d99fSDavid Gibson 	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
4035cb900f41SKirill A. Shutemov 		goto out_ptl;
4036b4d1d99fSDavid Gibson 
40370f792cf9SNaoya Horiguchi 	/*
40380f792cf9SNaoya Horiguchi 	 * hugetlb_cow() requires page locks of pte_page(entry) and
40390f792cf9SNaoya Horiguchi 	 * pagecache_page, so here we need to take the former one
40400f792cf9SNaoya Horiguchi 	 * when page != pagecache_page or !pagecache_page.
40410f792cf9SNaoya Horiguchi 	 */
40420f792cf9SNaoya Horiguchi 	page = pte_page(entry);
40430f792cf9SNaoya Horiguchi 	if (page != pagecache_page)
40440f792cf9SNaoya Horiguchi 		if (!trylock_page(page)) {
40450f792cf9SNaoya Horiguchi 			need_wait_lock = 1;
40460f792cf9SNaoya Horiguchi 			goto out_ptl;
40470f792cf9SNaoya Horiguchi 		}
40480f792cf9SNaoya Horiguchi 
40490f792cf9SNaoya Horiguchi 	get_page(page);
4050b4d1d99fSDavid Gibson 
4051788c7df4SHugh Dickins 	if (flags & FAULT_FLAG_WRITE) {
4052106c992aSGerald Schaefer 		if (!huge_pte_write(entry)) {
4053974e6d66SHuang Ying 			ret = hugetlb_cow(mm, vma, address, ptep,
4054cb900f41SKirill A. Shutemov 					  pagecache_page, ptl);
40550f792cf9SNaoya Horiguchi 			goto out_put_page;
4056b4d1d99fSDavid Gibson 		}
4057106c992aSGerald Schaefer 		entry = huge_pte_mkdirty(entry);
4058b4d1d99fSDavid Gibson 	}
4059b4d1d99fSDavid Gibson 	entry = pte_mkyoung(entry);
4060285b8dcaSHuang Ying 	if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
4061788c7df4SHugh Dickins 						flags & FAULT_FLAG_WRITE))
4062285b8dcaSHuang Ying 		update_mmu_cache(vma, haddr, ptep);
40630f792cf9SNaoya Horiguchi out_put_page:
40640f792cf9SNaoya Horiguchi 	if (page != pagecache_page)
40650f792cf9SNaoya Horiguchi 		unlock_page(page);
40660f792cf9SNaoya Horiguchi 	put_page(page);
4067cb900f41SKirill A. Shutemov out_ptl:
4068cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
406957303d80SAndy Whitcroft 
407057303d80SAndy Whitcroft 	if (pagecache_page) {
407157303d80SAndy Whitcroft 		unlock_page(pagecache_page);
407257303d80SAndy Whitcroft 		put_page(pagecache_page);
407357303d80SAndy Whitcroft 	}
4074b4d1d99fSDavid Gibson out_mutex:
4075c672c7f2SMike Kravetz 	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
40760f792cf9SNaoya Horiguchi 	/*
40770f792cf9SNaoya Horiguchi 	 * Generally it's safe to hold a refcount while waiting on the page
40780f792cf9SNaoya Horiguchi 	 * lock. But here we only wait to defer the next page fault and avoid
40790f792cf9SNaoya Horiguchi 	 * a busy loop; the page is not used after it is unlocked before we
40800f792cf9SNaoya Horiguchi 	 * return from the current page fault. So we are safe from accessing a
40810f792cf9SNaoya Horiguchi 	 * freed page, even if we wait here without taking a refcount.
40820f792cf9SNaoya Horiguchi 	 */
40830f792cf9SNaoya Horiguchi 	if (need_wait_lock)
40840f792cf9SNaoya Horiguchi 		wait_on_page_locked(page);
40851e8f889bSDavid Gibson 	return ret;
408686e5216fSAdam Litke }
408786e5216fSAdam Litke 
40888fb5debcSMike Kravetz /*
40898fb5debcSMike Kravetz  * Used by userfaultfd UFFDIO_COPY.  Based on mcopy_atomic_pte with
40908fb5debcSMike Kravetz  * modifications for huge pages.
40918fb5debcSMike Kravetz  */
40928fb5debcSMike Kravetz int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
40938fb5debcSMike Kravetz 			    pte_t *dst_pte,
40948fb5debcSMike Kravetz 			    struct vm_area_struct *dst_vma,
40958fb5debcSMike Kravetz 			    unsigned long dst_addr,
40968fb5debcSMike Kravetz 			    unsigned long src_addr,
40978fb5debcSMike Kravetz 			    struct page **pagep)
40988fb5debcSMike Kravetz {
40991e392147SAndrea Arcangeli 	struct address_space *mapping;
41001e392147SAndrea Arcangeli 	pgoff_t idx;
41011e392147SAndrea Arcangeli 	unsigned long size;
41021c9e8defSMike Kravetz 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
41038fb5debcSMike Kravetz 	struct hstate *h = hstate_vma(dst_vma);
41048fb5debcSMike Kravetz 	pte_t _dst_pte;
41058fb5debcSMike Kravetz 	spinlock_t *ptl;
41068fb5debcSMike Kravetz 	int ret;
41078fb5debcSMike Kravetz 	struct page *page;
41088fb5debcSMike Kravetz 
41098fb5debcSMike Kravetz 	if (!*pagep) {
41108fb5debcSMike Kravetz 		ret = -ENOMEM;
41118fb5debcSMike Kravetz 		page = alloc_huge_page(dst_vma, dst_addr, 0);
41128fb5debcSMike Kravetz 		if (IS_ERR(page))
41138fb5debcSMike Kravetz 			goto out;
41148fb5debcSMike Kravetz 
41158fb5debcSMike Kravetz 		ret = copy_huge_page_from_user(page,
41168fb5debcSMike Kravetz 						(const void __user *) src_addr,
4117810a56b9SMike Kravetz 						pages_per_huge_page(h), false);
41188fb5debcSMike Kravetz 
41198fb5debcSMike Kravetz 		/* fallback to copy_from_user outside mmap_sem */
41208fb5debcSMike Kravetz 		if (unlikely(ret)) {
41219e368259SAndrea Arcangeli 			ret = -ENOENT;
41228fb5debcSMike Kravetz 			*pagep = page;
41238fb5debcSMike Kravetz 			/* don't free the page */
41248fb5debcSMike Kravetz 			goto out;
41258fb5debcSMike Kravetz 		}
41268fb5debcSMike Kravetz 	} else {
41278fb5debcSMike Kravetz 		page = *pagep;
41288fb5debcSMike Kravetz 		*pagep = NULL;
41298fb5debcSMike Kravetz 	}
41308fb5debcSMike Kravetz 
41318fb5debcSMike Kravetz 	/*
41328fb5debcSMike Kravetz 	 * The memory barrier inside __SetPageUptodate makes sure that
41338fb5debcSMike Kravetz 	 * preceding stores to the page contents become visible before
41348fb5debcSMike Kravetz 	 * the set_pte_at() write.
41358fb5debcSMike Kravetz 	 */
41368fb5debcSMike Kravetz 	__SetPageUptodate(page);
41378fb5debcSMike Kravetz 
41381e392147SAndrea Arcangeli 	mapping = dst_vma->vm_file->f_mapping;
41391e392147SAndrea Arcangeli 	idx = vma_hugecache_offset(h, dst_vma, dst_addr);
41401e392147SAndrea Arcangeli 
41411c9e8defSMike Kravetz 	/*
41421c9e8defSMike Kravetz 	 * If shared, add to page cache
41431c9e8defSMike Kravetz 	 */
41441c9e8defSMike Kravetz 	if (vm_shared) {
41451e392147SAndrea Arcangeli 		size = i_size_read(mapping->host) >> huge_page_shift(h);
41461e392147SAndrea Arcangeli 		ret = -EFAULT;
41471e392147SAndrea Arcangeli 		if (idx >= size)
41481e392147SAndrea Arcangeli 			goto out_release_nounlock;
41491c9e8defSMike Kravetz 
41501e392147SAndrea Arcangeli 		/*
41511e392147SAndrea Arcangeli 		 * Serialization between remove_inode_hugepages() and
41521e392147SAndrea Arcangeli 		 * huge_add_to_page_cache() below happens through the
41531e392147SAndrea Arcangeli 		 * hugetlb_fault_mutex_table, which here must be held by
41541e392147SAndrea Arcangeli 		 * the caller.
41551e392147SAndrea Arcangeli 		 */
41561c9e8defSMike Kravetz 		ret = huge_add_to_page_cache(page, mapping, idx);
41571c9e8defSMike Kravetz 		if (ret)
41581c9e8defSMike Kravetz 			goto out_release_nounlock;
41591c9e8defSMike Kravetz 	}
41601c9e8defSMike Kravetz 
41618fb5debcSMike Kravetz 	ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
41628fb5debcSMike Kravetz 	spin_lock(ptl);
41638fb5debcSMike Kravetz 
41641e392147SAndrea Arcangeli 	/*
41651e392147SAndrea Arcangeli 	 * Recheck the i_size after holding PT lock to make sure not
41661e392147SAndrea Arcangeli 	 * to leave any page mapped (as page_mapped()) beyond the end
41671e392147SAndrea Arcangeli 	 * of the i_size (remove_inode_hugepages() is strict about
41681e392147SAndrea Arcangeli 	 * enforcing that). If we bail out here, we'll also leave a
41691e392147SAndrea Arcangeli 	 * page in the radix tree in the vm_shared case beyond the end
41701e392147SAndrea Arcangeli 	 * of the i_size, but remove_inode_hugepages() will take care
41711e392147SAndrea Arcangeli 	 * of it as soon as we drop the hugetlb_fault_mutex_table.
41721e392147SAndrea Arcangeli 	 */
41731e392147SAndrea Arcangeli 	size = i_size_read(mapping->host) >> huge_page_shift(h);
41741e392147SAndrea Arcangeli 	ret = -EFAULT;
41751e392147SAndrea Arcangeli 	if (idx >= size)
41761e392147SAndrea Arcangeli 		goto out_release_unlock;
41771e392147SAndrea Arcangeli 
41788fb5debcSMike Kravetz 	ret = -EEXIST;
41798fb5debcSMike Kravetz 	if (!huge_pte_none(huge_ptep_get(dst_pte)))
41808fb5debcSMike Kravetz 		goto out_release_unlock;
41818fb5debcSMike Kravetz 
41821c9e8defSMike Kravetz 	if (vm_shared) {
41831c9e8defSMike Kravetz 		page_dup_rmap(page, true);
41841c9e8defSMike Kravetz 	} else {
41858fb5debcSMike Kravetz 		ClearPagePrivate(page);
41868fb5debcSMike Kravetz 		hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
41871c9e8defSMike Kravetz 	}
41888fb5debcSMike Kravetz 
41898fb5debcSMike Kravetz 	_dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
41908fb5debcSMike Kravetz 	if (dst_vma->vm_flags & VM_WRITE)
41918fb5debcSMike Kravetz 		_dst_pte = huge_pte_mkdirty(_dst_pte);
41928fb5debcSMike Kravetz 	_dst_pte = pte_mkyoung(_dst_pte);
41938fb5debcSMike Kravetz 
41948fb5debcSMike Kravetz 	set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
41958fb5debcSMike Kravetz 
41968fb5debcSMike Kravetz 	(void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
41978fb5debcSMike Kravetz 					dst_vma->vm_flags & VM_WRITE);
41988fb5debcSMike Kravetz 	hugetlb_count_add(pages_per_huge_page(h), dst_mm);
41998fb5debcSMike Kravetz 
42008fb5debcSMike Kravetz 	/* No need to invalidate - it was non-present before */
42018fb5debcSMike Kravetz 	update_mmu_cache(dst_vma, dst_addr, dst_pte);
42028fb5debcSMike Kravetz 
42038fb5debcSMike Kravetz 	spin_unlock(ptl);
4204cb6acd01SMike Kravetz 	set_page_huge_active(page);
42051c9e8defSMike Kravetz 	if (vm_shared)
42061c9e8defSMike Kravetz 		unlock_page(page);
42078fb5debcSMike Kravetz 	ret = 0;
42088fb5debcSMike Kravetz out:
42098fb5debcSMike Kravetz 	return ret;
42108fb5debcSMike Kravetz out_release_unlock:
42118fb5debcSMike Kravetz 	spin_unlock(ptl);
42121c9e8defSMike Kravetz 	if (vm_shared)
42131c9e8defSMike Kravetz 		unlock_page(page);
42145af10dfdSAndrea Arcangeli out_release_nounlock:
42158fb5debcSMike Kravetz 	put_page(page);
42168fb5debcSMike Kravetz 	goto out;
42178fb5debcSMike Kravetz }
42188fb5debcSMike Kravetz 
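/*
 * Resolve the huge pages backing the user range starting at *position for
 * the get_user_pages() path, filling in @pages (with a reference taken) and
 * @vmas when supplied.  Entries that are absent, under migration, hwpoisoned,
 * or lacking write permission for FOLL_WRITE requests are handled through
 * hugetlb_fault().  On return, *nr_pages holds the number of pages still to
 * process and *position the next address to resolve; the return value is the
 * number of pages handled, or an errno if no progress was made.
 */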
421928a35716SMichel Lespinasse long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
422063551ae0SDavid Gibson 			 struct page **pages, struct vm_area_struct **vmas,
422128a35716SMichel Lespinasse 			 unsigned long *position, unsigned long *nr_pages,
422287ffc118SAndrea Arcangeli 			 long i, unsigned int flags, int *nonblocking)
422363551ae0SDavid Gibson {
4224d5d4b0aaSChen, Kenneth W 	unsigned long pfn_offset;
4225d5d4b0aaSChen, Kenneth W 	unsigned long vaddr = *position;
422628a35716SMichel Lespinasse 	unsigned long remainder = *nr_pages;
4227a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
42282be7cfedSDaniel Jordan 	int err = -EFAULT;
422963551ae0SDavid Gibson 
423063551ae0SDavid Gibson 	while (vaddr < vma->vm_end && remainder) {
423163551ae0SDavid Gibson 		pte_t *pte;
4232cb900f41SKirill A. Shutemov 		spinlock_t *ptl = NULL;
42332a15efc9SHugh Dickins 		int absent;
423463551ae0SDavid Gibson 		struct page *page;
423563551ae0SDavid Gibson 
42364c887265SAdam Litke 		/*
423702057967SDavid Rientjes 		 * If we have a pending SIGKILL, don't keep faulting pages and
423802057967SDavid Rientjes 		 * potentially allocating memory.
423902057967SDavid Rientjes 		 */
4240fa45f116SDavidlohr Bueso 		if (fatal_signal_pending(current)) {
424102057967SDavid Rientjes 			remainder = 0;
424202057967SDavid Rientjes 			break;
424302057967SDavid Rientjes 		}
424402057967SDavid Rientjes 
424502057967SDavid Rientjes 		/*
42464c887265SAdam Litke 		 * Some archs (sparc64, sh*) have multiple pte_t entries
42472a15efc9SHugh Dickins 		 * for each hugepage.  We have to make sure we get the
42484c887265SAdam Litke 		 * first one, for the page indexing below to work.
4249cb900f41SKirill A. Shutemov 		 *
4250cb900f41SKirill A. Shutemov 		 * Note that the page table lock is not held when pte is NULL.
42514c887265SAdam Litke 		 */
42527868a208SPunit Agrawal 		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
42537868a208SPunit Agrawal 				      huge_page_size(h));
4254cb900f41SKirill A. Shutemov 		if (pte)
4255cb900f41SKirill A. Shutemov 			ptl = huge_pte_lock(h, mm, pte);
42562a15efc9SHugh Dickins 		absent = !pte || huge_pte_none(huge_ptep_get(pte));
425763551ae0SDavid Gibson 
42582a15efc9SHugh Dickins 		/*
42592a15efc9SHugh Dickins 		 * When coredumping, it suits get_dump_page if we just return
42603ae77f43SHugh Dickins 		 * an error where there's an empty slot with no huge pagecache
42613ae77f43SHugh Dickins 		 * to back it.  This way, we avoid allocating a hugepage, and
42623ae77f43SHugh Dickins 		 * the sparse dumpfile avoids allocating disk blocks, but its
42633ae77f43SHugh Dickins 		 * huge holes still show up with zeroes where they need to be.
42642a15efc9SHugh Dickins 		 */
42653ae77f43SHugh Dickins 		if (absent && (flags & FOLL_DUMP) &&
42663ae77f43SHugh Dickins 		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
4267cb900f41SKirill A. Shutemov 			if (pte)
4268cb900f41SKirill A. Shutemov 				spin_unlock(ptl);
42692a15efc9SHugh Dickins 			remainder = 0;
42702a15efc9SHugh Dickins 			break;
42712a15efc9SHugh Dickins 		}
42722a15efc9SHugh Dickins 
42739cc3a5bdSNaoya Horiguchi 		/*
42749cc3a5bdSNaoya Horiguchi 		 * We need to call hugetlb_fault for both hugepages under
42759cc3a5bdSNaoya Horiguchi 		 * migration (in which case hugetlb_fault waits for the
42769cc3a5bdSNaoya Horiguchi 		 * migration) and hwpoisoned hugepages (in which case we need
42779cc3a5bdSNaoya Horiguchi 		 * to prevent the caller from accessing them).  To do this, we
42789cc3a5bdSNaoya Horiguchi 		 * use is_swap_pte here instead of is_hugetlb_entry_migration
42799cc3a5bdSNaoya Horiguchi 		 * and is_hugetlb_entry_hwpoisoned, because it simply covers
42809cc3a5bdSNaoya Horiguchi 		 * both cases and because we can't follow correct pages
42819cc3a5bdSNaoya Horiguchi 		 * directly from any kind of swap entry.
42829cc3a5bdSNaoya Horiguchi 		 */
42839cc3a5bdSNaoya Horiguchi 		if (absent || is_swap_pte(huge_ptep_get(pte)) ||
4284106c992aSGerald Schaefer 		    ((flags & FOLL_WRITE) &&
4285106c992aSGerald Schaefer 		      !huge_pte_write(huge_ptep_get(pte)))) {
42862b740303SSouptick Joarder 			vm_fault_t ret;
428787ffc118SAndrea Arcangeli 			unsigned int fault_flags = 0;
42884c887265SAdam Litke 
4289cb900f41SKirill A. Shutemov 			if (pte)
4290cb900f41SKirill A. Shutemov 				spin_unlock(ptl);
429187ffc118SAndrea Arcangeli 			if (flags & FOLL_WRITE)
429287ffc118SAndrea Arcangeli 				fault_flags |= FAULT_FLAG_WRITE;
429387ffc118SAndrea Arcangeli 			if (nonblocking)
429487ffc118SAndrea Arcangeli 				fault_flags |= FAULT_FLAG_ALLOW_RETRY;
429587ffc118SAndrea Arcangeli 			if (flags & FOLL_NOWAIT)
429687ffc118SAndrea Arcangeli 				fault_flags |= FAULT_FLAG_ALLOW_RETRY |
429787ffc118SAndrea Arcangeli 					FAULT_FLAG_RETRY_NOWAIT;
429887ffc118SAndrea Arcangeli 			if (flags & FOLL_TRIED) {
429987ffc118SAndrea Arcangeli 				VM_WARN_ON_ONCE(fault_flags &
430087ffc118SAndrea Arcangeli 						FAULT_FLAG_ALLOW_RETRY);
430187ffc118SAndrea Arcangeli 				fault_flags |= FAULT_FLAG_TRIED;
430287ffc118SAndrea Arcangeli 			}
430387ffc118SAndrea Arcangeli 			ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
430487ffc118SAndrea Arcangeli 			if (ret & VM_FAULT_ERROR) {
43052be7cfedSDaniel Jordan 				err = vm_fault_to_errno(ret, flags);
43061c59827dSHugh Dickins 				remainder = 0;
43071c59827dSHugh Dickins 				break;
43081c59827dSHugh Dickins 			}
430987ffc118SAndrea Arcangeli 			if (ret & VM_FAULT_RETRY) {
43101ac25013SAndrea Arcangeli 				if (nonblocking &&
43111ac25013SAndrea Arcangeli 				    !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
431287ffc118SAndrea Arcangeli 					*nonblocking = 0;
431387ffc118SAndrea Arcangeli 				*nr_pages = 0;
431487ffc118SAndrea Arcangeli 				/*
431587ffc118SAndrea Arcangeli 				 * VM_FAULT_RETRY must not return an
431687ffc118SAndrea Arcangeli 				 * error, it will return zero
431787ffc118SAndrea Arcangeli 				 * instead.
431887ffc118SAndrea Arcangeli 				 *
431987ffc118SAndrea Arcangeli 				 * No need to update "position" as the
432087ffc118SAndrea Arcangeli 				 * caller will not check it after
432187ffc118SAndrea Arcangeli 				 * *nr_pages is set to 0.
432287ffc118SAndrea Arcangeli 				 */
432387ffc118SAndrea Arcangeli 				return i;
432487ffc118SAndrea Arcangeli 			}
432587ffc118SAndrea Arcangeli 			continue;
432687ffc118SAndrea Arcangeli 		}
432763551ae0SDavid Gibson 
4328a5516438SAndi Kleen 		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
43297f2e9525SGerald Schaefer 		page = pte_page(huge_ptep_get(pte));
43308fde12caSLinus Torvalds 
43318fde12caSLinus Torvalds 		/*
43328fde12caSLinus Torvalds 		 * Instead of doing 'try_get_page()' below in the same_page
43338fde12caSLinus Torvalds 		 * loop, just check the count once here.
43348fde12caSLinus Torvalds 		 */
43358fde12caSLinus Torvalds 		if (unlikely(page_count(page) <= 0)) {
43368fde12caSLinus Torvalds 			if (pages) {
43378fde12caSLinus Torvalds 				spin_unlock(ptl);
43388fde12caSLinus Torvalds 				remainder = 0;
43398fde12caSLinus Torvalds 				err = -ENOMEM;
43408fde12caSLinus Torvalds 				break;
43418fde12caSLinus Torvalds 			}
43428fde12caSLinus Torvalds 		}
4343d5d4b0aaSChen, Kenneth W same_page:
4344d6692183SChen, Kenneth W 		if (pages) {
434569d177c2SAndy Whitcroft 			pages[i] = mem_map_offset(page, pfn_offset);
4346ddc58f27SKirill A. Shutemov 			get_page(pages[i]);
4347d6692183SChen, Kenneth W 		}
434863551ae0SDavid Gibson 
434963551ae0SDavid Gibson 		if (vmas)
435063551ae0SDavid Gibson 			vmas[i] = vma;
435163551ae0SDavid Gibson 
435263551ae0SDavid Gibson 		vaddr += PAGE_SIZE;
4353d5d4b0aaSChen, Kenneth W 		++pfn_offset;
435463551ae0SDavid Gibson 		--remainder;
435563551ae0SDavid Gibson 		++i;
4356d5d4b0aaSChen, Kenneth W 		if (vaddr < vma->vm_end && remainder &&
4357a5516438SAndi Kleen 				pfn_offset < pages_per_huge_page(h)) {
4358d5d4b0aaSChen, Kenneth W 			/*
4359d5d4b0aaSChen, Kenneth W 			 * We use pfn_offset to avoid touching the pageframes
4360d5d4b0aaSChen, Kenneth W 			 * of this compound page.
4361d5d4b0aaSChen, Kenneth W 			 */
4362d5d4b0aaSChen, Kenneth W 			goto same_page;
4363d5d4b0aaSChen, Kenneth W 		}
4364cb900f41SKirill A. Shutemov 		spin_unlock(ptl);
436563551ae0SDavid Gibson 	}
436628a35716SMichel Lespinasse 	*nr_pages = remainder;
436787ffc118SAndrea Arcangeli 	/*
436887ffc118SAndrea Arcangeli 	 * Setting position is only required if remainder is
436987ffc118SAndrea Arcangeli 	 * not zero, but it's faster not to add an "if (remainder)"
437087ffc118SAndrea Arcangeli 	 * branch.
437187ffc118SAndrea Arcangeli 	 */
437263551ae0SDavid Gibson 	*position = vaddr;
437363551ae0SDavid Gibson 
43742be7cfedSDaniel Jordan 	return i ? i : err;
437563551ae0SDavid Gibson }
43768f860591SZhang, Yanmin 
43775491ae7bSAneesh Kumar K.V #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
43785491ae7bSAneesh Kumar K.V /*
43795491ae7bSAneesh Kumar K.V  * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
43805491ae7bSAneesh Kumar K.V  * implement this.
43815491ae7bSAneesh Kumar K.V  */
43825491ae7bSAneesh Kumar K.V #define flush_hugetlb_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
43835491ae7bSAneesh Kumar K.V #endif
43845491ae7bSAneesh Kumar K.V 
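/*
 * Change the protection of the hugetlb mappings in [address, end) to
 * newprot.  Shared PMDs are unshared rather than modified, writable
 * migration entries are downgraded to read-only, and the affected range is
 * TLB-flushed before i_mmap_rwsem is released.  Returns the number of base
 * pages affected.
 */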
43857da4d641SPeter Zijlstra unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
43868f860591SZhang, Yanmin 		unsigned long address, unsigned long end, pgprot_t newprot)
43878f860591SZhang, Yanmin {
43888f860591SZhang, Yanmin 	struct mm_struct *mm = vma->vm_mm;
43898f860591SZhang, Yanmin 	unsigned long start = address;
43908f860591SZhang, Yanmin 	pte_t *ptep;
43918f860591SZhang, Yanmin 	pte_t pte;
4392a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
43937da4d641SPeter Zijlstra 	unsigned long pages = 0;
4394dff11abeSMike Kravetz 	bool shared_pmd = false;
4395ac46d4f3SJérôme Glisse 	struct mmu_notifier_range range;
4396dff11abeSMike Kravetz 
4397dff11abeSMike Kravetz 	/*
4398dff11abeSMike Kravetz 	 * In the case of shared PMDs, the area to flush could be beyond
4399ac46d4f3SJérôme Glisse 	 * start/end.  Set range.start/range.end to cover the maximum possible
4400dff11abeSMike Kravetz 	 * range if PMD sharing is possible.
4401dff11abeSMike Kravetz 	 */
44027269f999SJérôme Glisse 	mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
44037269f999SJérôme Glisse 				0, vma, mm, start, end);
4404ac46d4f3SJérôme Glisse 	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
44058f860591SZhang, Yanmin 
44068f860591SZhang, Yanmin 	BUG_ON(address >= end);
4407ac46d4f3SJérôme Glisse 	flush_cache_range(vma, range.start, range.end);
44088f860591SZhang, Yanmin 
4409ac46d4f3SJérôme Glisse 	mmu_notifier_invalidate_range_start(&range);
441083cde9e8SDavidlohr Bueso 	i_mmap_lock_write(vma->vm_file->f_mapping);
4411a5516438SAndi Kleen 	for (; address < end; address += huge_page_size(h)) {
4412cb900f41SKirill A. Shutemov 		spinlock_t *ptl;
44137868a208SPunit Agrawal 		ptep = huge_pte_offset(mm, address, huge_page_size(h));
44148f860591SZhang, Yanmin 		if (!ptep)
44158f860591SZhang, Yanmin 			continue;
4416cb900f41SKirill A. Shutemov 		ptl = huge_pte_lock(h, mm, ptep);
44177da4d641SPeter Zijlstra 		if (huge_pmd_unshare(mm, &address, ptep)) {
44187da4d641SPeter Zijlstra 			pages++;
4419cb900f41SKirill A. Shutemov 			spin_unlock(ptl);
4420dff11abeSMike Kravetz 			shared_pmd = true;
442139dde65cSChen, Kenneth W 			continue;
44227da4d641SPeter Zijlstra 		}
4423a8bda28dSNaoya Horiguchi 		pte = huge_ptep_get(ptep);
4424a8bda28dSNaoya Horiguchi 		if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
4425a8bda28dSNaoya Horiguchi 			spin_unlock(ptl);
4426a8bda28dSNaoya Horiguchi 			continue;
4427a8bda28dSNaoya Horiguchi 		}
4428a8bda28dSNaoya Horiguchi 		if (unlikely(is_hugetlb_entry_migration(pte))) {
4429a8bda28dSNaoya Horiguchi 			swp_entry_t entry = pte_to_swp_entry(pte);
4430a8bda28dSNaoya Horiguchi 
4431a8bda28dSNaoya Horiguchi 			if (is_write_migration_entry(entry)) {
4432a8bda28dSNaoya Horiguchi 				pte_t newpte;
4433a8bda28dSNaoya Horiguchi 
4434a8bda28dSNaoya Horiguchi 				make_migration_entry_read(&entry);
4435a8bda28dSNaoya Horiguchi 				newpte = swp_entry_to_pte(entry);
4436e5251fd4SPunit Agrawal 				set_huge_swap_pte_at(mm, address, ptep,
4437e5251fd4SPunit Agrawal 						     newpte, huge_page_size(h));
4438a8bda28dSNaoya Horiguchi 				pages++;
4439a8bda28dSNaoya Horiguchi 			}
4440a8bda28dSNaoya Horiguchi 			spin_unlock(ptl);
4441a8bda28dSNaoya Horiguchi 			continue;
4442a8bda28dSNaoya Horiguchi 		}
4443a8bda28dSNaoya Horiguchi 		if (!huge_pte_none(pte)) {
4444023bdd00SAneesh Kumar K.V 			pte_t old_pte;
4445023bdd00SAneesh Kumar K.V 
4446023bdd00SAneesh Kumar K.V 			old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
4447023bdd00SAneesh Kumar K.V 			pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
4448be7517d6STony Lu 			pte = arch_make_huge_pte(pte, vma, NULL, 0);
4449023bdd00SAneesh Kumar K.V 			huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
44507da4d641SPeter Zijlstra 			pages++;
44518f860591SZhang, Yanmin 		}
4452cb900f41SKirill A. Shutemov 		spin_unlock(ptl);
44538f860591SZhang, Yanmin 	}
4454d833352aSMel Gorman 	/*
4455c8c06efaSDavidlohr Bueso 	 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
4456d833352aSMel Gorman 	 * may have cleared our pud entry and done put_page on the page table:
4457c8c06efaSDavidlohr Bueso 	 * once we release i_mmap_rwsem, another task can do the final put_page
4458dff11abeSMike Kravetz 	 * and the page table can be reused and filled with junk.  If we actually
4459dff11abeSMike Kravetz 	 * did unshare a page of pmds, flush the range corresponding to the pud.
4460d833352aSMel Gorman 	 */
4461dff11abeSMike Kravetz 	if (shared_pmd)
4462ac46d4f3SJérôme Glisse 		flush_hugetlb_tlb_range(vma, range.start, range.end);
4463dff11abeSMike Kravetz 	else
44645491ae7bSAneesh Kumar K.V 		flush_hugetlb_tlb_range(vma, start, end);
44650f10851eSJérôme Glisse 	/*
44660f10851eSJérôme Glisse 	 * No need to call mmu_notifier_invalidate_range(): we are downgrading
44670f10851eSJérôme Glisse 	 * page table protection, not changing it to point to a new page.
44680f10851eSJérôme Glisse 	 *
4469ad56b738SMike Rapoport 	 * See Documentation/vm/mmu_notifier.rst
44700f10851eSJérôme Glisse 	 */
447183cde9e8SDavidlohr Bueso 	i_mmap_unlock_write(vma->vm_file->f_mapping);
4472ac46d4f3SJérôme Glisse 	mmu_notifier_invalidate_range_end(&range);
44737da4d641SPeter Zijlstra 
44747da4d641SPeter Zijlstra 	return pages << h->order;
44758f860591SZhang, Yanmin }
44768f860591SZhang, Yanmin 
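/*
 * Reserve huge pages for the range [from, to), in huge page units, of a
 * mapping when it is set up.  Shared mappings record the range in the
 * inode's reservation map; private mappings get a fresh reservation map
 * owned by the VMA.  The subpool and the global reserve are charged here.
 * Returns 0 on success or a negative errno.
 */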
4477a1e78772SMel Gorman int hugetlb_reserve_pages(struct inode *inode,
4478a1e78772SMel Gorman 					long from, long to,
44795a6fe125SMel Gorman 					struct vm_area_struct *vma,
4480ca16d140SKOSAKI Motohiro 					vm_flags_t vm_flags)
4481e4e574b7SAdam Litke {
448217c9d12eSMel Gorman 	long ret, chg;
4483a5516438SAndi Kleen 	struct hstate *h = hstate_inode(inode);
448490481622SDavid Gibson 	struct hugepage_subpool *spool = subpool_inode(inode);
44859119a41eSJoonsoo Kim 	struct resv_map *resv_map;
44861c5ecae3SMike Kravetz 	long gbl_reserve;
4487e4e574b7SAdam Litke 
448863489f8eSMike Kravetz 	/* This should never happen */
448963489f8eSMike Kravetz 	if (from > to) {
449063489f8eSMike Kravetz 		VM_WARN(1, "%s called with a negative range\n", __func__);
449163489f8eSMike Kravetz 		return -EINVAL;
449263489f8eSMike Kravetz 	}
449363489f8eSMike Kravetz 
4494a1e78772SMel Gorman 	/*
449517c9d12eSMel Gorman 	 * Only apply the hugepage reservation if asked. At fault time, a
449617c9d12eSMel Gorman 	 * VM_NORESERVE mapping will attempt to allocate a page
449790481622SDavid Gibson 	 * without using reserves.
449817c9d12eSMel Gorman 	 */
4499ca16d140SKOSAKI Motohiro 	if (vm_flags & VM_NORESERVE)
450017c9d12eSMel Gorman 		return 0;
450117c9d12eSMel Gorman 
450217c9d12eSMel Gorman 	/*
4503a1e78772SMel Gorman 	 * Shared mappings base their reservation on the number of pages that
4504a1e78772SMel Gorman 	 * are already allocated on behalf of the file. Private mappings need
4505a1e78772SMel Gorman 	 * to reserve the full area even if read-only as mprotect() may be
4506a1e78772SMel Gorman 	 * called to make the mapping read-write. Assume !vma is a shm mapping
4507a1e78772SMel Gorman 	 */
45089119a41eSJoonsoo Kim 	if (!vma || vma->vm_flags & VM_MAYSHARE) {
4509f27a5136SMike Kravetz 		/*
4510f27a5136SMike Kravetz 		 * resv_map can not be NULL as hugetlb_reserve_pages is only
4511f27a5136SMike Kravetz 		 * called for inodes for which resv_maps were created (see
4512f27a5136SMike Kravetz 		 * hugetlbfs_get_inode).
4513f27a5136SMike Kravetz 		 */
45144e35f483SJoonsoo Kim 		resv_map = inode_resv_map(inode);
45159119a41eSJoonsoo Kim 
45161406ec9bSJoonsoo Kim 		chg = region_chg(resv_map, from, to);
45179119a41eSJoonsoo Kim 
45189119a41eSJoonsoo Kim 	} else {
45199119a41eSJoonsoo Kim 		resv_map = resv_map_alloc();
45205a6fe125SMel Gorman 		if (!resv_map)
45215a6fe125SMel Gorman 			return -ENOMEM;
45225a6fe125SMel Gorman 
452317c9d12eSMel Gorman 		chg = to - from;
452417c9d12eSMel Gorman 
45255a6fe125SMel Gorman 		set_vma_resv_map(vma, resv_map);
45265a6fe125SMel Gorman 		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
45275a6fe125SMel Gorman 	}
45285a6fe125SMel Gorman 
4529c50ac050SDave Hansen 	if (chg < 0) {
4530c50ac050SDave Hansen 		ret = chg;
4531c50ac050SDave Hansen 		goto out_err;
4532c50ac050SDave Hansen 	}
453317c9d12eSMel Gorman 
45341c5ecae3SMike Kravetz 	/*
45351c5ecae3SMike Kravetz 	 * There must be enough pages in the subpool for the mapping. If
45361c5ecae3SMike Kravetz 	 * the subpool has a minimum size, there may be some global
45371c5ecae3SMike Kravetz 	 * reservations already in place (gbl_reserve).
45381c5ecae3SMike Kravetz 	 */
45391c5ecae3SMike Kravetz 	gbl_reserve = hugepage_subpool_get_pages(spool, chg);
45401c5ecae3SMike Kravetz 	if (gbl_reserve < 0) {
4541c50ac050SDave Hansen 		ret = -ENOSPC;
4542c50ac050SDave Hansen 		goto out_err;
4543c50ac050SDave Hansen 	}
454417c9d12eSMel Gorman 
454517c9d12eSMel Gorman 	/*
454617c9d12eSMel Gorman 	 * Check that enough hugepages are available for the reservation.
454790481622SDavid Gibson 	 * Hand the pages back to the subpool if there are not enough.
454817c9d12eSMel Gorman 	 */
45491c5ecae3SMike Kravetz 	ret = hugetlb_acct_memory(h, gbl_reserve);
455017c9d12eSMel Gorman 	if (ret < 0) {
45511c5ecae3SMike Kravetz 		/* put back original number of pages, chg */
45521c5ecae3SMike Kravetz 		(void)hugepage_subpool_put_pages(spool, chg);
4553c50ac050SDave Hansen 		goto out_err;
455417c9d12eSMel Gorman 	}
455517c9d12eSMel Gorman 
455617c9d12eSMel Gorman 	/*
455717c9d12eSMel Gorman 	 * Account for the reservations made. Shared mappings record regions
455817c9d12eSMel Gorman 	 * that have reservations as they are shared by multiple VMAs.
455917c9d12eSMel Gorman 	 * When the last VMA disappears, the region map says how much
456017c9d12eSMel Gorman 	 * the reservation was and the page cache tells how much of
456117c9d12eSMel Gorman 	 * the reservation was consumed. Private mappings are per-VMA and
456217c9d12eSMel Gorman 	 * only the consumed reservations are tracked. When the VMA
456317c9d12eSMel Gorman 	 * disappears, the original reservation is the VMA size and the
456417c9d12eSMel Gorman 	 * consumed reservations are stored in the map. Hence, nothing
456517c9d12eSMel Gorman 	 * else has to be done for private mappings here
456617c9d12eSMel Gorman 	 */
456733039678SMike Kravetz 	if (!vma || vma->vm_flags & VM_MAYSHARE) {
456833039678SMike Kravetz 		long add = region_add(resv_map, from, to);
456933039678SMike Kravetz 
457033039678SMike Kravetz 		if (unlikely(chg > add)) {
457133039678SMike Kravetz 			/*
457233039678SMike Kravetz 			 * pages in this range were added to the reserve
457333039678SMike Kravetz 			 * map between region_chg and region_add.  This
457433039678SMike Kravetz 			 * indicates a race with alloc_huge_page.  Adjust
457533039678SMike Kravetz 			 * the subpool and reserve counts modified above
457633039678SMike Kravetz 			 * based on the difference.
457733039678SMike Kravetz 			 */
457833039678SMike Kravetz 			long rsv_adjust;
457933039678SMike Kravetz 
458033039678SMike Kravetz 			rsv_adjust = hugepage_subpool_put_pages(spool,
458133039678SMike Kravetz 								chg - add);
458233039678SMike Kravetz 			hugetlb_acct_memory(h, -rsv_adjust);
458333039678SMike Kravetz 		}
458433039678SMike Kravetz 	}
4585a43a8c39SChen, Kenneth W 	return 0;
4586c50ac050SDave Hansen out_err:
45875e911373SMike Kravetz 	if (!vma || vma->vm_flags & VM_MAYSHARE)
4588ff8c0c53SMike Kravetz 		/* Don't call region_abort if region_chg failed */
4589ff8c0c53SMike Kravetz 		if (chg >= 0)
45905e911373SMike Kravetz 			region_abort(resv_map, from, to);
4591f031dd27SJoonsoo Kim 	if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
4592f031dd27SJoonsoo Kim 		kref_put(&resv_map->refs, resv_map_release);
4593c50ac050SDave Hansen 	return ret;
4594a43a8c39SChen, Kenneth W }
4595a43a8c39SChen, Kenneth W 
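/*
 * Release the reservations for the range [start, end) of a hugetlbfs inode,
 * typically after truncation or hole punching.  @freed is the number of huge
 * pages actually freed; the inode's block count is reduced accordingly and
 * any remaining reservation is returned to the subpool and the global
 * reserve.  Returns 0, or a negative errno if the reservation map could not
 * be updated.
 */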
4596b5cec28dSMike Kravetz long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
4597b5cec28dSMike Kravetz 								long freed)
4598a43a8c39SChen, Kenneth W {
4599a5516438SAndi Kleen 	struct hstate *h = hstate_inode(inode);
46004e35f483SJoonsoo Kim 	struct resv_map *resv_map = inode_resv_map(inode);
46019119a41eSJoonsoo Kim 	long chg = 0;
460290481622SDavid Gibson 	struct hugepage_subpool *spool = subpool_inode(inode);
46031c5ecae3SMike Kravetz 	long gbl_reserve;
460445c682a6SKen Chen 
4605f27a5136SMike Kravetz 	/*
4606f27a5136SMike Kravetz 	 * Since this routine can be called in the evict inode path for all
4607f27a5136SMike Kravetz 	 * hugetlbfs inodes, resv_map could be NULL.
4608f27a5136SMike Kravetz 	 */
4609b5cec28dSMike Kravetz 	if (resv_map) {
4610b5cec28dSMike Kravetz 		chg = region_del(resv_map, start, end);
4611b5cec28dSMike Kravetz 		/*
4612b5cec28dSMike Kravetz 		 * region_del() can fail in the rare case where a region
4613b5cec28dSMike Kravetz 		 * must be split and another region descriptor can not be
4614b5cec28dSMike Kravetz 		 * allocated.  If end == LONG_MAX, it will not fail.
4615b5cec28dSMike Kravetz 		 */
4616b5cec28dSMike Kravetz 		if (chg < 0)
4617b5cec28dSMike Kravetz 			return chg;
4618b5cec28dSMike Kravetz 	}
4619b5cec28dSMike Kravetz 
462045c682a6SKen Chen 	spin_lock(&inode->i_lock);
4621e4c6f8beSEric Sandeen 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
462245c682a6SKen Chen 	spin_unlock(&inode->i_lock);
462345c682a6SKen Chen 
46241c5ecae3SMike Kravetz 	/*
46251c5ecae3SMike Kravetz 	 * If the subpool has a minimum size, the number of global
46261c5ecae3SMike Kravetz 	 * reservations to be released may be adjusted.
46271c5ecae3SMike Kravetz 	 */
46281c5ecae3SMike Kravetz 	gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
46291c5ecae3SMike Kravetz 	hugetlb_acct_memory(h, -gbl_reserve);
4630b5cec28dSMike Kravetz 
4631b5cec28dSMike Kravetz 	return 0;
4632a43a8c39SChen, Kenneth W }
463393f70f90SNaoya Horiguchi 
46343212b535SSteve Capper #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
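/*
 * Given another VMA (@svma) of the same file, return the address in @svma
 * that maps the same file offset as (@vma, @addr) if the surrounding
 * PUD-sized region is suitably aligned, fully contained in @svma and has
 * compatible flags, i.e. its PMD page table page could be shared.
 * Returns 0 otherwise.
 */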
46353212b535SSteve Capper static unsigned long page_table_shareable(struct vm_area_struct *svma,
46363212b535SSteve Capper 				struct vm_area_struct *vma,
46373212b535SSteve Capper 				unsigned long addr, pgoff_t idx)
46383212b535SSteve Capper {
46393212b535SSteve Capper 	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
46403212b535SSteve Capper 				svma->vm_start;
46413212b535SSteve Capper 	unsigned long sbase = saddr & PUD_MASK;
46423212b535SSteve Capper 	unsigned long s_end = sbase + PUD_SIZE;
46433212b535SSteve Capper 
46443212b535SSteve Capper 	/* Allow segments to share if only one is marked locked */
4645de60f5f1SEric B Munson 	unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
4646de60f5f1SEric B Munson 	unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
46473212b535SSteve Capper 
46483212b535SSteve Capper 	/*
46493212b535SSteve Capper 	 * Match the virtual addresses, permissions and the alignment of the
46503212b535SSteve Capper 	 * page table page.
46513212b535SSteve Capper 	 */
46523212b535SSteve Capper 	if (pmd_index(addr) != pmd_index(saddr) ||
46533212b535SSteve Capper 	    vm_flags != svm_flags ||
46543212b535SSteve Capper 	    sbase < svma->vm_start || svma->vm_end < s_end)
46553212b535SSteve Capper 		return 0;
46563212b535SSteve Capper 
46573212b535SSteve Capper 	return saddr;
46583212b535SSteve Capper }
46593212b535SSteve Capper 
466031aafb45SNicholas Krause static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
46613212b535SSteve Capper {
46623212b535SSteve Capper 	unsigned long base = addr & PUD_MASK;
46633212b535SSteve Capper 	unsigned long end = base + PUD_SIZE;
46643212b535SSteve Capper 
46653212b535SSteve Capper 	/*
46663212b535SSteve Capper 	 * check on proper vm_flags and page table alignment
46673212b535SSteve Capper 	 */
4668017b1660SMike Kravetz 	if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
466931aafb45SNicholas Krause 		return true;
467031aafb45SNicholas Krause 	return false;
46713212b535SSteve Capper }
46723212b535SSteve Capper 
46733212b535SSteve Capper /*
4674017b1660SMike Kravetz  * Determine if the start,end range within vma could be mapped by a shared pmd.
4675017b1660SMike Kravetz  * If so, adjust start and end to cover the range associated with possible
4676017b1660SMike Kravetz  * shared pmd mappings.
4677017b1660SMike Kravetz  */
4678017b1660SMike Kravetz void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
4679017b1660SMike Kravetz 				unsigned long *start, unsigned long *end)
4680017b1660SMike Kravetz {
4681017b1660SMike Kravetz 	unsigned long check_addr = *start;
4682017b1660SMike Kravetz 
4683017b1660SMike Kravetz 	if (!(vma->vm_flags & VM_MAYSHARE))
4684017b1660SMike Kravetz 		return;
4685017b1660SMike Kravetz 
4686017b1660SMike Kravetz 	for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) {
4687017b1660SMike Kravetz 		unsigned long a_start = check_addr & PUD_MASK;
4688017b1660SMike Kravetz 		unsigned long a_end = a_start + PUD_SIZE;
4689017b1660SMike Kravetz 
4690017b1660SMike Kravetz 		/*
4691017b1660SMike Kravetz 		 * If sharing is possible, adjust start/end if necessary.
4692017b1660SMike Kravetz 		 */
4693017b1660SMike Kravetz 		if (range_in_vma(vma, a_start, a_end)) {
4694017b1660SMike Kravetz 			if (a_start < *start)
4695017b1660SMike Kravetz 				*start = a_start;
4696017b1660SMike Kravetz 			if (a_end > *end)
4697017b1660SMike Kravetz 				*end = a_end;
4698017b1660SMike Kravetz 		}
4699017b1660SMike Kravetz 	}
4700017b1660SMike Kravetz }
4701017b1660SMike Kravetz 
4702017b1660SMike Kravetz /*
47033212b535SSteve Capper  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
47043212b535SSteve Capper  * and returns the corresponding pte. While this is not necessary for the
47053212b535SSteve Capper  * !shared pmd case because we can allocate the pmd later as well, it makes the
4706ddeaab32SMike Kravetz  * code much cleaner. pmd allocation is essential for the shared case because
4707ddeaab32SMike Kravetz  * pud has to be populated inside the same i_mmap_rwsem section - otherwise
4708ddeaab32SMike Kravetz  * racing tasks could either miss the sharing (see huge_pte_offset) or select a
4709ddeaab32SMike Kravetz  * bad pmd for sharing.
47103212b535SSteve Capper  */
47113212b535SSteve Capper pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
47123212b535SSteve Capper {
47133212b535SSteve Capper 	struct vm_area_struct *vma = find_vma(mm, addr);
47143212b535SSteve Capper 	struct address_space *mapping = vma->vm_file->f_mapping;
47153212b535SSteve Capper 	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
47163212b535SSteve Capper 			vma->vm_pgoff;
47173212b535SSteve Capper 	struct vm_area_struct *svma;
47183212b535SSteve Capper 	unsigned long saddr;
47193212b535SSteve Capper 	pte_t *spte = NULL;
47203212b535SSteve Capper 	pte_t *pte;
4721cb900f41SKirill A. Shutemov 	spinlock_t *ptl;
47223212b535SSteve Capper 
47233212b535SSteve Capper 	if (!vma_shareable(vma, addr))
47243212b535SSteve Capper 		return (pte_t *)pmd_alloc(mm, pud, addr);
47253212b535SSteve Capper 
4726930668c3SWaiman Long 	i_mmap_lock_read(mapping);
47273212b535SSteve Capper 	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
47283212b535SSteve Capper 		if (svma == vma)
47293212b535SSteve Capper 			continue;
47303212b535SSteve Capper 
47313212b535SSteve Capper 		saddr = page_table_shareable(svma, vma, addr, idx);
47323212b535SSteve Capper 		if (saddr) {
47337868a208SPunit Agrawal 			spte = huge_pte_offset(svma->vm_mm, saddr,
47347868a208SPunit Agrawal 					       vma_mmu_pagesize(svma));
47353212b535SSteve Capper 			if (spte) {
47363212b535SSteve Capper 				get_page(virt_to_page(spte));
47373212b535SSteve Capper 				break;
47383212b535SSteve Capper 			}
47393212b535SSteve Capper 		}
47403212b535SSteve Capper 	}
47413212b535SSteve Capper 
47423212b535SSteve Capper 	if (!spte)
47433212b535SSteve Capper 		goto out;
47443212b535SSteve Capper 
47458bea8052SAneesh Kumar K.V 	ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
4746dc6c9a35SKirill A. Shutemov 	if (pud_none(*pud)) {
47473212b535SSteve Capper 		pud_populate(mm, pud,
47483212b535SSteve Capper 				(pmd_t *)((unsigned long)spte & PAGE_MASK));
4749c17b1f42SKirill A. Shutemov 		mm_inc_nr_pmds(mm);
4750dc6c9a35SKirill A. Shutemov 	} else {
47513212b535SSteve Capper 		put_page(virt_to_page(spte));
4752dc6c9a35SKirill A. Shutemov 	}
4753cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
47543212b535SSteve Capper out:
47553212b535SSteve Capper 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
4756930668c3SWaiman Long 	i_mmap_unlock_read(mapping);
47573212b535SSteve Capper 	return pte;
47583212b535SSteve Capper }
47593212b535SSteve Capper 
47603212b535SSteve Capper /*
47613212b535SSteve Capper  * Unmap a huge page backed by a shared pte.
47623212b535SSteve Capper  *
47633212b535SSteve Capper  * The hugetlb pte page is refcounted at the time of mapping.  If the pte is
47643212b535SSteve Capper  * shared, as indicated by page_count > 1, unmapping is achieved by clearing
47653212b535SSteve Capper  * the pud and decrementing the refcount.  If count == 1, the pte page is not shared.
47663212b535SSteve Capper  *
4767ddeaab32SMike Kravetz  * Called with the page table lock held.
47683212b535SSteve Capper  *
47693212b535SSteve Capper  * Returns: 1 if a shared pte page was successfully unmapped
47703212b535SSteve Capper  *	    0 if the underlying pte page is not shared, or it is the last user
47713212b535SSteve Capper  */
47723212b535SSteve Capper int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
47733212b535SSteve Capper {
47743212b535SSteve Capper 	pgd_t *pgd = pgd_offset(mm, *addr);
4775c2febafcSKirill A. Shutemov 	p4d_t *p4d = p4d_offset(pgd, *addr);
4776c2febafcSKirill A. Shutemov 	pud_t *pud = pud_offset(p4d, *addr);
47773212b535SSteve Capper 
47783212b535SSteve Capper 	BUG_ON(page_count(virt_to_page(ptep)) == 0);
47793212b535SSteve Capper 	if (page_count(virt_to_page(ptep)) == 1)
47803212b535SSteve Capper 		return 0;
47813212b535SSteve Capper 
47823212b535SSteve Capper 	pud_clear(pud);
47833212b535SSteve Capper 	put_page(virt_to_page(ptep));
4784dc6c9a35SKirill A. Shutemov 	mm_dec_nr_pmds(mm);
47853212b535SSteve Capper 	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
47863212b535SSteve Capper 	return 1;
47873212b535SSteve Capper }
47889e5fc74cSSteve Capper #define want_pmd_share()	(1)
47899e5fc74cSSteve Capper #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
47909e5fc74cSSteve Capper pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
47919e5fc74cSSteve Capper {
47929e5fc74cSSteve Capper 	return NULL;
47939e5fc74cSSteve Capper }
4794e81f2d22SZhang Zhen 
4795e81f2d22SZhang Zhen int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
4796e81f2d22SZhang Zhen {
4797e81f2d22SZhang Zhen 	return 0;
4798e81f2d22SZhang Zhen }
4799017b1660SMike Kravetz 
4800017b1660SMike Kravetz void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
4801017b1660SMike Kravetz 				unsigned long *start, unsigned long *end)
4802017b1660SMike Kravetz {
4803017b1660SMike Kravetz }
48049e5fc74cSSteve Capper #define want_pmd_share()	(0)
48053212b535SSteve Capper #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
48063212b535SSteve Capper 
48079e5fc74cSSteve Capper #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
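/*
 * Allocate (or locate) the page table entry used to map a huge page of size
 * @sz at @addr.  PUD-sized pages are mapped directly at the PUD level;
 * PMD-sized pages reuse a shared PMD page table page when want_pmd_share()
 * allows it, otherwise a new PMD is allocated.  Returns NULL if an
 * intermediate page table level cannot be allocated.
 */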
48089e5fc74cSSteve Capper pte_t *huge_pte_alloc(struct mm_struct *mm,
48099e5fc74cSSteve Capper 			unsigned long addr, unsigned long sz)
48109e5fc74cSSteve Capper {
48119e5fc74cSSteve Capper 	pgd_t *pgd;
4812c2febafcSKirill A. Shutemov 	p4d_t *p4d;
48139e5fc74cSSteve Capper 	pud_t *pud;
48149e5fc74cSSteve Capper 	pte_t *pte = NULL;
48159e5fc74cSSteve Capper 
48169e5fc74cSSteve Capper 	pgd = pgd_offset(mm, addr);
4817f4f0a3d8SKirill A. Shutemov 	p4d = p4d_alloc(mm, pgd, addr);
4818f4f0a3d8SKirill A. Shutemov 	if (!p4d)
4819f4f0a3d8SKirill A. Shutemov 		return NULL;
4820c2febafcSKirill A. Shutemov 	pud = pud_alloc(mm, p4d, addr);
48219e5fc74cSSteve Capper 	if (pud) {
48229e5fc74cSSteve Capper 		if (sz == PUD_SIZE) {
48239e5fc74cSSteve Capper 			pte = (pte_t *)pud;
48249e5fc74cSSteve Capper 		} else {
48259e5fc74cSSteve Capper 			BUG_ON(sz != PMD_SIZE);
48269e5fc74cSSteve Capper 			if (want_pmd_share() && pud_none(*pud))
48279e5fc74cSSteve Capper 				pte = huge_pmd_share(mm, addr, pud);
48289e5fc74cSSteve Capper 			else
48299e5fc74cSSteve Capper 				pte = (pte_t *)pmd_alloc(mm, pud, addr);
48309e5fc74cSSteve Capper 		}
48319e5fc74cSSteve Capper 	}
48324e666314SMichal Hocko 	BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
48339e5fc74cSSteve Capper 
48349e5fc74cSSteve Capper 	return pte;
48359e5fc74cSSteve Capper }
48369e5fc74cSSteve Capper 
48379b19df29SPunit Agrawal /*
48389b19df29SPunit Agrawal  * huge_pte_offset() - Walk the page table to resolve the hugepage
48399b19df29SPunit Agrawal  * entry at address @addr
48409b19df29SPunit Agrawal  *
48419b19df29SPunit Agrawal  * Return: Pointer to page table or swap entry (PUD or PMD) for
48429b19df29SPunit Agrawal  * address @addr, or NULL if a p*d_none() entry is encountered and the
48439b19df29SPunit Agrawal  * size @sz doesn't match the hugepage size at this level of the page
48449b19df29SPunit Agrawal  * table.
48459b19df29SPunit Agrawal  */
48467868a208SPunit Agrawal pte_t *huge_pte_offset(struct mm_struct *mm,
48477868a208SPunit Agrawal 		       unsigned long addr, unsigned long sz)
48489e5fc74cSSteve Capper {
48499e5fc74cSSteve Capper 	pgd_t *pgd;
4850c2febafcSKirill A. Shutemov 	p4d_t *p4d;
48519e5fc74cSSteve Capper 	pud_t *pud;
4852c2febafcSKirill A. Shutemov 	pmd_t *pmd;
48539e5fc74cSSteve Capper 
48549e5fc74cSSteve Capper 	pgd = pgd_offset(mm, addr);
4855c2febafcSKirill A. Shutemov 	if (!pgd_present(*pgd))
4856c2febafcSKirill A. Shutemov 		return NULL;
4857c2febafcSKirill A. Shutemov 	p4d = p4d_offset(pgd, addr);
4858c2febafcSKirill A. Shutemov 	if (!p4d_present(*p4d))
4859c2febafcSKirill A. Shutemov 		return NULL;
48609b19df29SPunit Agrawal 
4861c2febafcSKirill A. Shutemov 	pud = pud_offset(p4d, addr);
48629b19df29SPunit Agrawal 	if (sz != PUD_SIZE && pud_none(*pud))
4863c2febafcSKirill A. Shutemov 		return NULL;
48649b19df29SPunit Agrawal 	/* hugepage or swap? */
48659b19df29SPunit Agrawal 	if (pud_huge(*pud) || !pud_present(*pud))
48669e5fc74cSSteve Capper 		return (pte_t *)pud;
48679b19df29SPunit Agrawal 
48689e5fc74cSSteve Capper 	pmd = pmd_offset(pud, addr);
48699b19df29SPunit Agrawal 	if (sz != PMD_SIZE && pmd_none(*pmd))
48709b19df29SPunit Agrawal 		return NULL;
48719b19df29SPunit Agrawal 	/* hugepage or swap? */
48729b19df29SPunit Agrawal 	if (pmd_huge(*pmd) || !pmd_present(*pmd))
48739e5fc74cSSteve Capper 		return (pte_t *)pmd;
48749b19df29SPunit Agrawal 
48759b19df29SPunit Agrawal 	return NULL;
48769e5fc74cSSteve Capper }
48779e5fc74cSSteve Capper 
487861f77edaSNaoya Horiguchi #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
487961f77edaSNaoya Horiguchi 
488061f77edaSNaoya Horiguchi /*
488161f77edaSNaoya Horiguchi  * These functions can be overridden if your architecture needs its own
488261f77edaSNaoya Horiguchi  * behavior.
488361f77edaSNaoya Horiguchi  */
488461f77edaSNaoya Horiguchi struct page * __weak
488561f77edaSNaoya Horiguchi follow_huge_addr(struct mm_struct *mm, unsigned long address,
488661f77edaSNaoya Horiguchi 			      int write)
488761f77edaSNaoya Horiguchi {
488861f77edaSNaoya Horiguchi 	return ERR_PTR(-EINVAL);
488961f77edaSNaoya Horiguchi }
489061f77edaSNaoya Horiguchi 
489161f77edaSNaoya Horiguchi struct page * __weak
48924dc71451SAneesh Kumar K.V follow_huge_pd(struct vm_area_struct *vma,
48934dc71451SAneesh Kumar K.V 	       unsigned long address, hugepd_t hpd, int flags, int pdshift)
48944dc71451SAneesh Kumar K.V {
48954dc71451SAneesh Kumar K.V 	WARN(1, "hugepd follow called with no support for hugepage directory format\n");
48964dc71451SAneesh Kumar K.V 	return NULL;
48974dc71451SAneesh Kumar K.V }
48984dc71451SAneesh Kumar K.V 
48994dc71451SAneesh Kumar K.V struct page * __weak
49009e5fc74cSSteve Capper follow_huge_pmd(struct mm_struct *mm, unsigned long address,
4901e66f17ffSNaoya Horiguchi 		pmd_t *pmd, int flags)
49029e5fc74cSSteve Capper {
4903e66f17ffSNaoya Horiguchi 	struct page *page = NULL;
4904e66f17ffSNaoya Horiguchi 	spinlock_t *ptl;
4905c9d398faSNaoya Horiguchi 	pte_t pte;
4906e66f17ffSNaoya Horiguchi retry:
4907e66f17ffSNaoya Horiguchi 	ptl = pmd_lockptr(mm, pmd);
4908e66f17ffSNaoya Horiguchi 	spin_lock(ptl);
4909e66f17ffSNaoya Horiguchi 	/*
4910e66f17ffSNaoya Horiguchi 	 * Make sure that the address range covered by this pmd is not
4911e66f17ffSNaoya Horiguchi 	 * unmapped by other threads while we look at it.
4912e66f17ffSNaoya Horiguchi 	 */
4913e66f17ffSNaoya Horiguchi 	if (!pmd_huge(*pmd))
4914e66f17ffSNaoya Horiguchi 		goto out;
4915c9d398faSNaoya Horiguchi 	pte = huge_ptep_get((pte_t *)pmd);
4916c9d398faSNaoya Horiguchi 	if (pte_present(pte)) {
491797534127SGerald Schaefer 		page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
4918e66f17ffSNaoya Horiguchi 		if (flags & FOLL_GET)
4919e66f17ffSNaoya Horiguchi 			get_page(page);
4920e66f17ffSNaoya Horiguchi 	} else {
4921c9d398faSNaoya Horiguchi 		if (is_hugetlb_entry_migration(pte)) {
4922e66f17ffSNaoya Horiguchi 			spin_unlock(ptl);
4923e66f17ffSNaoya Horiguchi 			__migration_entry_wait(mm, (pte_t *)pmd, ptl);
4924e66f17ffSNaoya Horiguchi 			goto retry;
4925e66f17ffSNaoya Horiguchi 		}
4926e66f17ffSNaoya Horiguchi 		/*
4927e66f17ffSNaoya Horiguchi 		 * hwpoisoned entry is treated as no_page_table in
4928e66f17ffSNaoya Horiguchi 		 * follow_page_mask().
4929e66f17ffSNaoya Horiguchi 		 */
4930e66f17ffSNaoya Horiguchi 	}
4931e66f17ffSNaoya Horiguchi out:
4932e66f17ffSNaoya Horiguchi 	spin_unlock(ptl);
49339e5fc74cSSteve Capper 	return page;
49349e5fc74cSSteve Capper }
49359e5fc74cSSteve Capper 
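/*
 * follow_huge_pud() and follow_huge_pgd() below return the page for a
 * hugepage mapped at the PUD or PGD level.  FOLL_GET requests are not
 * supported at these levels and simply return NULL.
 */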
493661f77edaSNaoya Horiguchi struct page * __weak
49379e5fc74cSSteve Capper follow_huge_pud(struct mm_struct *mm, unsigned long address,
4938e66f17ffSNaoya Horiguchi 		pud_t *pud, int flags)
49399e5fc74cSSteve Capper {
4940e66f17ffSNaoya Horiguchi 	if (flags & FOLL_GET)
4941e66f17ffSNaoya Horiguchi 		return NULL;
49429e5fc74cSSteve Capper 
4943e66f17ffSNaoya Horiguchi 	return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
49449e5fc74cSSteve Capper }
49459e5fc74cSSteve Capper 
4946faaa5b62SAnshuman Khandual struct page * __weak
4947faaa5b62SAnshuman Khandual follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
4948faaa5b62SAnshuman Khandual {
4949faaa5b62SAnshuman Khandual 	if (flags & FOLL_GET)
4950faaa5b62SAnshuman Khandual 		return NULL;
4951faaa5b62SAnshuman Khandual 
4952faaa5b62SAnshuman Khandual 	return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
4953faaa5b62SAnshuman Khandual }
4954faaa5b62SAnshuman Khandual 
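/*
 * Isolate a huge page for migration: if the page is active and a reference
 * can be taken, clear its active flag and move it from the hstate's active
 * list onto @list.  Returns true on success, false if the page could not be
 * isolated.
 */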
495531caf665SNaoya Horiguchi bool isolate_huge_page(struct page *page, struct list_head *list)
495631caf665SNaoya Horiguchi {
4957bcc54222SNaoya Horiguchi 	bool ret = true;
4958bcc54222SNaoya Horiguchi 
4959309381feSSasha Levin 	VM_BUG_ON_PAGE(!PageHead(page), page);
496031caf665SNaoya Horiguchi 	spin_lock(&hugetlb_lock);
4961bcc54222SNaoya Horiguchi 	if (!page_huge_active(page) || !get_page_unless_zero(page)) {
4962bcc54222SNaoya Horiguchi 		ret = false;
4963bcc54222SNaoya Horiguchi 		goto unlock;
4964bcc54222SNaoya Horiguchi 	}
4965bcc54222SNaoya Horiguchi 	clear_page_huge_active(page);
496631caf665SNaoya Horiguchi 	list_move_tail(&page->lru, list);
4967bcc54222SNaoya Horiguchi unlock:
496831caf665SNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
4969bcc54222SNaoya Horiguchi 	return ret;
497031caf665SNaoya Horiguchi }
497131caf665SNaoya Horiguchi 
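/*
 * Undo isolate_huge_page(): mark the page active again, move it back onto
 * its hstate's active list and drop the reference taken at isolation time.
 */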
497231caf665SNaoya Horiguchi void putback_active_hugepage(struct page *page)
497331caf665SNaoya Horiguchi {
4974309381feSSasha Levin 	VM_BUG_ON_PAGE(!PageHead(page), page);
497531caf665SNaoya Horiguchi 	spin_lock(&hugetlb_lock);
4976bcc54222SNaoya Horiguchi 	set_page_huge_active(page);
497731caf665SNaoya Horiguchi 	list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
497831caf665SNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
497931caf665SNaoya Horiguchi 	put_page(page);
498031caf665SNaoya Horiguchi }
4981ab5ac90aSMichal Hocko 
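/*
 * Transfer hugetlb-specific state from @oldpage to @newpage after a
 * migration copy: the cgroup charge, the page owner migrate reason, and,
 * for temporary pages, the PageHugeTemporary flag together with the
 * per-node surplus accounting.
 */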
4982ab5ac90aSMichal Hocko void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
4983ab5ac90aSMichal Hocko {
4984ab5ac90aSMichal Hocko 	struct hstate *h = page_hstate(oldpage);
4985ab5ac90aSMichal Hocko 
4986ab5ac90aSMichal Hocko 	hugetlb_cgroup_migrate(oldpage, newpage);
4987ab5ac90aSMichal Hocko 	set_page_owner_migrate_reason(newpage, reason);
4988ab5ac90aSMichal Hocko 
4989ab5ac90aSMichal Hocko 	/*
4990ab5ac90aSMichal Hocko 	 * Transfer the temporary state of the new huge page. This is the
4991ab5ac90aSMichal Hocko 	 * reverse of other transitions because the newpage is going to
4992ab5ac90aSMichal Hocko 	 * be final while the old one will be freed, so it takes over
4993ab5ac90aSMichal Hocko 	 * the temporary status.
4994ab5ac90aSMichal Hocko 	 *
4995ab5ac90aSMichal Hocko 	 * Also note that we have to transfer the per-node surplus state
4996ab5ac90aSMichal Hocko 	 * here as well, otherwise the global surplus count will not match
4997ab5ac90aSMichal Hocko 	 * the per-node counts.
4998ab5ac90aSMichal Hocko 	 */
4999ab5ac90aSMichal Hocko 	if (PageHugeTemporary(newpage)) {
5000ab5ac90aSMichal Hocko 		int old_nid = page_to_nid(oldpage);
5001ab5ac90aSMichal Hocko 		int new_nid = page_to_nid(newpage);
5002ab5ac90aSMichal Hocko 
5003ab5ac90aSMichal Hocko 		SetPageHugeTemporary(oldpage);
5004ab5ac90aSMichal Hocko 		ClearPageHugeTemporary(newpage);
5005ab5ac90aSMichal Hocko 
5006ab5ac90aSMichal Hocko 		spin_lock(&hugetlb_lock);
5007ab5ac90aSMichal Hocko 		if (h->surplus_huge_pages_node[old_nid]) {
5008ab5ac90aSMichal Hocko 			h->surplus_huge_pages_node[old_nid]--;
5009ab5ac90aSMichal Hocko 			h->surplus_huge_pages_node[new_nid]++;
5010ab5ac90aSMichal Hocko 		}
5011ab5ac90aSMichal Hocko 		spin_unlock(&hugetlb_lock);
5012ab5ac90aSMichal Hocko 	}
5013ab5ac90aSMichal Hocko }
5014