xref: /openbmc/linux/mm/hugetlb.c (revision 5e911373)
11da177e4SLinus Torvalds /*
21da177e4SLinus Torvalds  * Generic hugetlb support.
36d49e352SNadia Yvette Chambers  * (C) Nadia Yvette Chambers, April 2004
41da177e4SLinus Torvalds  */
51da177e4SLinus Torvalds #include <linux/list.h>
61da177e4SLinus Torvalds #include <linux/init.h>
71da177e4SLinus Torvalds #include <linux/module.h>
81da177e4SLinus Torvalds #include <linux/mm.h>
9e1759c21SAlexey Dobriyan #include <linux/seq_file.h>
101da177e4SLinus Torvalds #include <linux/sysctl.h>
111da177e4SLinus Torvalds #include <linux/highmem.h>
12cddb8a5cSAndrea Arcangeli #include <linux/mmu_notifier.h>
131da177e4SLinus Torvalds #include <linux/nodemask.h>
1463551ae0SDavid Gibson #include <linux/pagemap.h>
155da7ca86SChristoph Lameter #include <linux/mempolicy.h>
163b32123dSGideon Israel Dsouza #include <linux/compiler.h>
17aea47ff3SChristoph Lameter #include <linux/cpuset.h>
183935baa9SDavid Gibson #include <linux/mutex.h>
19aa888a74SAndi Kleen #include <linux/bootmem.h>
20a3437870SNishanth Aravamudan #include <linux/sysfs.h>
215a0e3ad6STejun Heo #include <linux/slab.h>
220fe6e20bSNaoya Horiguchi #include <linux/rmap.h>
23fd6a03edSNaoya Horiguchi #include <linux/swap.h>
24fd6a03edSNaoya Horiguchi #include <linux/swapops.h>
25c8721bbbSNaoya Horiguchi #include <linux/page-isolation.h>
268382d914SDavidlohr Bueso #include <linux/jhash.h>
27d6606683SLinus Torvalds 
2863551ae0SDavid Gibson #include <asm/page.h>
2963551ae0SDavid Gibson #include <asm/pgtable.h>
3024669e58SAneesh Kumar K.V #include <asm/tlb.h>
3163551ae0SDavid Gibson 
3224669e58SAneesh Kumar K.V #include <linux/io.h>
3363551ae0SDavid Gibson #include <linux/hugetlb.h>
349dd540e2SAneesh Kumar K.V #include <linux/hugetlb_cgroup.h>
359a305230SLee Schermerhorn #include <linux/node.h>
367835e98bSNick Piggin #include "internal.h"
371da177e4SLinus Torvalds 
38753162cdSAndrey Ryabinin int hugepages_treat_as_movable;
39a5516438SAndi Kleen 
40c3f38a38SAneesh Kumar K.V int hugetlb_max_hstate __read_mostly;
41e5ff2159SAndi Kleen unsigned int default_hstate_idx;
42e5ff2159SAndi Kleen struct hstate hstates[HUGE_MAX_HSTATE];
43641844f5SNaoya Horiguchi /*
44641844f5SNaoya Horiguchi  * Minimum page order among possible hugepage sizes, set to a proper value
45641844f5SNaoya Horiguchi  * at boot time.
46641844f5SNaoya Horiguchi  */
47641844f5SNaoya Horiguchi static unsigned int minimum_order __read_mostly = UINT_MAX;
48e5ff2159SAndi Kleen 
4953ba51d2SJon Tollefson __initdata LIST_HEAD(huge_boot_pages);
5053ba51d2SJon Tollefson 
51e5ff2159SAndi Kleen /* for command line parsing */
52e5ff2159SAndi Kleen static struct hstate * __initdata parsed_hstate;
53e5ff2159SAndi Kleen static unsigned long __initdata default_hstate_max_huge_pages;
54e11bfbfcSNick Piggin static unsigned long __initdata default_hstate_size;
55e5ff2159SAndi Kleen 
563935baa9SDavid Gibson /*
5731caf665SNaoya Horiguchi  * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
5831caf665SNaoya Horiguchi  * free_huge_pages, and surplus_huge_pages.
593935baa9SDavid Gibson  */
60c3f38a38SAneesh Kumar K.V DEFINE_SPINLOCK(hugetlb_lock);
610bd0f9fbSEric Paris 
628382d914SDavidlohr Bueso /*
638382d914SDavidlohr Bueso  * Serializes faults on the same logical page.  This is used to
648382d914SDavidlohr Bueso  * prevent spurious OOMs when the hugepage pool is fully utilized.
658382d914SDavidlohr Bueso  */
668382d914SDavidlohr Bueso static int num_fault_mutexes;
678382d914SDavidlohr Bueso static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
688382d914SDavidlohr Bueso 
697ca02d0aSMike Kravetz /* Forward declaration */
707ca02d0aSMike Kravetz static int hugetlb_acct_memory(struct hstate *h, long delta);
717ca02d0aSMike Kravetz 
7290481622SDavid Gibson static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
7390481622SDavid Gibson {
7490481622SDavid Gibson 	bool free = (spool->count == 0) && (spool->used_hpages == 0);
7590481622SDavid Gibson 
7690481622SDavid Gibson 	spin_unlock(&spool->lock);
7790481622SDavid Gibson 
7890481622SDavid Gibson 	/* If no pages are used, and no other handles to the subpool
797ca02d0aSMike Kravetz 	 * remain, give up any reservations based on minimum size and
807ca02d0aSMike Kravetz 	 * free the subpool */
817ca02d0aSMike Kravetz 	if (free) {
827ca02d0aSMike Kravetz 		if (spool->min_hpages != -1)
837ca02d0aSMike Kravetz 			hugetlb_acct_memory(spool->hstate,
847ca02d0aSMike Kravetz 						-spool->min_hpages);
8590481622SDavid Gibson 		kfree(spool);
8690481622SDavid Gibson 	}
877ca02d0aSMike Kravetz }
8890481622SDavid Gibson 
897ca02d0aSMike Kravetz struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
907ca02d0aSMike Kravetz 						long min_hpages)
9190481622SDavid Gibson {
9290481622SDavid Gibson 	struct hugepage_subpool *spool;
9390481622SDavid Gibson 
94c6a91820SMike Kravetz 	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
9590481622SDavid Gibson 	if (!spool)
9690481622SDavid Gibson 		return NULL;
9790481622SDavid Gibson 
9890481622SDavid Gibson 	spin_lock_init(&spool->lock);
9990481622SDavid Gibson 	spool->count = 1;
1007ca02d0aSMike Kravetz 	spool->max_hpages = max_hpages;
1017ca02d0aSMike Kravetz 	spool->hstate = h;
1027ca02d0aSMike Kravetz 	spool->min_hpages = min_hpages;
1037ca02d0aSMike Kravetz 
1047ca02d0aSMike Kravetz 	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
1057ca02d0aSMike Kravetz 		kfree(spool);
1067ca02d0aSMike Kravetz 		return NULL;
1077ca02d0aSMike Kravetz 	}
1087ca02d0aSMike Kravetz 	spool->rsv_hpages = min_hpages;
10990481622SDavid Gibson 
11090481622SDavid Gibson 	return spool;
11190481622SDavid Gibson }
11290481622SDavid Gibson 
11390481622SDavid Gibson void hugepage_put_subpool(struct hugepage_subpool *spool)
11490481622SDavid Gibson {
11590481622SDavid Gibson 	spin_lock(&spool->lock);
11690481622SDavid Gibson 	BUG_ON(!spool->count);
11790481622SDavid Gibson 	spool->count--;
11890481622SDavid Gibson 	unlock_or_release_subpool(spool);
11990481622SDavid Gibson }
12090481622SDavid Gibson 
1211c5ecae3SMike Kravetz /*
1221c5ecae3SMike Kravetz  * Subpool accounting for allocating and reserving pages.
1231c5ecae3SMike Kravetz  * Return -ENOMEM if there are not enough resources to satisfy
1241c5ecae3SMike Kravetz  * the request.  Otherwise, return the number of pages by which the
1251c5ecae3SMike Kravetz  * global pools must be adjusted (upward).  The returned value may
1261c5ecae3SMike Kravetz  * only be different than the passed value (delta) in the case where
1271c5ecae3SMike Kravetz  * a subpool minimum size must be maintained.
1281c5ecae3SMike Kravetz  */
1291c5ecae3SMike Kravetz static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
13090481622SDavid Gibson 				      long delta)
13190481622SDavid Gibson {
1321c5ecae3SMike Kravetz 	long ret = delta;
13390481622SDavid Gibson 
13490481622SDavid Gibson 	if (!spool)
1351c5ecae3SMike Kravetz 		return ret;
13690481622SDavid Gibson 
13790481622SDavid Gibson 	spin_lock(&spool->lock);
13890481622SDavid Gibson 
1391c5ecae3SMike Kravetz 	if (spool->max_hpages != -1) {		/* maximum size accounting */
1401c5ecae3SMike Kravetz 		if ((spool->used_hpages + delta) <= spool->max_hpages)
1411c5ecae3SMike Kravetz 			spool->used_hpages += delta;
1421c5ecae3SMike Kravetz 		else {
1431c5ecae3SMike Kravetz 			ret = -ENOMEM;
1441c5ecae3SMike Kravetz 			goto unlock_ret;
1451c5ecae3SMike Kravetz 		}
1461c5ecae3SMike Kravetz 	}
1471c5ecae3SMike Kravetz 
1481c5ecae3SMike Kravetz 	if (spool->min_hpages != -1) {		/* minimum size accounting */
1491c5ecae3SMike Kravetz 		if (delta > spool->rsv_hpages) {
1501c5ecae3SMike Kravetz 			/*
1511c5ecae3SMike Kravetz 			 * Asking for more reserves than those already taken on
1521c5ecae3SMike Kravetz 			 * behalf of the subpool.  Return the difference.
1531c5ecae3SMike Kravetz 			 */
1541c5ecae3SMike Kravetz 			ret = delta - spool->rsv_hpages;
1551c5ecae3SMike Kravetz 			spool->rsv_hpages = 0;
1561c5ecae3SMike Kravetz 		} else {
1571c5ecae3SMike Kravetz 			ret = 0;	/* reserves already accounted for */
1581c5ecae3SMike Kravetz 			spool->rsv_hpages -= delta;
1591c5ecae3SMike Kravetz 		}
1601c5ecae3SMike Kravetz 	}
1611c5ecae3SMike Kravetz 
1621c5ecae3SMike Kravetz unlock_ret:
1631c5ecae3SMike Kravetz 	spin_unlock(&spool->lock);
16490481622SDavid Gibson 	return ret;
16590481622SDavid Gibson }
16690481622SDavid Gibson 
1671c5ecae3SMike Kravetz /*
1681c5ecae3SMike Kravetz  * Subpool accounting for freeing and unreserving pages.
1691c5ecae3SMike Kravetz  * Return the number of global page reservations that must be dropped.
1701c5ecae3SMike Kravetz  * The return value may only be different than the passed value (delta)
1711c5ecae3SMike Kravetz  * in the case where a subpool minimum size must be maintained.
1721c5ecae3SMike Kravetz  */
1731c5ecae3SMike Kravetz static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
17490481622SDavid Gibson 				       long delta)
17590481622SDavid Gibson {
1761c5ecae3SMike Kravetz 	long ret = delta;
1771c5ecae3SMike Kravetz 
17890481622SDavid Gibson 	if (!spool)
1791c5ecae3SMike Kravetz 		return delta;
18090481622SDavid Gibson 
18190481622SDavid Gibson 	spin_lock(&spool->lock);
1821c5ecae3SMike Kravetz 
1831c5ecae3SMike Kravetz 	if (spool->max_hpages != -1)		/* maximum size accounting */
18490481622SDavid Gibson 		spool->used_hpages -= delta;
1851c5ecae3SMike Kravetz 
1861c5ecae3SMike Kravetz 	if (spool->min_hpages != -1) {		/* minimum size accounting */
1871c5ecae3SMike Kravetz 		if (spool->rsv_hpages + delta <= spool->min_hpages)
1881c5ecae3SMike Kravetz 			ret = 0;
1891c5ecae3SMike Kravetz 		else
1901c5ecae3SMike Kravetz 			ret = spool->rsv_hpages + delta - spool->min_hpages;
1911c5ecae3SMike Kravetz 
1921c5ecae3SMike Kravetz 		spool->rsv_hpages += delta;
1931c5ecae3SMike Kravetz 		if (spool->rsv_hpages > spool->min_hpages)
1941c5ecae3SMike Kravetz 			spool->rsv_hpages = spool->min_hpages;
1951c5ecae3SMike Kravetz 	}
1961c5ecae3SMike Kravetz 
1971c5ecae3SMike Kravetz 	/*
1981c5ecae3SMike Kravetz 	 * If hugetlbfs_put_super couldn't free spool due to an outstanding
1991c5ecae3SMike Kravetz 	 * quota reference, free it now.
2001c5ecae3SMike Kravetz 	 */
20190481622SDavid Gibson 	unlock_or_release_subpool(spool);
2021c5ecae3SMike Kravetz 
2031c5ecae3SMike Kravetz 	return ret;
20490481622SDavid Gibson }
20590481622SDavid Gibson 
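/*
 * Illustrative walk-through of the subpool accounting above (not part
 * of the kernel source).  Take a subpool created with max_hpages == -1
 * and min_hpages == 2, so rsv_hpages starts out at 2 and those 2 pages
 * were already charged to the global pool at creation time:
 *
 *	hugepage_subpool_get_pages(spool, 3)
 *		consumes the 2 reserved pages, sets rsv_hpages = 0 and
 *		returns 1, so the caller charges only 1 page globally;
 *	hugepage_subpool_put_pages(spool, 3)
 *		refills rsv_hpages back up to min_hpages (2) and returns
 *		1, so the caller drops only 1 global reservation.
 *
 * Pages covered by the minimum size are therefore accounted against the
 * global pool exactly once, when the subpool is created.
 */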
20690481622SDavid Gibson static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
20790481622SDavid Gibson {
20890481622SDavid Gibson 	return HUGETLBFS_SB(inode->i_sb)->spool;
20990481622SDavid Gibson }
21090481622SDavid Gibson 
21190481622SDavid Gibson static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
21290481622SDavid Gibson {
213496ad9aaSAl Viro 	return subpool_inode(file_inode(vma->vm_file));
21490481622SDavid Gibson }
21590481622SDavid Gibson 
216e7c4b0bfSAndy Whitcroft /*
21796822904SAndy Whitcroft  * Region tracking -- allows tracking of reservations and instantiated pages
21896822904SAndy Whitcroft  *                    across the pages in a mapping.
21984afd99bSAndy Whitcroft  *
2201dd308a7SMike Kravetz  * The region data structures are embedded into a resv_map and protected
2211dd308a7SMike Kravetz  * by a resv_map's lock.  The set of regions within the resv_map represent
2221dd308a7SMike Kravetz  * reservations for huge pages, or huge pages that have already been
2231dd308a7SMike Kravetz  * instantiated within the map.  The from and to elements are huge page
2241dd308a7SMike Kravetz  * indices into the associated mapping.  from indicates the starting index
2251dd308a7SMike Kravetz  * of the region.  to represents the first index past the end of the region.
2261dd308a7SMike Kravetz  *
2271dd308a7SMike Kravetz  * For example, a file region structure with from == 0 and to == 4 represents
2281dd308a7SMike Kravetz  * four huge pages in a mapping.  It is important to note that the to element
2291dd308a7SMike Kravetz  * represents the first element past the end of the region. This is used in
2301dd308a7SMike Kravetz  * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
2311dd308a7SMike Kravetz  *
2321dd308a7SMike Kravetz  * Interval notation of the form [from, to) will be used to indicate that
2331dd308a7SMike Kravetz  * the endpoint from is inclusive and to is exclusive.
23496822904SAndy Whitcroft  */
23596822904SAndy Whitcroft struct file_region {
23696822904SAndy Whitcroft 	struct list_head link;
23796822904SAndy Whitcroft 	long from;
23896822904SAndy Whitcroft 	long to;
23996822904SAndy Whitcroft };
24096822904SAndy Whitcroft 
2411dd308a7SMike Kravetz /*
2421dd308a7SMike Kravetz  * Add the huge page range represented by [f, t) to the reserve
2435e911373SMike Kravetz  * map.  In the normal case, existing regions will be expanded
2445e911373SMike Kravetz  * to accommodate the specified range.  Sufficient regions should
2455e911373SMike Kravetz  * exist for expansion due to the previous call to region_chg
2465e911373SMike Kravetz  * with the same range.  However, it is possible that region_del
2475e911373SMike Kravetz  * could have been called after region_chg and modified the map
2485e911373SMike Kravetz  * in such a way that no region exists to be expanded.  In this
2495e911373SMike Kravetz  * case, pull a region descriptor from the cache associated with
2505e911373SMike Kravetz  * the map and use that for the new range.
251cf3ad20bSMike Kravetz  *
252cf3ad20bSMike Kravetz  * Return the number of new huge pages added to the map.  This
253cf3ad20bSMike Kravetz  * number is greater than or equal to zero.
2541dd308a7SMike Kravetz  */
2551406ec9bSJoonsoo Kim static long region_add(struct resv_map *resv, long f, long t)
25696822904SAndy Whitcroft {
2571406ec9bSJoonsoo Kim 	struct list_head *head = &resv->regions;
25896822904SAndy Whitcroft 	struct file_region *rg, *nrg, *trg;
259cf3ad20bSMike Kravetz 	long add = 0;
26096822904SAndy Whitcroft 
2617b24d861SDavidlohr Bueso 	spin_lock(&resv->lock);
26296822904SAndy Whitcroft 	/* Locate the region we are either in or before. */
26396822904SAndy Whitcroft 	list_for_each_entry(rg, head, link)
26496822904SAndy Whitcroft 		if (f <= rg->to)
26596822904SAndy Whitcroft 			break;
26696822904SAndy Whitcroft 
2675e911373SMike Kravetz 	/*
2685e911373SMike Kravetz 	 * If no region exists which can be expanded to include the
2695e911373SMike Kravetz 	 * specified range, the list must have been modified by an
2705e911373SMike Kravetz 	 * interleaving call to region_del().  Pull a region descriptor
2715e911373SMike Kravetz 	 * from the cache and use it for this range.
2725e911373SMike Kravetz 	 */
2735e911373SMike Kravetz 	if (&rg->link == head || t < rg->from) {
2745e911373SMike Kravetz 		VM_BUG_ON(resv->region_cache_count <= 0);
2755e911373SMike Kravetz 
2765e911373SMike Kravetz 		resv->region_cache_count--;
2775e911373SMike Kravetz 		nrg = list_first_entry(&resv->region_cache, struct file_region,
2785e911373SMike Kravetz 					link);
2795e911373SMike Kravetz 		list_del(&nrg->link);
2805e911373SMike Kravetz 
2815e911373SMike Kravetz 		nrg->from = f;
2825e911373SMike Kravetz 		nrg->to = t;
2835e911373SMike Kravetz 		list_add(&nrg->link, rg->link.prev);
2845e911373SMike Kravetz 
2855e911373SMike Kravetz 		add += t - f;
2865e911373SMike Kravetz 		goto out_locked;
2875e911373SMike Kravetz 	}
2885e911373SMike Kravetz 
28996822904SAndy Whitcroft 	/* Round our left edge to the current segment if it encloses us. */
29096822904SAndy Whitcroft 	if (f > rg->from)
29196822904SAndy Whitcroft 		f = rg->from;
29296822904SAndy Whitcroft 
29396822904SAndy Whitcroft 	/* Check for and consume any regions we now overlap with. */
29496822904SAndy Whitcroft 	nrg = rg;
29596822904SAndy Whitcroft 	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
29696822904SAndy Whitcroft 		if (&rg->link == head)
29796822904SAndy Whitcroft 			break;
29896822904SAndy Whitcroft 		if (rg->from > t)
29996822904SAndy Whitcroft 			break;
30096822904SAndy Whitcroft 
30196822904SAndy Whitcroft 		/* If this area reaches higher then extend our area to
30296822904SAndy Whitcroft 		 * include it completely.  If this is not the first area
30396822904SAndy Whitcroft 		 * which we intend to reuse, free it. */
30496822904SAndy Whitcroft 		if (rg->to > t)
30596822904SAndy Whitcroft 			t = rg->to;
30696822904SAndy Whitcroft 		if (rg != nrg) {
307cf3ad20bSMike Kravetz 			/* Decrement return value by the deleted range.
308cf3ad20bSMike Kravetz 			 * Another range will span this area so that by
309cf3ad20bSMike Kravetz 			 * end of routine add will be >= zero
310cf3ad20bSMike Kravetz 			 */
311cf3ad20bSMike Kravetz 			add -= (rg->to - rg->from);
31296822904SAndy Whitcroft 			list_del(&rg->link);
31396822904SAndy Whitcroft 			kfree(rg);
31496822904SAndy Whitcroft 		}
31596822904SAndy Whitcroft 	}
316cf3ad20bSMike Kravetz 
317cf3ad20bSMike Kravetz 	add += (nrg->from - f);		/* Added to beginning of region */
31896822904SAndy Whitcroft 	nrg->from = f;
319cf3ad20bSMike Kravetz 	add += t - nrg->to;		/* Added to end of region */
32096822904SAndy Whitcroft 	nrg->to = t;
321cf3ad20bSMike Kravetz 
3225e911373SMike Kravetz out_locked:
3235e911373SMike Kravetz 	resv->adds_in_progress--;
3247b24d861SDavidlohr Bueso 	spin_unlock(&resv->lock);
325cf3ad20bSMike Kravetz 	VM_BUG_ON(add < 0);
326cf3ad20bSMike Kravetz 	return add;
32796822904SAndy Whitcroft }
32896822904SAndy Whitcroft 
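/*
 * Worked example for region_add() above (illustrative, not part of the
 * kernel source).  With existing regions [0, 2) and [4, 6), a call to
 * region_add(resv, 1, 5) proceeds as follows:
 *
 *	- [0, 2) is the first region with f <= rg->to, so it is reused
 *	  and f is rounded down to its left edge (0);
 *	- [4, 6) overlaps the extended range, so t grows to 6, the region
 *	  is deleted and its 2 pages are subtracted from the running add;
 *	- the surviving region becomes [0, 6) and the function returns 2,
 *	  the pages at indices 2 and 3 that are newly covered.
 */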
3291dd308a7SMike Kravetz /*
3301dd308a7SMike Kravetz  * Examine the existing reserve map and determine how many
3311dd308a7SMike Kravetz  * huge pages in the specified range [f, t) are NOT currently
3321dd308a7SMike Kravetz  * represented.  This routine is called before a subsequent
3331dd308a7SMike Kravetz  * call to region_add that will actually modify the reserve
3341dd308a7SMike Kravetz  * map to add the specified range [f, t).  region_chg does
3351dd308a7SMike Kravetz  * not change the number of huge pages represented by the
3361dd308a7SMike Kravetz  * map.  However, if the existing regions in the map can not
3371dd308a7SMike Kravetz  * be expanded to represent the new range, a new file_region
3381dd308a7SMike Kravetz  * structure is added to the map as a placeholder.  This is
3391dd308a7SMike Kravetz  * so that the subsequent region_add call will have all the
3401dd308a7SMike Kravetz  * regions it needs and will not fail.
3411dd308a7SMike Kravetz  *
3425e911373SMike Kravetz  * Upon entry, region_chg will also examine the cache of region descriptors
3435e911373SMike Kravetz  * associated with the map.  If there are not enough descriptors cached, one
3445e911373SMike Kravetz  * will be allocated for the in progress add operation.
3455e911373SMike Kravetz  *
3465e911373SMike Kravetz  * Returns the number of huge pages that need to be added to the existing
3475e911373SMike Kravetz  * reservation map for the range [f, t).  This number is greater or equal to
3485e911373SMike Kravetz  * zero.  -ENOMEM is returned if a new file_region structure or cache entry
3495e911373SMike Kravetz  * is needed and can not be allocated.
3501dd308a7SMike Kravetz  */
3511406ec9bSJoonsoo Kim static long region_chg(struct resv_map *resv, long f, long t)
35296822904SAndy Whitcroft {
3531406ec9bSJoonsoo Kim 	struct list_head *head = &resv->regions;
3547b24d861SDavidlohr Bueso 	struct file_region *rg, *nrg = NULL;
35596822904SAndy Whitcroft 	long chg = 0;
35696822904SAndy Whitcroft 
3577b24d861SDavidlohr Bueso retry:
3587b24d861SDavidlohr Bueso 	spin_lock(&resv->lock);
3595e911373SMike Kravetz retry_locked:
3605e911373SMike Kravetz 	resv->adds_in_progress++;
3615e911373SMike Kravetz 
3625e911373SMike Kravetz 	/*
3635e911373SMike Kravetz 	 * Check for sufficient descriptors in the cache to accommodate
3645e911373SMike Kravetz 	 * the number of in progress add operations.
3655e911373SMike Kravetz 	 */
3665e911373SMike Kravetz 	if (resv->adds_in_progress > resv->region_cache_count) {
3675e911373SMike Kravetz 		struct file_region *trg;
3685e911373SMike Kravetz 
3695e911373SMike Kravetz 		VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
3705e911373SMike Kravetz 		/* Must drop lock to allocate a new descriptor. */
3715e911373SMike Kravetz 		resv->adds_in_progress--;
3725e911373SMike Kravetz 		spin_unlock(&resv->lock);
3735e911373SMike Kravetz 
3745e911373SMike Kravetz 		trg = kmalloc(sizeof(*trg), GFP_KERNEL);
3755e911373SMike Kravetz 		if (!trg)
3765e911373SMike Kravetz 			return -ENOMEM;
3775e911373SMike Kravetz 
3785e911373SMike Kravetz 		spin_lock(&resv->lock);
3795e911373SMike Kravetz 		list_add(&trg->link, &resv->region_cache);
3805e911373SMike Kravetz 		resv->region_cache_count++;
3815e911373SMike Kravetz 		goto retry_locked;
3825e911373SMike Kravetz 	}
3835e911373SMike Kravetz 
38496822904SAndy Whitcroft 	/* Locate the region we are before or in. */
38596822904SAndy Whitcroft 	list_for_each_entry(rg, head, link)
38696822904SAndy Whitcroft 		if (f <= rg->to)
38796822904SAndy Whitcroft 			break;
38896822904SAndy Whitcroft 
38996822904SAndy Whitcroft 	/* If we are below the current region then a new region is required.
39096822904SAndy Whitcroft 	 * Subtle: allocate a new region at the position but make it zero
39196822904SAndy Whitcroft 	 * size such that we can guarantee to record the reservation. */
39296822904SAndy Whitcroft 	if (&rg->link == head || t < rg->from) {
3937b24d861SDavidlohr Bueso 		if (!nrg) {
3945e911373SMike Kravetz 			resv->adds_in_progress--;
3957b24d861SDavidlohr Bueso 			spin_unlock(&resv->lock);
39696822904SAndy Whitcroft 			nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
39796822904SAndy Whitcroft 			if (!nrg)
39896822904SAndy Whitcroft 				return -ENOMEM;
3997b24d861SDavidlohr Bueso 
40096822904SAndy Whitcroft 			nrg->from = f;
40196822904SAndy Whitcroft 			nrg->to   = f;
40296822904SAndy Whitcroft 			INIT_LIST_HEAD(&nrg->link);
4037b24d861SDavidlohr Bueso 			goto retry;
4047b24d861SDavidlohr Bueso 		}
40596822904SAndy Whitcroft 
4067b24d861SDavidlohr Bueso 		list_add(&nrg->link, rg->link.prev);
4077b24d861SDavidlohr Bueso 		chg = t - f;
4087b24d861SDavidlohr Bueso 		goto out_nrg;
40996822904SAndy Whitcroft 	}
41096822904SAndy Whitcroft 
41196822904SAndy Whitcroft 	/* Round our left edge to the current segment if it encloses us. */
41296822904SAndy Whitcroft 	if (f > rg->from)
41396822904SAndy Whitcroft 		f = rg->from;
41496822904SAndy Whitcroft 	chg = t - f;
41596822904SAndy Whitcroft 
41696822904SAndy Whitcroft 	/* Check for and consume any regions we now overlap with. */
41796822904SAndy Whitcroft 	list_for_each_entry(rg, rg->link.prev, link) {
41896822904SAndy Whitcroft 		if (&rg->link == head)
41996822904SAndy Whitcroft 			break;
42096822904SAndy Whitcroft 		if (rg->from > t)
4217b24d861SDavidlohr Bueso 			goto out;
42296822904SAndy Whitcroft 
42325985edcSLucas De Marchi 		/* We overlap with this area; if it extends further than
42496822904SAndy Whitcroft 		 * us then we must extend ourselves.  Account for its
42596822904SAndy Whitcroft 		 * existing reservation. */
42696822904SAndy Whitcroft 		if (rg->to > t) {
42796822904SAndy Whitcroft 			chg += rg->to - t;
42896822904SAndy Whitcroft 			t = rg->to;
42996822904SAndy Whitcroft 		}
43096822904SAndy Whitcroft 		chg -= rg->to - rg->from;
43196822904SAndy Whitcroft 	}
4327b24d861SDavidlohr Bueso 
4337b24d861SDavidlohr Bueso out:
4347b24d861SDavidlohr Bueso 	spin_unlock(&resv->lock);
4357b24d861SDavidlohr Bueso 	/* We already know we raced and no longer need the new region */
4367b24d861SDavidlohr Bueso 	kfree(nrg);
4377b24d861SDavidlohr Bueso 	return chg;
4387b24d861SDavidlohr Bueso out_nrg:
4397b24d861SDavidlohr Bueso 	spin_unlock(&resv->lock);
44096822904SAndy Whitcroft 	return chg;
44196822904SAndy Whitcroft }
44296822904SAndy Whitcroft 
4431dd308a7SMike Kravetz /*
4445e911373SMike Kravetz  * Abort the in progress add operation.  The adds_in_progress field
4455e911373SMike Kravetz  * of the resv_map keeps track of the operations in progress between
4465e911373SMike Kravetz  * calls to region_chg and region_add.  Operations are sometimes
4475e911373SMike Kravetz  * aborted after the call to region_chg.  In such cases, region_abort
4485e911373SMike Kravetz  * is called to decrement the adds_in_progress counter.
4495e911373SMike Kravetz  *
4505e911373SMike Kravetz  * NOTE: The range arguments [f, t) are not needed or used in this
4515e911373SMike Kravetz  * routine.  They are kept to make reading the calling code easier as
4525e911373SMike Kravetz  * arguments will match the associated region_chg call.
4535e911373SMike Kravetz  */
4545e911373SMike Kravetz static void region_abort(struct resv_map *resv, long f, long t)
4555e911373SMike Kravetz {
4565e911373SMike Kravetz 	spin_lock(&resv->lock);
4575e911373SMike Kravetz 	VM_BUG_ON(!resv->region_cache_count);
4585e911373SMike Kravetz 	resv->adds_in_progress--;
4595e911373SMike Kravetz 	spin_unlock(&resv->lock);
4605e911373SMike Kravetz }
4615e911373SMike Kravetz 
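/*
 * Illustrative sketch of the expected calling protocol for the region_*
 * routines above (not part of the kernel source).  Reservations are made
 * in two phases so that the committing phase cannot fail:
 *
 *	chg = region_chg(resv, f, t);		(may allocate, may fail)
 *	if (chg < 0)
 *		return chg;
 *	if (<global accounting for chg pages fails>) {
 *		region_abort(resv, f, t);
 *		return -ENOSPC;
 *	}
 *	add = region_add(resv, f, t);		(uses the cached descriptor)
 *
 * region_add() never returns an error because region_chg() guaranteed
 * that a region descriptor is available, either in place in the map or
 * in the resv_map's region cache.
 */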
4625e911373SMike Kravetz /*
4631dd308a7SMike Kravetz  * Truncate the reserve map at index 'end'.  Modify/truncate any
4641dd308a7SMike Kravetz  * region which contains end.  Delete any regions past end.
4651dd308a7SMike Kravetz  * Return the number of huge pages removed from the map.
4661dd308a7SMike Kravetz  */
4671406ec9bSJoonsoo Kim static long region_truncate(struct resv_map *resv, long end)
46896822904SAndy Whitcroft {
4691406ec9bSJoonsoo Kim 	struct list_head *head = &resv->regions;
47096822904SAndy Whitcroft 	struct file_region *rg, *trg;
47196822904SAndy Whitcroft 	long chg = 0;
47296822904SAndy Whitcroft 
4737b24d861SDavidlohr Bueso 	spin_lock(&resv->lock);
47496822904SAndy Whitcroft 	/* Locate the region we are either in or before. */
47596822904SAndy Whitcroft 	list_for_each_entry(rg, head, link)
47696822904SAndy Whitcroft 		if (end <= rg->to)
47796822904SAndy Whitcroft 			break;
47896822904SAndy Whitcroft 	if (&rg->link == head)
4797b24d861SDavidlohr Bueso 		goto out;
48096822904SAndy Whitcroft 
48196822904SAndy Whitcroft 	/* If we are in the middle of a region then adjust it. */
48296822904SAndy Whitcroft 	if (end > rg->from) {
48396822904SAndy Whitcroft 		chg = rg->to - end;
48496822904SAndy Whitcroft 		rg->to = end;
48596822904SAndy Whitcroft 		rg = list_entry(rg->link.next, typeof(*rg), link);
48696822904SAndy Whitcroft 	}
48796822904SAndy Whitcroft 
48896822904SAndy Whitcroft 	/* Drop any remaining regions. */
48996822904SAndy Whitcroft 	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
49096822904SAndy Whitcroft 		if (&rg->link == head)
49196822904SAndy Whitcroft 			break;
49296822904SAndy Whitcroft 		chg += rg->to - rg->from;
49396822904SAndy Whitcroft 		list_del(&rg->link);
49496822904SAndy Whitcroft 		kfree(rg);
49596822904SAndy Whitcroft 	}
4967b24d861SDavidlohr Bueso 
4977b24d861SDavidlohr Bueso out:
4987b24d861SDavidlohr Bueso 	spin_unlock(&resv->lock);
49996822904SAndy Whitcroft 	return chg;
50096822904SAndy Whitcroft }
50196822904SAndy Whitcroft 
5021dd308a7SMike Kravetz /*
5031dd308a7SMike Kravetz  * Count and return the number of huge pages in the reserve map
5041dd308a7SMike Kravetz  * that intersect with the range [f, t).
5051dd308a7SMike Kravetz  */
5061406ec9bSJoonsoo Kim static long region_count(struct resv_map *resv, long f, long t)
50784afd99bSAndy Whitcroft {
5081406ec9bSJoonsoo Kim 	struct list_head *head = &resv->regions;
50984afd99bSAndy Whitcroft 	struct file_region *rg;
51084afd99bSAndy Whitcroft 	long chg = 0;
51184afd99bSAndy Whitcroft 
5127b24d861SDavidlohr Bueso 	spin_lock(&resv->lock);
51384afd99bSAndy Whitcroft 	/* Locate each segment we overlap with, and count that overlap. */
51484afd99bSAndy Whitcroft 	list_for_each_entry(rg, head, link) {
515f2135a4aSWang Sheng-Hui 		long seg_from;
516f2135a4aSWang Sheng-Hui 		long seg_to;
51784afd99bSAndy Whitcroft 
51884afd99bSAndy Whitcroft 		if (rg->to <= f)
51984afd99bSAndy Whitcroft 			continue;
52084afd99bSAndy Whitcroft 		if (rg->from >= t)
52184afd99bSAndy Whitcroft 			break;
52284afd99bSAndy Whitcroft 
52384afd99bSAndy Whitcroft 		seg_from = max(rg->from, f);
52484afd99bSAndy Whitcroft 		seg_to = min(rg->to, t);
52584afd99bSAndy Whitcroft 
52684afd99bSAndy Whitcroft 		chg += seg_to - seg_from;
52784afd99bSAndy Whitcroft 	}
5287b24d861SDavidlohr Bueso 	spin_unlock(&resv->lock);
52984afd99bSAndy Whitcroft 
53084afd99bSAndy Whitcroft 	return chg;
53184afd99bSAndy Whitcroft }
53284afd99bSAndy Whitcroft 
53396822904SAndy Whitcroft /*
534e7c4b0bfSAndy Whitcroft  * Convert the address within this vma to the page offset within
535e7c4b0bfSAndy Whitcroft  * the mapping, in pagecache page units; huge pages here.
536e7c4b0bfSAndy Whitcroft  */
537a5516438SAndi Kleen static pgoff_t vma_hugecache_offset(struct hstate *h,
538a5516438SAndi Kleen 			struct vm_area_struct *vma, unsigned long address)
539e7c4b0bfSAndy Whitcroft {
540a5516438SAndi Kleen 	return ((address - vma->vm_start) >> huge_page_shift(h)) +
541a5516438SAndi Kleen 			(vma->vm_pgoff >> huge_page_order(h));
542e7c4b0bfSAndy Whitcroft }
543e7c4b0bfSAndy Whitcroft 
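/*
 * Worked example for the index calculation above (illustrative only,
 * assuming 2MB huge pages on a 4KB base page size, i.e.
 * huge_page_shift() == 21 and huge_page_order() == 9):
 *
 *	vma->vm_start = 0x600000000000, vma->vm_pgoff = 1024 (4MB into
 *	the file in PAGE_SIZE units), address = vm_start + 0x400000:
 *
 *	(0x400000 >> 21) + (1024 >> 9) = 2 + 2 = 4
 *
 * i.e. the faulting address maps to the fifth huge page of the file.
 */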
5440fe6e20bSNaoya Horiguchi pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
5450fe6e20bSNaoya Horiguchi 				     unsigned long address)
5460fe6e20bSNaoya Horiguchi {
5470fe6e20bSNaoya Horiguchi 	return vma_hugecache_offset(hstate_vma(vma), vma, address);
5480fe6e20bSNaoya Horiguchi }
5490fe6e20bSNaoya Horiguchi 
55084afd99bSAndy Whitcroft /*
55108fba699SMel Gorman  * Return the size of the pages allocated when backing a VMA. In the majority
55308fba699SMel Gorman  * of cases this will be the same size as that used by the page table entries.
55308fba699SMel Gorman  */
55408fba699SMel Gorman unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
55508fba699SMel Gorman {
55608fba699SMel Gorman 	struct hstate *hstate;
55708fba699SMel Gorman 
55808fba699SMel Gorman 	if (!is_vm_hugetlb_page(vma))
55908fba699SMel Gorman 		return PAGE_SIZE;
56008fba699SMel Gorman 
56108fba699SMel Gorman 	hstate = hstate_vma(vma);
56208fba699SMel Gorman 
5632415cf12SWanpeng Li 	return 1UL << huge_page_shift(hstate);
56408fba699SMel Gorman }
565f340ca0fSJoerg Roedel EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
56608fba699SMel Gorman 
56708fba699SMel Gorman /*
5683340289dSMel Gorman  * Return the page size being used by the MMU to back a VMA. In the majority
5693340289dSMel Gorman  * of cases, the page size used by the kernel matches the MMU size. On
5703340289dSMel Gorman  * architectures where it differs, an architecture-specific version of this
5713340289dSMel Gorman  * function is required.
5723340289dSMel Gorman  */
5733340289dSMel Gorman #ifndef vma_mmu_pagesize
5743340289dSMel Gorman unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
5753340289dSMel Gorman {
5763340289dSMel Gorman 	return vma_kernel_pagesize(vma);
5773340289dSMel Gorman }
5783340289dSMel Gorman #endif
5793340289dSMel Gorman 
5803340289dSMel Gorman /*
58184afd99bSAndy Whitcroft  * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
58284afd99bSAndy Whitcroft  * bits of the reservation map pointer, which are always clear due to
58384afd99bSAndy Whitcroft  * alignment.
58484afd99bSAndy Whitcroft  */
58584afd99bSAndy Whitcroft #define HPAGE_RESV_OWNER    (1UL << 0)
58684afd99bSAndy Whitcroft #define HPAGE_RESV_UNMAPPED (1UL << 1)
58704f2cbe3SMel Gorman #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
58884afd99bSAndy Whitcroft 
589a1e78772SMel Gorman /*
590a1e78772SMel Gorman  * These helpers are used to track how many pages are reserved for
591a1e78772SMel Gorman  * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
592a1e78772SMel Gorman  * is guaranteed to have its future faults succeed.
593a1e78772SMel Gorman  *
594a1e78772SMel Gorman  * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
595a1e78772SMel Gorman  * the reserve counters are updated with the hugetlb_lock held. It is safe
596a1e78772SMel Gorman  * to reset the VMA at fork() time as it is not in use yet and there is no
597a1e78772SMel Gorman  * chance of the global counters getting corrupted as a result of the values.
59884afd99bSAndy Whitcroft  *
59984afd99bSAndy Whitcroft  * The private mapping reservation is represented in a subtly different
60084afd99bSAndy Whitcroft  * manner to a shared mapping.  A shared mapping has a region map associated
60184afd99bSAndy Whitcroft  * with the underlying file; this region map represents the backing file
60284afd99bSAndy Whitcroft  * pages which have ever had a reservation assigned, and this persists even
60384afd99bSAndy Whitcroft  * after the page is instantiated.  A private mapping has a region map
60484afd99bSAndy Whitcroft  * associated with the original mmap which is attached to all VMAs which
60584afd99bSAndy Whitcroft  * reference it; this region map represents those offsets which have consumed
60684afd99bSAndy Whitcroft  * a reservation, i.e. where pages have been instantiated.
607a1e78772SMel Gorman  */
608e7c4b0bfSAndy Whitcroft static unsigned long get_vma_private_data(struct vm_area_struct *vma)
609e7c4b0bfSAndy Whitcroft {
610e7c4b0bfSAndy Whitcroft 	return (unsigned long)vma->vm_private_data;
611e7c4b0bfSAndy Whitcroft }
612e7c4b0bfSAndy Whitcroft 
613e7c4b0bfSAndy Whitcroft static void set_vma_private_data(struct vm_area_struct *vma,
614e7c4b0bfSAndy Whitcroft 							unsigned long value)
615e7c4b0bfSAndy Whitcroft {
616e7c4b0bfSAndy Whitcroft 	vma->vm_private_data = (void *)value;
617e7c4b0bfSAndy Whitcroft }
618e7c4b0bfSAndy Whitcroft 
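/*
 * Illustrative decoding of vm_private_data for a MAP_PRIVATE hugetlb
 * mapping (not part of the kernel source).  The value packs both the
 * resv_map pointer and the HPAGE_RESV_* flags; this works because the
 * kmalloc()ed resv_map is at least word aligned, leaving the bottom
 * two bits of the pointer clear:
 *
 *	unsigned long value = get_vma_private_data(vma);
 *	struct resv_map *map = (struct resv_map *)(value & ~HPAGE_RESV_MASK);
 *	bool owner = value & HPAGE_RESV_OWNER;
 *
 * This mirrors what vma_resv_map() and is_vma_resv_set() below do.
 */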
6199119a41eSJoonsoo Kim struct resv_map *resv_map_alloc(void)
62084afd99bSAndy Whitcroft {
62184afd99bSAndy Whitcroft 	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
6225e911373SMike Kravetz 	struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
6235e911373SMike Kravetz 
6245e911373SMike Kravetz 	if (!resv_map || !rg) {
6255e911373SMike Kravetz 		kfree(resv_map);
6265e911373SMike Kravetz 		kfree(rg);
62784afd99bSAndy Whitcroft 		return NULL;
6285e911373SMike Kravetz 	}
62984afd99bSAndy Whitcroft 
63084afd99bSAndy Whitcroft 	kref_init(&resv_map->refs);
6317b24d861SDavidlohr Bueso 	spin_lock_init(&resv_map->lock);
63284afd99bSAndy Whitcroft 	INIT_LIST_HEAD(&resv_map->regions);
63384afd99bSAndy Whitcroft 
6345e911373SMike Kravetz 	resv_map->adds_in_progress = 0;
6355e911373SMike Kravetz 
6365e911373SMike Kravetz 	INIT_LIST_HEAD(&resv_map->region_cache);
6375e911373SMike Kravetz 	list_add(&rg->link, &resv_map->region_cache);
6385e911373SMike Kravetz 	resv_map->region_cache_count = 1;
6395e911373SMike Kravetz 
64084afd99bSAndy Whitcroft 	return resv_map;
64184afd99bSAndy Whitcroft }
64284afd99bSAndy Whitcroft 
6439119a41eSJoonsoo Kim void resv_map_release(struct kref *ref)
64484afd99bSAndy Whitcroft {
64584afd99bSAndy Whitcroft 	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
6465e911373SMike Kravetz 	struct list_head *head = &resv_map->region_cache;
6475e911373SMike Kravetz 	struct file_region *rg, *trg;
64884afd99bSAndy Whitcroft 
64984afd99bSAndy Whitcroft 	/* Clear out any active regions before we release the map. */
6501406ec9bSJoonsoo Kim 	region_truncate(resv_map, 0);
6515e911373SMike Kravetz 
6525e911373SMike Kravetz 	/* ... and any entries left in the cache */
6535e911373SMike Kravetz 	list_for_each_entry_safe(rg, trg, head, link) {
6545e911373SMike Kravetz 		list_del(&rg->link);
6555e911373SMike Kravetz 		kfree(rg);
6565e911373SMike Kravetz 	}
6575e911373SMike Kravetz 
6585e911373SMike Kravetz 	VM_BUG_ON(resv_map->adds_in_progress);
6595e911373SMike Kravetz 
66084afd99bSAndy Whitcroft 	kfree(resv_map);
66184afd99bSAndy Whitcroft }
66284afd99bSAndy Whitcroft 
6634e35f483SJoonsoo Kim static inline struct resv_map *inode_resv_map(struct inode *inode)
6644e35f483SJoonsoo Kim {
6654e35f483SJoonsoo Kim 	return inode->i_mapping->private_data;
6664e35f483SJoonsoo Kim }
6674e35f483SJoonsoo Kim 
66884afd99bSAndy Whitcroft static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
669a1e78772SMel Gorman {
67081d1b09cSSasha Levin 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
6714e35f483SJoonsoo Kim 	if (vma->vm_flags & VM_MAYSHARE) {
6724e35f483SJoonsoo Kim 		struct address_space *mapping = vma->vm_file->f_mapping;
6734e35f483SJoonsoo Kim 		struct inode *inode = mapping->host;
6744e35f483SJoonsoo Kim 
6754e35f483SJoonsoo Kim 		return inode_resv_map(inode);
6764e35f483SJoonsoo Kim 
6774e35f483SJoonsoo Kim 	} else {
67884afd99bSAndy Whitcroft 		return (struct resv_map *)(get_vma_private_data(vma) &
67984afd99bSAndy Whitcroft 							~HPAGE_RESV_MASK);
6804e35f483SJoonsoo Kim 	}
681a1e78772SMel Gorman }
682a1e78772SMel Gorman 
68384afd99bSAndy Whitcroft static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
684a1e78772SMel Gorman {
68581d1b09cSSasha Levin 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
68681d1b09cSSasha Levin 	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
687a1e78772SMel Gorman 
68884afd99bSAndy Whitcroft 	set_vma_private_data(vma, (get_vma_private_data(vma) &
68984afd99bSAndy Whitcroft 				HPAGE_RESV_MASK) | (unsigned long)map);
69004f2cbe3SMel Gorman }
69104f2cbe3SMel Gorman 
69204f2cbe3SMel Gorman static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
69304f2cbe3SMel Gorman {
69481d1b09cSSasha Levin 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
69581d1b09cSSasha Levin 	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
696e7c4b0bfSAndy Whitcroft 
697e7c4b0bfSAndy Whitcroft 	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
69804f2cbe3SMel Gorman }
69904f2cbe3SMel Gorman 
70004f2cbe3SMel Gorman static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
70104f2cbe3SMel Gorman {
70281d1b09cSSasha Levin 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
703e7c4b0bfSAndy Whitcroft 
704e7c4b0bfSAndy Whitcroft 	return (get_vma_private_data(vma) & flag) != 0;
705a1e78772SMel Gorman }
706a1e78772SMel Gorman 
70704f2cbe3SMel Gorman /* Reset counters to 0 and clear all HPAGE_RESV_* flags */
708a1e78772SMel Gorman void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
709a1e78772SMel Gorman {
71081d1b09cSSasha Levin 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
711f83a275dSMel Gorman 	if (!(vma->vm_flags & VM_MAYSHARE))
712a1e78772SMel Gorman 		vma->vm_private_data = (void *)0;
713a1e78772SMel Gorman }
714a1e78772SMel Gorman 
715a1e78772SMel Gorman /* Returns true if the VMA has associated reserve pages */
716559ec2f8SNicholas Krause static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
717a1e78772SMel Gorman {
718af0ed73eSJoonsoo Kim 	if (vma->vm_flags & VM_NORESERVE) {
719af0ed73eSJoonsoo Kim 		/*
720af0ed73eSJoonsoo Kim 		 * This address is already reserved by another process (chg == 0),
721af0ed73eSJoonsoo Kim 		 * so we should decrement the reserved count.  Without decrementing,
722af0ed73eSJoonsoo Kim 		 * the reserve count would remain after releasing the inode, because
723af0ed73eSJoonsoo Kim 		 * this allocated page will go into the page cache and be regarded
724af0ed73eSJoonsoo Kim 		 * as coming from the reserved pool in the release step.  Currently
725af0ed73eSJoonsoo Kim 		 * we don't have any other way to deal with this situation properly,
726af0ed73eSJoonsoo Kim 		 * so add a work-around here.
727af0ed73eSJoonsoo Kim 		 */
728af0ed73eSJoonsoo Kim 		if (vma->vm_flags & VM_MAYSHARE && chg == 0)
729559ec2f8SNicholas Krause 			return true;
730af0ed73eSJoonsoo Kim 		else
731559ec2f8SNicholas Krause 			return false;
732af0ed73eSJoonsoo Kim 	}
733a63884e9SJoonsoo Kim 
734a63884e9SJoonsoo Kim 	/* Shared mappings always use reserves */
735f83a275dSMel Gorman 	if (vma->vm_flags & VM_MAYSHARE)
736559ec2f8SNicholas Krause 		return true;
737a63884e9SJoonsoo Kim 
738a63884e9SJoonsoo Kim 	/*
739a63884e9SJoonsoo Kim 	 * Only the process that called mmap() has reserves for
740a63884e9SJoonsoo Kim 	 * private mappings.
741a63884e9SJoonsoo Kim 	 */
7427f09ca51SMel Gorman 	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
743559ec2f8SNicholas Krause 		return true;
744a63884e9SJoonsoo Kim 
745559ec2f8SNicholas Krause 	return false;
746a1e78772SMel Gorman }
747a1e78772SMel Gorman 
748a5516438SAndi Kleen static void enqueue_huge_page(struct hstate *h, struct page *page)
7491da177e4SLinus Torvalds {
7501da177e4SLinus Torvalds 	int nid = page_to_nid(page);
7510edaecfaSAneesh Kumar K.V 	list_move(&page->lru, &h->hugepage_freelists[nid]);
752a5516438SAndi Kleen 	h->free_huge_pages++;
753a5516438SAndi Kleen 	h->free_huge_pages_node[nid]++;
7541da177e4SLinus Torvalds }
7551da177e4SLinus Torvalds 
756bf50bab2SNaoya Horiguchi static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
757bf50bab2SNaoya Horiguchi {
758bf50bab2SNaoya Horiguchi 	struct page *page;
759bf50bab2SNaoya Horiguchi 
760c8721bbbSNaoya Horiguchi 	list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
761c8721bbbSNaoya Horiguchi 		if (!is_migrate_isolate_page(page))
762c8721bbbSNaoya Horiguchi 			break;
763c8721bbbSNaoya Horiguchi 	/*
764c8721bbbSNaoya Horiguchi 	 * if a non-isolated free hugepage is not found on the list,
765c8721bbbSNaoya Horiguchi 	 * the allocation fails.
766c8721bbbSNaoya Horiguchi 	 */
767c8721bbbSNaoya Horiguchi 	if (&h->hugepage_freelists[nid] == &page->lru)
768bf50bab2SNaoya Horiguchi 		return NULL;
7690edaecfaSAneesh Kumar K.V 	list_move(&page->lru, &h->hugepage_activelist);
770a9869b83SNaoya Horiguchi 	set_page_refcounted(page);
771bf50bab2SNaoya Horiguchi 	h->free_huge_pages--;
772bf50bab2SNaoya Horiguchi 	h->free_huge_pages_node[nid]--;
773bf50bab2SNaoya Horiguchi 	return page;
774bf50bab2SNaoya Horiguchi }
775bf50bab2SNaoya Horiguchi 
77686cdb465SNaoya Horiguchi /* Movability of hugepages depends on migration support. */
77786cdb465SNaoya Horiguchi static inline gfp_t htlb_alloc_mask(struct hstate *h)
77886cdb465SNaoya Horiguchi {
779100873d7SNaoya Horiguchi 	if (hugepages_treat_as_movable || hugepage_migration_supported(h))
78086cdb465SNaoya Horiguchi 		return GFP_HIGHUSER_MOVABLE;
78186cdb465SNaoya Horiguchi 	else
78286cdb465SNaoya Horiguchi 		return GFP_HIGHUSER;
78386cdb465SNaoya Horiguchi }
78486cdb465SNaoya Horiguchi 
785a5516438SAndi Kleen static struct page *dequeue_huge_page_vma(struct hstate *h,
786a5516438SAndi Kleen 				struct vm_area_struct *vma,
787af0ed73eSJoonsoo Kim 				unsigned long address, int avoid_reserve,
788af0ed73eSJoonsoo Kim 				long chg)
7891da177e4SLinus Torvalds {
790b1c12cbcSKonstantin Khlebnikov 	struct page *page = NULL;
791480eccf9SLee Schermerhorn 	struct mempolicy *mpol;
79219770b32SMel Gorman 	nodemask_t *nodemask;
793c0ff7453SMiao Xie 	struct zonelist *zonelist;
794dd1a239fSMel Gorman 	struct zone *zone;
795dd1a239fSMel Gorman 	struct zoneref *z;
796cc9a6c87SMel Gorman 	unsigned int cpuset_mems_cookie;
7971da177e4SLinus Torvalds 
798a1e78772SMel Gorman 	/*
799a1e78772SMel Gorman 	 * A child process with MAP_PRIVATE mappings created by its parent
800a1e78772SMel Gorman 	 * has no page reserves. This check ensures that reservations are
801a1e78772SMel Gorman 	 * not "stolen". The child may still get SIGKILLed
802a1e78772SMel Gorman 	 */
803af0ed73eSJoonsoo Kim 	if (!vma_has_reserves(vma, chg) &&
804a5516438SAndi Kleen 			h->free_huge_pages - h->resv_huge_pages == 0)
805c0ff7453SMiao Xie 		goto err;
806a1e78772SMel Gorman 
80704f2cbe3SMel Gorman 	/* If reserves cannot be used, ensure enough pages are in the pool */
808a5516438SAndi Kleen 	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
8096eab04a8SJustin P. Mattock 		goto err;
81004f2cbe3SMel Gorman 
8119966c4bbSJoonsoo Kim retry_cpuset:
812d26914d1SMel Gorman 	cpuset_mems_cookie = read_mems_allowed_begin();
8139966c4bbSJoonsoo Kim 	zonelist = huge_zonelist(vma, address,
81486cdb465SNaoya Horiguchi 					htlb_alloc_mask(h), &mpol, &nodemask);
8159966c4bbSJoonsoo Kim 
81619770b32SMel Gorman 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
81719770b32SMel Gorman 						MAX_NR_ZONES - 1, nodemask) {
818344736f2SVladimir Davydov 		if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) {
819bf50bab2SNaoya Horiguchi 			page = dequeue_huge_page_node(h, zone_to_nid(zone));
820bf50bab2SNaoya Horiguchi 			if (page) {
821af0ed73eSJoonsoo Kim 				if (avoid_reserve)
822af0ed73eSJoonsoo Kim 					break;
823af0ed73eSJoonsoo Kim 				if (!vma_has_reserves(vma, chg))
824af0ed73eSJoonsoo Kim 					break;
825af0ed73eSJoonsoo Kim 
82607443a85SJoonsoo Kim 				SetPagePrivate(page);
827a63884e9SJoonsoo Kim 				h->resv_huge_pages--;
8285ab3ee7bSKen Chen 				break;
8291da177e4SLinus Torvalds 			}
8303abf7afdSAndrew Morton 		}
831bf50bab2SNaoya Horiguchi 	}
832cc9a6c87SMel Gorman 
833cc9a6c87SMel Gorman 	mpol_cond_put(mpol);
834d26914d1SMel Gorman 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
835cc9a6c87SMel Gorman 		goto retry_cpuset;
836cc9a6c87SMel Gorman 	return page;
837cc9a6c87SMel Gorman 
838c0ff7453SMiao Xie err:
839cc9a6c87SMel Gorman 	return NULL;
8401da177e4SLinus Torvalds }
8411da177e4SLinus Torvalds 
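/*
 * Note on the retry_cpuset loop above: read_mems_allowed_begin() and
 * read_mems_allowed_retry() implement a seqcount-style check, so if the
 * task's cpuset (and hence its mems_allowed) changes while we are walking
 * the zonelist and no page was found, the dequeue is retried against the
 * updated node mask rather than failing spuriously.
 */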
8421cac6f2cSLuiz Capitulino /*
8431cac6f2cSLuiz Capitulino  * common helper functions for hstate_next_node_to_{alloc|free}.
8441cac6f2cSLuiz Capitulino  * We may have allocated or freed a huge page based on a different
8451cac6f2cSLuiz Capitulino  * nodes_allowed previously, so h->next_node_to_{alloc|free} might
8461cac6f2cSLuiz Capitulino  * be outside of *nodes_allowed.  Ensure that we use an allowed
8471cac6f2cSLuiz Capitulino  * node for alloc or free.
8481cac6f2cSLuiz Capitulino  */
8491cac6f2cSLuiz Capitulino static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
8501cac6f2cSLuiz Capitulino {
8511cac6f2cSLuiz Capitulino 	nid = next_node(nid, *nodes_allowed);
8521cac6f2cSLuiz Capitulino 	if (nid == MAX_NUMNODES)
8531cac6f2cSLuiz Capitulino 		nid = first_node(*nodes_allowed);
8541cac6f2cSLuiz Capitulino 	VM_BUG_ON(nid >= MAX_NUMNODES);
8551cac6f2cSLuiz Capitulino 
8561cac6f2cSLuiz Capitulino 	return nid;
8571cac6f2cSLuiz Capitulino }
8581cac6f2cSLuiz Capitulino 
8591cac6f2cSLuiz Capitulino static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
8601cac6f2cSLuiz Capitulino {
8611cac6f2cSLuiz Capitulino 	if (!node_isset(nid, *nodes_allowed))
8621cac6f2cSLuiz Capitulino 		nid = next_node_allowed(nid, nodes_allowed);
8631cac6f2cSLuiz Capitulino 	return nid;
8641cac6f2cSLuiz Capitulino }
8651cac6f2cSLuiz Capitulino 
8661cac6f2cSLuiz Capitulino /*
8671cac6f2cSLuiz Capitulino  * returns the previously saved node ["this node"] from which to
8681cac6f2cSLuiz Capitulino  * allocate a persistent huge page for the pool and advances the
8691cac6f2cSLuiz Capitulino  * next node from which to allocate, handling wrap at end of node
8701cac6f2cSLuiz Capitulino  * mask.
8711cac6f2cSLuiz Capitulino  */
8721cac6f2cSLuiz Capitulino static int hstate_next_node_to_alloc(struct hstate *h,
8731cac6f2cSLuiz Capitulino 					nodemask_t *nodes_allowed)
8741cac6f2cSLuiz Capitulino {
8751cac6f2cSLuiz Capitulino 	int nid;
8761cac6f2cSLuiz Capitulino 
8771cac6f2cSLuiz Capitulino 	VM_BUG_ON(!nodes_allowed);
8781cac6f2cSLuiz Capitulino 
8791cac6f2cSLuiz Capitulino 	nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
8801cac6f2cSLuiz Capitulino 	h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
8811cac6f2cSLuiz Capitulino 
8821cac6f2cSLuiz Capitulino 	return nid;
8831cac6f2cSLuiz Capitulino }
8841cac6f2cSLuiz Capitulino 
8851cac6f2cSLuiz Capitulino /*
8861cac6f2cSLuiz Capitulino  * helper for free_pool_huge_page() - return the previously saved
8871cac6f2cSLuiz Capitulino  * node ["this node"] from which to free a huge page.  Advance the
8881cac6f2cSLuiz Capitulino  * next node id whether or not we find a free huge page to free so
8891cac6f2cSLuiz Capitulino  * that the next attempt to free addresses the next node.
8901cac6f2cSLuiz Capitulino  */
8911cac6f2cSLuiz Capitulino static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
8921cac6f2cSLuiz Capitulino {
8931cac6f2cSLuiz Capitulino 	int nid;
8941cac6f2cSLuiz Capitulino 
8951cac6f2cSLuiz Capitulino 	VM_BUG_ON(!nodes_allowed);
8961cac6f2cSLuiz Capitulino 
8971cac6f2cSLuiz Capitulino 	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
8981cac6f2cSLuiz Capitulino 	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
8991cac6f2cSLuiz Capitulino 
9001cac6f2cSLuiz Capitulino 	return nid;
9011cac6f2cSLuiz Capitulino }
9021cac6f2cSLuiz Capitulino 
9031cac6f2cSLuiz Capitulino #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)		\
9041cac6f2cSLuiz Capitulino 	for (nr_nodes = nodes_weight(*mask);				\
9051cac6f2cSLuiz Capitulino 		nr_nodes > 0 &&						\
9061cac6f2cSLuiz Capitulino 		((node = hstate_next_node_to_alloc(hs, mask)) || 1);	\
9071cac6f2cSLuiz Capitulino 		nr_nodes--)
9081cac6f2cSLuiz Capitulino 
9091cac6f2cSLuiz Capitulino #define for_each_node_mask_to_free(hs, nr_nodes, node, mask)		\
9101cac6f2cSLuiz Capitulino 	for (nr_nodes = nodes_weight(*mask);				\
9111cac6f2cSLuiz Capitulino 		nr_nodes > 0 &&						\
9121cac6f2cSLuiz Capitulino 		((node = hstate_next_node_to_free(hs, mask)) || 1);	\
9131cac6f2cSLuiz Capitulino 		nr_nodes--)
9141cac6f2cSLuiz Capitulino 
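/*
 * Illustrative walk-through of the iterators above (not part of the
 * kernel source).  With nodes_allowed = {0, 2} and
 * h->next_nid_to_alloc == 2, for_each_node_mask_to_alloc() visits at
 * most nodes_weight() == 2 nodes:
 *
 *	1st iteration: node = 2, next_nid_to_alloc advances to 0
 *	2nd iteration: node = 0, next_nid_to_alloc advances back to 2
 *
 * so repeated pool grows and shrinks are spread round-robin across the
 * allowed nodes, picking up where the previous attempt left off.
 */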
915944d9fecSLuiz Capitulino #if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
916944d9fecSLuiz Capitulino static void destroy_compound_gigantic_page(struct page *page,
917944d9fecSLuiz Capitulino 					unsigned long order)
918944d9fecSLuiz Capitulino {
919944d9fecSLuiz Capitulino 	int i;
920944d9fecSLuiz Capitulino 	int nr_pages = 1 << order;
921944d9fecSLuiz Capitulino 	struct page *p = page + 1;
922944d9fecSLuiz Capitulino 
923944d9fecSLuiz Capitulino 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
924944d9fecSLuiz Capitulino 		__ClearPageTail(p);
925944d9fecSLuiz Capitulino 		set_page_refcounted(p);
926944d9fecSLuiz Capitulino 		p->first_page = NULL;
927944d9fecSLuiz Capitulino 	}
928944d9fecSLuiz Capitulino 
929944d9fecSLuiz Capitulino 	set_compound_order(page, 0);
930944d9fecSLuiz Capitulino 	__ClearPageHead(page);
931944d9fecSLuiz Capitulino }
932944d9fecSLuiz Capitulino 
933944d9fecSLuiz Capitulino static void free_gigantic_page(struct page *page, unsigned order)
934944d9fecSLuiz Capitulino {
935944d9fecSLuiz Capitulino 	free_contig_range(page_to_pfn(page), 1 << order);
936944d9fecSLuiz Capitulino }
937944d9fecSLuiz Capitulino 
938944d9fecSLuiz Capitulino static int __alloc_gigantic_page(unsigned long start_pfn,
939944d9fecSLuiz Capitulino 				unsigned long nr_pages)
940944d9fecSLuiz Capitulino {
941944d9fecSLuiz Capitulino 	unsigned long end_pfn = start_pfn + nr_pages;
942944d9fecSLuiz Capitulino 	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
943944d9fecSLuiz Capitulino }
944944d9fecSLuiz Capitulino 
945944d9fecSLuiz Capitulino static bool pfn_range_valid_gigantic(unsigned long start_pfn,
946944d9fecSLuiz Capitulino 				unsigned long nr_pages)
947944d9fecSLuiz Capitulino {
948944d9fecSLuiz Capitulino 	unsigned long i, end_pfn = start_pfn + nr_pages;
949944d9fecSLuiz Capitulino 	struct page *page;
950944d9fecSLuiz Capitulino 
951944d9fecSLuiz Capitulino 	for (i = start_pfn; i < end_pfn; i++) {
952944d9fecSLuiz Capitulino 		if (!pfn_valid(i))
953944d9fecSLuiz Capitulino 			return false;
954944d9fecSLuiz Capitulino 
955944d9fecSLuiz Capitulino 		page = pfn_to_page(i);
956944d9fecSLuiz Capitulino 
957944d9fecSLuiz Capitulino 		if (PageReserved(page))
958944d9fecSLuiz Capitulino 			return false;
959944d9fecSLuiz Capitulino 
960944d9fecSLuiz Capitulino 		if (page_count(page) > 0)
961944d9fecSLuiz Capitulino 			return false;
962944d9fecSLuiz Capitulino 
963944d9fecSLuiz Capitulino 		if (PageHuge(page))
964944d9fecSLuiz Capitulino 			return false;
965944d9fecSLuiz Capitulino 	}
966944d9fecSLuiz Capitulino 
967944d9fecSLuiz Capitulino 	return true;
968944d9fecSLuiz Capitulino }
969944d9fecSLuiz Capitulino 
970944d9fecSLuiz Capitulino static bool zone_spans_last_pfn(const struct zone *zone,
971944d9fecSLuiz Capitulino 			unsigned long start_pfn, unsigned long nr_pages)
972944d9fecSLuiz Capitulino {
973944d9fecSLuiz Capitulino 	unsigned long last_pfn = start_pfn + nr_pages - 1;
974944d9fecSLuiz Capitulino 	return zone_spans_pfn(zone, last_pfn);
975944d9fecSLuiz Capitulino }
976944d9fecSLuiz Capitulino 
977944d9fecSLuiz Capitulino static struct page *alloc_gigantic_page(int nid, unsigned order)
978944d9fecSLuiz Capitulino {
979944d9fecSLuiz Capitulino 	unsigned long nr_pages = 1 << order;
980944d9fecSLuiz Capitulino 	unsigned long ret, pfn, flags;
981944d9fecSLuiz Capitulino 	struct zone *z;
982944d9fecSLuiz Capitulino 
983944d9fecSLuiz Capitulino 	z = NODE_DATA(nid)->node_zones;
984944d9fecSLuiz Capitulino 	for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
985944d9fecSLuiz Capitulino 		spin_lock_irqsave(&z->lock, flags);
986944d9fecSLuiz Capitulino 
987944d9fecSLuiz Capitulino 		pfn = ALIGN(z->zone_start_pfn, nr_pages);
988944d9fecSLuiz Capitulino 		while (zone_spans_last_pfn(z, pfn, nr_pages)) {
989944d9fecSLuiz Capitulino 			if (pfn_range_valid_gigantic(pfn, nr_pages)) {
990944d9fecSLuiz Capitulino 				/*
991944d9fecSLuiz Capitulino 				 * We release the zone lock here because
992944d9fecSLuiz Capitulino 				 * alloc_contig_range() will also lock the zone
993944d9fecSLuiz Capitulino 				 * at some point. If there's an allocation
994944d9fecSLuiz Capitulino 				 * spinning on this lock, it may win the race
995944d9fecSLuiz Capitulino 				 * and cause alloc_contig_range() to fail...
996944d9fecSLuiz Capitulino 				 */
997944d9fecSLuiz Capitulino 				spin_unlock_irqrestore(&z->lock, flags);
998944d9fecSLuiz Capitulino 				ret = __alloc_gigantic_page(pfn, nr_pages);
999944d9fecSLuiz Capitulino 				if (!ret)
1000944d9fecSLuiz Capitulino 					return pfn_to_page(pfn);
1001944d9fecSLuiz Capitulino 				spin_lock_irqsave(&z->lock, flags);
1002944d9fecSLuiz Capitulino 			}
1003944d9fecSLuiz Capitulino 			pfn += nr_pages;
1004944d9fecSLuiz Capitulino 		}
1005944d9fecSLuiz Capitulino 
1006944d9fecSLuiz Capitulino 		spin_unlock_irqrestore(&z->lock, flags);
1007944d9fecSLuiz Capitulino 	}
1008944d9fecSLuiz Capitulino 
1009944d9fecSLuiz Capitulino 	return NULL;
1010944d9fecSLuiz Capitulino }
1011944d9fecSLuiz Capitulino 
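/*
 * Note on the scan above (illustrative, not part of the kernel source):
 * candidate ranges are aligned to the gigantic page size itself.  For a
 * 1GB huge page with a 4KB base page, nr_pages == 1 << 18, so only pfns
 * that are multiples of 262144 (i.e. 1GB physical boundaries) are tried,
 * and each candidate range must be fully valid, unreserved and free
 * before alloc_contig_range() is attempted on it.
 */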
1012944d9fecSLuiz Capitulino static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
1013944d9fecSLuiz Capitulino static void prep_compound_gigantic_page(struct page *page, unsigned long order);
1014944d9fecSLuiz Capitulino 
1015944d9fecSLuiz Capitulino static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
1016944d9fecSLuiz Capitulino {
1017944d9fecSLuiz Capitulino 	struct page *page;
1018944d9fecSLuiz Capitulino 
1019944d9fecSLuiz Capitulino 	page = alloc_gigantic_page(nid, huge_page_order(h));
1020944d9fecSLuiz Capitulino 	if (page) {
1021944d9fecSLuiz Capitulino 		prep_compound_gigantic_page(page, huge_page_order(h));
1022944d9fecSLuiz Capitulino 		prep_new_huge_page(h, page, nid);
1023944d9fecSLuiz Capitulino 	}
1024944d9fecSLuiz Capitulino 
1025944d9fecSLuiz Capitulino 	return page;
1026944d9fecSLuiz Capitulino }
1027944d9fecSLuiz Capitulino 
1028944d9fecSLuiz Capitulino static int alloc_fresh_gigantic_page(struct hstate *h,
1029944d9fecSLuiz Capitulino 				nodemask_t *nodes_allowed)
1030944d9fecSLuiz Capitulino {
1031944d9fecSLuiz Capitulino 	struct page *page = NULL;
1032944d9fecSLuiz Capitulino 	int nr_nodes, node;
1033944d9fecSLuiz Capitulino 
1034944d9fecSLuiz Capitulino 	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1035944d9fecSLuiz Capitulino 		page = alloc_fresh_gigantic_page_node(h, node);
1036944d9fecSLuiz Capitulino 		if (page)
1037944d9fecSLuiz Capitulino 			return 1;
1038944d9fecSLuiz Capitulino 	}
1039944d9fecSLuiz Capitulino 
1040944d9fecSLuiz Capitulino 	return 0;
1041944d9fecSLuiz Capitulino }
1042944d9fecSLuiz Capitulino 
1043944d9fecSLuiz Capitulino static inline bool gigantic_page_supported(void) { return true; }
1044944d9fecSLuiz Capitulino #else
1045944d9fecSLuiz Capitulino static inline bool gigantic_page_supported(void) { return false; }
1046944d9fecSLuiz Capitulino static inline void free_gigantic_page(struct page *page, unsigned order) { }
1047944d9fecSLuiz Capitulino static inline void destroy_compound_gigantic_page(struct page *page,
1048944d9fecSLuiz Capitulino 						unsigned long order) { }
1049944d9fecSLuiz Capitulino static inline int alloc_fresh_gigantic_page(struct hstate *h,
1050944d9fecSLuiz Capitulino 					nodemask_t *nodes_allowed) { return 0; }
1051944d9fecSLuiz Capitulino #endif
1052944d9fecSLuiz Capitulino 
1053a5516438SAndi Kleen static void update_and_free_page(struct hstate *h, struct page *page)
10546af2acb6SAdam Litke {
10556af2acb6SAdam Litke 	int i;
1056a5516438SAndi Kleen 
1057944d9fecSLuiz Capitulino 	if (hstate_is_gigantic(h) && !gigantic_page_supported())
1058944d9fecSLuiz Capitulino 		return;
105918229df5SAndy Whitcroft 
1060a5516438SAndi Kleen 	h->nr_huge_pages--;
1061a5516438SAndi Kleen 	h->nr_huge_pages_node[page_to_nid(page)]--;
1062a5516438SAndi Kleen 	for (i = 0; i < pages_per_huge_page(h); i++) {
106332f84528SChris Forbes 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
106432f84528SChris Forbes 				1 << PG_referenced | 1 << PG_dirty |
1065a7407a27SLuiz Capitulino 				1 << PG_active | 1 << PG_private |
1066a7407a27SLuiz Capitulino 				1 << PG_writeback);
10676af2acb6SAdam Litke 	}
1068309381feSSasha Levin 	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
10696af2acb6SAdam Litke 	set_compound_page_dtor(page, NULL);
10706af2acb6SAdam Litke 	set_page_refcounted(page);
1071944d9fecSLuiz Capitulino 	if (hstate_is_gigantic(h)) {
1072944d9fecSLuiz Capitulino 		destroy_compound_gigantic_page(page, huge_page_order(h));
1073944d9fecSLuiz Capitulino 		free_gigantic_page(page, huge_page_order(h));
1074944d9fecSLuiz Capitulino 	} else {
1075a5516438SAndi Kleen 		__free_pages(page, huge_page_order(h));
10766af2acb6SAdam Litke 	}
1077944d9fecSLuiz Capitulino }
10786af2acb6SAdam Litke 
1079e5ff2159SAndi Kleen struct hstate *size_to_hstate(unsigned long size)
1080e5ff2159SAndi Kleen {
1081e5ff2159SAndi Kleen 	struct hstate *h;
1082e5ff2159SAndi Kleen 
1083e5ff2159SAndi Kleen 	for_each_hstate(h) {
1084e5ff2159SAndi Kleen 		if (huge_page_size(h) == size)
1085e5ff2159SAndi Kleen 			return h;
1086e5ff2159SAndi Kleen 	}
1087e5ff2159SAndi Kleen 	return NULL;
1088e5ff2159SAndi Kleen }
1089e5ff2159SAndi Kleen 
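/*
 * Illustrative lookup, not a call site in this file: given a size in
 * bytes, size_to_hstate() returns the matching pool or NULL.  The 2 MB
 * value is only an example; supported sizes are architecture dependent.
 *
 *	struct hstate *h = size_to_hstate(2UL << 20);
 *
 *	if (h)
 *		pr_info("found hstate of order %u\n", huge_page_order(h));
 */
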
1090bcc54222SNaoya Horiguchi /*
1091bcc54222SNaoya Horiguchi  * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
1092bcc54222SNaoya Horiguchi  * to hstate->hugepage_activelist).
1093bcc54222SNaoya Horiguchi  *
1094bcc54222SNaoya Horiguchi  * This function can be called for tail pages, but never returns true for them.
1095bcc54222SNaoya Horiguchi  */
1096bcc54222SNaoya Horiguchi bool page_huge_active(struct page *page)
1097bcc54222SNaoya Horiguchi {
1098bcc54222SNaoya Horiguchi 	VM_BUG_ON_PAGE(!PageHuge(page), page);
1099bcc54222SNaoya Horiguchi 	return PageHead(page) && PagePrivate(&page[1]);
1100bcc54222SNaoya Horiguchi }
1101bcc54222SNaoya Horiguchi 
1102bcc54222SNaoya Horiguchi /* never called for tail page */
1103bcc54222SNaoya Horiguchi static void set_page_huge_active(struct page *page)
1104bcc54222SNaoya Horiguchi {
1105bcc54222SNaoya Horiguchi 	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
1106bcc54222SNaoya Horiguchi 	SetPagePrivate(&page[1]);
1107bcc54222SNaoya Horiguchi }
1108bcc54222SNaoya Horiguchi 
1109bcc54222SNaoya Horiguchi static void clear_page_huge_active(struct page *page)
1110bcc54222SNaoya Horiguchi {
1111bcc54222SNaoya Horiguchi 	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
1112bcc54222SNaoya Horiguchi 	ClearPagePrivate(&page[1]);
1113bcc54222SNaoya Horiguchi }
1114bcc54222SNaoya Horiguchi 
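/*
 * The "active" state piggybacks on PG_private of the first tail page,
 * which is otherwise unused for hugetlb pages.  Sketch of the intended
 * use by isolation/migration code (not called from this file):
 *
 *	if (PageHuge(page) && page_huge_active(page))
 *		isolated = isolate_huge_page(page, &pagelist);
 *
 * isolate_huge_page(), defined later in this file, takes hugetlb_lock
 * and re-checks, so the unlocked test above is only an early-out.
 */
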
11158f1d26d0SAtsushi Kumagai void free_huge_page(struct page *page)
111627a85ef1SDavid Gibson {
1117a5516438SAndi Kleen 	/*
1118a5516438SAndi Kleen 	 * Can't pass hstate in here because it is called from the
1119a5516438SAndi Kleen 	 * compound page destructor.
1120a5516438SAndi Kleen 	 */
1121e5ff2159SAndi Kleen 	struct hstate *h = page_hstate(page);
11227893d1d5SAdam Litke 	int nid = page_to_nid(page);
112390481622SDavid Gibson 	struct hugepage_subpool *spool =
112490481622SDavid Gibson 		(struct hugepage_subpool *)page_private(page);
112507443a85SJoonsoo Kim 	bool restore_reserve;
112627a85ef1SDavid Gibson 
1127e5df70abSAndy Whitcroft 	set_page_private(page, 0);
112823be7468SMel Gorman 	page->mapping = NULL;
11297893d1d5SAdam Litke 	BUG_ON(page_count(page));
11300fe6e20bSNaoya Horiguchi 	BUG_ON(page_mapcount(page));
113107443a85SJoonsoo Kim 	restore_reserve = PagePrivate(page);
113216c794b4SJoonsoo Kim 	ClearPagePrivate(page);
113327a85ef1SDavid Gibson 
11341c5ecae3SMike Kravetz 	/*
11351c5ecae3SMike Kravetz 	 * A return code of zero implies that the subpool will be under its
11361c5ecae3SMike Kravetz 	 * minimum size if the reservation is not restored after page is free.
11371c5ecae3SMike Kravetz 	 * minimum size if the reservation is not restored after the page is freed.
11381c5ecae3SMike Kravetz 	 * Therefore, force the restore_reserve operation.
11391c5ecae3SMike Kravetz 	if (hugepage_subpool_put_pages(spool, 1) == 0)
11401c5ecae3SMike Kravetz 		restore_reserve = true;
11411c5ecae3SMike Kravetz 
114227a85ef1SDavid Gibson 	spin_lock(&hugetlb_lock);
1143bcc54222SNaoya Horiguchi 	clear_page_huge_active(page);
11446d76dcf4SAneesh Kumar K.V 	hugetlb_cgroup_uncharge_page(hstate_index(h),
11456d76dcf4SAneesh Kumar K.V 				     pages_per_huge_page(h), page);
114607443a85SJoonsoo Kim 	if (restore_reserve)
114707443a85SJoonsoo Kim 		h->resv_huge_pages++;
114807443a85SJoonsoo Kim 
1149944d9fecSLuiz Capitulino 	if (h->surplus_huge_pages_node[nid]) {
11500edaecfaSAneesh Kumar K.V 		/* remove the page from active list */
11510edaecfaSAneesh Kumar K.V 		list_del(&page->lru);
1152a5516438SAndi Kleen 		update_and_free_page(h, page);
1153a5516438SAndi Kleen 		h->surplus_huge_pages--;
1154a5516438SAndi Kleen 		h->surplus_huge_pages_node[nid]--;
11557893d1d5SAdam Litke 	} else {
11565d3a551cSWill Deacon 		arch_clear_hugepage_flags(page);
1157a5516438SAndi Kleen 		enqueue_huge_page(h, page);
11587893d1d5SAdam Litke 	}
115927a85ef1SDavid Gibson 	spin_unlock(&hugetlb_lock);
116027a85ef1SDavid Gibson }
116127a85ef1SDavid Gibson 
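/*
 * Note that nothing calls free_huge_page() directly: it runs as the
 * compound page destructor once the last reference goes away, i.e. a
 * plain
 *
 *	put_page(page);
 *
 * on a hugetlb head page ends up here, and the page is then either
 * re-queued on the free list or, if it was a surplus page, handed back
 * to the buddy allocator via update_and_free_page().
 */
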
1162a5516438SAndi Kleen static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
1163b7ba30c6SAndi Kleen {
11640edaecfaSAneesh Kumar K.V 	INIT_LIST_HEAD(&page->lru);
1165b7ba30c6SAndi Kleen 	set_compound_page_dtor(page, free_huge_page);
1166b7ba30c6SAndi Kleen 	spin_lock(&hugetlb_lock);
11679dd540e2SAneesh Kumar K.V 	set_hugetlb_cgroup(page, NULL);
1168a5516438SAndi Kleen 	h->nr_huge_pages++;
1169a5516438SAndi Kleen 	h->nr_huge_pages_node[nid]++;
1170b7ba30c6SAndi Kleen 	spin_unlock(&hugetlb_lock);
1171b7ba30c6SAndi Kleen 	put_page(page); /* free it into the hugepage allocator */
1172b7ba30c6SAndi Kleen }
1173b7ba30c6SAndi Kleen 
11742906dd52SLuiz Capitulino static void prep_compound_gigantic_page(struct page *page, unsigned long order)
117520a0307cSWu Fengguang {
117620a0307cSWu Fengguang 	int i;
117720a0307cSWu Fengguang 	int nr_pages = 1 << order;
117820a0307cSWu Fengguang 	struct page *p = page + 1;
117920a0307cSWu Fengguang 
118020a0307cSWu Fengguang 	/* we rely on prep_new_huge_page to set the destructor */
118120a0307cSWu Fengguang 	set_compound_order(page, order);
118220a0307cSWu Fengguang 	__SetPageHead(page);
1183ef5a22beSAndrea Arcangeli 	__ClearPageReserved(page);
118420a0307cSWu Fengguang 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1185ef5a22beSAndrea Arcangeli 		/*
1186ef5a22beSAndrea Arcangeli 		 * For gigantic hugepages allocated through bootmem at
1187ef5a22beSAndrea Arcangeli 		 * boot, it's safer to be consistent with the not-gigantic
1188ef5a22beSAndrea Arcangeli 		 * hugepages and clear the PG_reserved bit from all tail pages
1189ef5a22beSAndrea Arcangeli 		 * too.  Otherwise drivers using get_user_pages() to access tail
1190ef5a22beSAndrea Arcangeli 		 * pages may get the reference counting wrong if they see
1191ef5a22beSAndrea Arcangeli 		 * PG_reserved set on a tail page (despite the head page not
1192ef5a22beSAndrea Arcangeli 		 * having PG_reserved set).  Enforcing this consistency between
1193ef5a22beSAndrea Arcangeli 		 * head and tail pages allows drivers to optimize away a check
1194ef5a22beSAndrea Arcangeli 		 * on the head page when they need to know if put_page() is needed
1195ef5a22beSAndrea Arcangeli 		 * after get_user_pages().
1196ef5a22beSAndrea Arcangeli 		 */
1197ef5a22beSAndrea Arcangeli 		__ClearPageReserved(p);
119858a84aa9SYouquan Song 		set_page_count(p, 0);
119920a0307cSWu Fengguang 		p->first_page = page;
120044fc8057SDavid Rientjes 		/* Make sure p->first_page is always valid for PageTail() */
120144fc8057SDavid Rientjes 		smp_wmb();
120244fc8057SDavid Rientjes 		__SetPageTail(p);
120320a0307cSWu Fengguang 	}
120420a0307cSWu Fengguang }
120520a0307cSWu Fengguang 
12067795912cSAndrew Morton /*
12077795912cSAndrew Morton  * PageHuge() only returns true for hugetlbfs pages, but not for normal or
12087795912cSAndrew Morton  * transparent huge pages.  See the PageTransHuge() documentation for more
12097795912cSAndrew Morton  * details.
12107795912cSAndrew Morton  */
121120a0307cSWu Fengguang int PageHuge(struct page *page)
121220a0307cSWu Fengguang {
121320a0307cSWu Fengguang 	if (!PageCompound(page))
121420a0307cSWu Fengguang 		return 0;
121520a0307cSWu Fengguang 
121620a0307cSWu Fengguang 	page = compound_head(page);
1217758f66a2SAndrew Morton 	return get_compound_page_dtor(page) == free_huge_page;
121820a0307cSWu Fengguang }
121943131e14SNaoya Horiguchi EXPORT_SYMBOL_GPL(PageHuge);
122043131e14SNaoya Horiguchi 
122127c73ae7SAndrea Arcangeli /*
122227c73ae7SAndrea Arcangeli  * PageHeadHuge() only returns true for hugetlbfs head page, but not for
122327c73ae7SAndrea Arcangeli  * normal or transparent huge pages.
122427c73ae7SAndrea Arcangeli  */
122527c73ae7SAndrea Arcangeli int PageHeadHuge(struct page *page_head)
122627c73ae7SAndrea Arcangeli {
122727c73ae7SAndrea Arcangeli 	if (!PageHead(page_head))
122827c73ae7SAndrea Arcangeli 		return 0;
122927c73ae7SAndrea Arcangeli 
1230758f66a2SAndrew Morton 	return get_compound_page_dtor(page_head) == free_huge_page;
123127c73ae7SAndrea Arcangeli }
123227c73ae7SAndrea Arcangeli 
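/*
 * Both helpers identify hugetlb pages purely by the compound destructor,
 * so generic code needs no hstate in hand.  Sketch (handle_hugetlb_page()
 * is a made-up placeholder, only to show the calling pattern):
 *
 *	struct page *head = compound_head(page);
 *
 *	if (PageHuge(head))
 *		return handle_hugetlb_page(head);
 *
 * __basepage_index() below relies on exactly this kind of test.
 */
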
123313d60f4bSZhang Yi pgoff_t __basepage_index(struct page *page)
123413d60f4bSZhang Yi {
123513d60f4bSZhang Yi 	struct page *page_head = compound_head(page);
123613d60f4bSZhang Yi 	pgoff_t index = page_index(page_head);
123713d60f4bSZhang Yi 	unsigned long compound_idx;
123813d60f4bSZhang Yi 
123913d60f4bSZhang Yi 	if (!PageHuge(page_head))
124013d60f4bSZhang Yi 		return page_index(page);
124113d60f4bSZhang Yi 
124213d60f4bSZhang Yi 	if (compound_order(page_head) >= MAX_ORDER)
124313d60f4bSZhang Yi 		compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
124413d60f4bSZhang Yi 	else
124513d60f4bSZhang Yi 		compound_idx = page - page_head;
124613d60f4bSZhang Yi 
124713d60f4bSZhang Yi 	return (index << compound_order(page_head)) + compound_idx;
124813d60f4bSZhang Yi }
124913d60f4bSZhang Yi 
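/*
 * Worked example for the index math above, assuming 4 KB base pages and
 * a 2 MB huge page (order 9): the fifth base page (compound_idx == 4) of
 * the huge page at file index 3 maps to basepage index
 *
 *	(3 << 9) + 4 == 1540
 *
 * i.e. the offset a non-hugetlb file would use for that 4 KB chunk.
 */
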
1250a5516438SAndi Kleen static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
12511da177e4SLinus Torvalds {
12521da177e4SLinus Torvalds 	struct page *page;
1253f96efd58SJoe Jin 
12546484eb3eSMel Gorman 	page = alloc_pages_exact_node(nid,
125586cdb465SNaoya Horiguchi 		htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
1256551883aeSNishanth Aravamudan 						__GFP_REPEAT|__GFP_NOWARN,
1257a5516438SAndi Kleen 		huge_page_order(h));
12581da177e4SLinus Torvalds 	if (page) {
1259a5516438SAndi Kleen 		prep_new_huge_page(h, page, nid);
12601da177e4SLinus Torvalds 	}
126163b4613cSNishanth Aravamudan 
126263b4613cSNishanth Aravamudan 	return page;
126363b4613cSNishanth Aravamudan }
126463b4613cSNishanth Aravamudan 
1265b2261026SJoonsoo Kim static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
1266b2261026SJoonsoo Kim {
1267b2261026SJoonsoo Kim 	struct page *page;
1268b2261026SJoonsoo Kim 	int nr_nodes, node;
1269b2261026SJoonsoo Kim 	int ret = 0;
1270b2261026SJoonsoo Kim 
1271b2261026SJoonsoo Kim 	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1272b2261026SJoonsoo Kim 		page = alloc_fresh_huge_page_node(h, node);
1273b2261026SJoonsoo Kim 		if (page) {
1274b2261026SJoonsoo Kim 			ret = 1;
1275b2261026SJoonsoo Kim 			break;
1276b2261026SJoonsoo Kim 		}
1277b2261026SJoonsoo Kim 	}
1278b2261026SJoonsoo Kim 
1279b2261026SJoonsoo Kim 	if (ret)
1280b2261026SJoonsoo Kim 		count_vm_event(HTLB_BUDDY_PGALLOC);
1281b2261026SJoonsoo Kim 	else
1282b2261026SJoonsoo Kim 		count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
1283b2261026SJoonsoo Kim 
1284b2261026SJoonsoo Kim 	return ret;
1285b2261026SJoonsoo Kim }
1286b2261026SJoonsoo Kim 
1287e8c5c824SLee Schermerhorn /*
1288e8c5c824SLee Schermerhorn  * Free one huge page from the pool, starting from the next node to free.
1289e8c5c824SLee Schermerhorn  * Attempt to keep persistent huge pages more or less
1290e8c5c824SLee Schermerhorn  * balanced over allowed nodes.
1291e8c5c824SLee Schermerhorn  * Called with hugetlb_lock locked.
1292e8c5c824SLee Schermerhorn  */
12936ae11b27SLee Schermerhorn static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
12946ae11b27SLee Schermerhorn 							 bool acct_surplus)
1295e8c5c824SLee Schermerhorn {
1296b2261026SJoonsoo Kim 	int nr_nodes, node;
1297e8c5c824SLee Schermerhorn 	int ret = 0;
1298e8c5c824SLee Schermerhorn 
1299b2261026SJoonsoo Kim 	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
1300685f3457SLee Schermerhorn 		/*
1301685f3457SLee Schermerhorn 		 * If we're returning unused surplus pages, only examine
1302685f3457SLee Schermerhorn 		 * nodes with surplus pages.
1303685f3457SLee Schermerhorn 		 */
1304b2261026SJoonsoo Kim 		if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
1305b2261026SJoonsoo Kim 		    !list_empty(&h->hugepage_freelists[node])) {
1306e8c5c824SLee Schermerhorn 			struct page *page =
1307b2261026SJoonsoo Kim 				list_entry(h->hugepage_freelists[node].next,
1308e8c5c824SLee Schermerhorn 					  struct page, lru);
1309e8c5c824SLee Schermerhorn 			list_del(&page->lru);
1310e8c5c824SLee Schermerhorn 			h->free_huge_pages--;
1311b2261026SJoonsoo Kim 			h->free_huge_pages_node[node]--;
1312685f3457SLee Schermerhorn 			if (acct_surplus) {
1313685f3457SLee Schermerhorn 				h->surplus_huge_pages--;
1314b2261026SJoonsoo Kim 				h->surplus_huge_pages_node[node]--;
1315685f3457SLee Schermerhorn 			}
1316e8c5c824SLee Schermerhorn 			update_and_free_page(h, page);
1317e8c5c824SLee Schermerhorn 			ret = 1;
13189a76db09SLee Schermerhorn 			break;
1319e8c5c824SLee Schermerhorn 		}
1320b2261026SJoonsoo Kim 	}
1321e8c5c824SLee Schermerhorn 
1322e8c5c824SLee Schermerhorn 	return ret;
1323e8c5c824SLee Schermerhorn }
1324e8c5c824SLee Schermerhorn 
1325c8721bbbSNaoya Horiguchi /*
1326c8721bbbSNaoya Horiguchi  * Dissolve a given free hugepage into free buddy pages. This function does
1327c8721bbbSNaoya Horiguchi  * nothing for in-use (including surplus) hugepages.
1328c8721bbbSNaoya Horiguchi  */
1329c8721bbbSNaoya Horiguchi static void dissolve_free_huge_page(struct page *page)
1330c8721bbbSNaoya Horiguchi {
1331c8721bbbSNaoya Horiguchi 	spin_lock(&hugetlb_lock);
1332c8721bbbSNaoya Horiguchi 	if (PageHuge(page) && !page_count(page)) {
1333c8721bbbSNaoya Horiguchi 		struct hstate *h = page_hstate(page);
1334c8721bbbSNaoya Horiguchi 		int nid = page_to_nid(page);
1335c8721bbbSNaoya Horiguchi 		list_del(&page->lru);
1336c8721bbbSNaoya Horiguchi 		h->free_huge_pages--;
1337c8721bbbSNaoya Horiguchi 		h->free_huge_pages_node[nid]--;
1338c8721bbbSNaoya Horiguchi 		update_and_free_page(h, page);
1339c8721bbbSNaoya Horiguchi 	}
1340c8721bbbSNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
1341c8721bbbSNaoya Horiguchi }
1342c8721bbbSNaoya Horiguchi 
1343c8721bbbSNaoya Horiguchi /*
1344c8721bbbSNaoya Horiguchi  * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
1345c8721bbbSNaoya Horiguchi  * make specified memory blocks removable from the system.
1346c8721bbbSNaoya Horiguchi  * Note that start_pfn should be aligned with the (minimum) hugepage size.
1347c8721bbbSNaoya Horiguchi  */
1348c8721bbbSNaoya Horiguchi void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
1349c8721bbbSNaoya Horiguchi {
1350c8721bbbSNaoya Horiguchi 	unsigned long pfn;
1351c8721bbbSNaoya Horiguchi 
1352d0177639SLi Zhong 	if (!hugepages_supported())
1353d0177639SLi Zhong 		return;
1354d0177639SLi Zhong 
1355641844f5SNaoya Horiguchi 	VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order));
1356641844f5SNaoya Horiguchi 	for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order)
1357c8721bbbSNaoya Horiguchi 		dissolve_free_huge_page(pfn_to_page(pfn));
1358c8721bbbSNaoya Horiguchi }
1359c8721bbbSNaoya Horiguchi 
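/*
 * Sketch of the memory-hotplug usage (the real caller lives in
 * mm/memory_hotplug.c): before a range is offlined, any free huge pages
 * inside it are dissolved back to buddy pages, roughly
 *
 *	dissolve_free_huge_pages(start_pfn, start_pfn + nr_pages);
 *
 * with start_pfn aligned to the smallest supported huge page size, which
 * is what the VM_BUG_ON() above enforces via minimum_order.
 */
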
1360bf50bab2SNaoya Horiguchi static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
13617893d1d5SAdam Litke {
13627893d1d5SAdam Litke 	struct page *page;
1363bf50bab2SNaoya Horiguchi 	unsigned int r_nid;
13647893d1d5SAdam Litke 
1365bae7f4aeSLuiz Capitulino 	if (hstate_is_gigantic(h))
1366aa888a74SAndi Kleen 		return NULL;
1367aa888a74SAndi Kleen 
1368d1c3fb1fSNishanth Aravamudan 	/*
1369d1c3fb1fSNishanth Aravamudan 	 * Assume we will successfully allocate the surplus page to
1370d1c3fb1fSNishanth Aravamudan 	 * prevent racing processes from causing the surplus to exceed
1371d1c3fb1fSNishanth Aravamudan 	 * overcommit
1372d1c3fb1fSNishanth Aravamudan 	 *
1373d1c3fb1fSNishanth Aravamudan 	 * This however introduces a different race, where a process B
1374d1c3fb1fSNishanth Aravamudan 	 * tries to grow the static hugepage pool while alloc_pages() is
1375d1c3fb1fSNishanth Aravamudan 	 * called by process A. B will only examine the per-node
1376d1c3fb1fSNishanth Aravamudan 	 * counters in determining if surplus huge pages can be
1377d1c3fb1fSNishanth Aravamudan 	 * converted to normal huge pages in adjust_pool_surplus(). A
1378d1c3fb1fSNishanth Aravamudan 	 * won't be able to increment the per-node counter, until the
1379d1c3fb1fSNishanth Aravamudan 	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
1380d1c3fb1fSNishanth Aravamudan 	 * no more huge pages can be converted from surplus to normal
1381d1c3fb1fSNishanth Aravamudan 	 * state (and doesn't try to convert again). Thus, we have a
1382d1c3fb1fSNishanth Aravamudan 	 * case where a surplus huge page exists, the pool is grown, and
1383d1c3fb1fSNishanth Aravamudan 	 * the surplus huge page still exists after, even though it
1384d1c3fb1fSNishanth Aravamudan 	 * should just have been converted to a normal huge page. This
1385d1c3fb1fSNishanth Aravamudan 	 * does not leak memory, though, as the hugepage will be freed
1386d1c3fb1fSNishanth Aravamudan 	 * once it is out of use. It also does not allow the counters to
1387d1c3fb1fSNishanth Aravamudan 	 * go out of whack in adjust_pool_surplus() as we don't modify
1388d1c3fb1fSNishanth Aravamudan 	 * the node values until we've gotten the hugepage and only the
1389d1c3fb1fSNishanth Aravamudan 	 * per-node value is checked there.
1390d1c3fb1fSNishanth Aravamudan 	 */
1391d1c3fb1fSNishanth Aravamudan 	spin_lock(&hugetlb_lock);
1392a5516438SAndi Kleen 	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
1393d1c3fb1fSNishanth Aravamudan 		spin_unlock(&hugetlb_lock);
1394d1c3fb1fSNishanth Aravamudan 		return NULL;
1395d1c3fb1fSNishanth Aravamudan 	} else {
1396a5516438SAndi Kleen 		h->nr_huge_pages++;
1397a5516438SAndi Kleen 		h->surplus_huge_pages++;
1398d1c3fb1fSNishanth Aravamudan 	}
1399d1c3fb1fSNishanth Aravamudan 	spin_unlock(&hugetlb_lock);
1400d1c3fb1fSNishanth Aravamudan 
1401bf50bab2SNaoya Horiguchi 	if (nid == NUMA_NO_NODE)
140286cdb465SNaoya Horiguchi 		page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
1403551883aeSNishanth Aravamudan 				   __GFP_REPEAT|__GFP_NOWARN,
1404a5516438SAndi Kleen 				   huge_page_order(h));
1405bf50bab2SNaoya Horiguchi 	else
1406bf50bab2SNaoya Horiguchi 		page = alloc_pages_exact_node(nid,
140786cdb465SNaoya Horiguchi 			htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
1408bf50bab2SNaoya Horiguchi 			__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
1409d1c3fb1fSNishanth Aravamudan 
14107893d1d5SAdam Litke 	spin_lock(&hugetlb_lock);
1411d1c3fb1fSNishanth Aravamudan 	if (page) {
14120edaecfaSAneesh Kumar K.V 		INIT_LIST_HEAD(&page->lru);
1413bf50bab2SNaoya Horiguchi 		r_nid = page_to_nid(page);
1414d1c3fb1fSNishanth Aravamudan 		set_compound_page_dtor(page, free_huge_page);
14159dd540e2SAneesh Kumar K.V 		set_hugetlb_cgroup(page, NULL);
1416d1c3fb1fSNishanth Aravamudan 		/*
1417d1c3fb1fSNishanth Aravamudan 		 * We incremented the global counters already
1418d1c3fb1fSNishanth Aravamudan 		 */
1419bf50bab2SNaoya Horiguchi 		h->nr_huge_pages_node[r_nid]++;
1420bf50bab2SNaoya Horiguchi 		h->surplus_huge_pages_node[r_nid]++;
14213b116300SAdam Litke 		__count_vm_event(HTLB_BUDDY_PGALLOC);
1422d1c3fb1fSNishanth Aravamudan 	} else {
1423a5516438SAndi Kleen 		h->nr_huge_pages--;
1424a5516438SAndi Kleen 		h->surplus_huge_pages--;
14253b116300SAdam Litke 		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
14267893d1d5SAdam Litke 	}
1427d1c3fb1fSNishanth Aravamudan 	spin_unlock(&hugetlb_lock);
14287893d1d5SAdam Litke 
14297893d1d5SAdam Litke 	return page;
14307893d1d5SAdam Litke }
14317893d1d5SAdam Litke 
1432e4e574b7SAdam Litke /*
1433bf50bab2SNaoya Horiguchi  * This allocation function is useful in the context where vma is irrelevant.
1434bf50bab2SNaoya Horiguchi  * E.g. soft-offlining uses this function because it only cares about the
1435bf50bab2SNaoya Horiguchi  * physical address of the error page.
1436bf50bab2SNaoya Horiguchi  */
1437bf50bab2SNaoya Horiguchi struct page *alloc_huge_page_node(struct hstate *h, int nid)
1438bf50bab2SNaoya Horiguchi {
14394ef91848SJoonsoo Kim 	struct page *page = NULL;
1440bf50bab2SNaoya Horiguchi 
1441bf50bab2SNaoya Horiguchi 	spin_lock(&hugetlb_lock);
14424ef91848SJoonsoo Kim 	if (h->free_huge_pages - h->resv_huge_pages > 0)
1443bf50bab2SNaoya Horiguchi 		page = dequeue_huge_page_node(h, nid);
1444bf50bab2SNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
1445bf50bab2SNaoya Horiguchi 
144694ae8ba7SAneesh Kumar K.V 	if (!page)
1447bf50bab2SNaoya Horiguchi 		page = alloc_buddy_huge_page(h, nid);
1448bf50bab2SNaoya Horiguchi 
1449bf50bab2SNaoya Horiguchi 	return page;
1450bf50bab2SNaoya Horiguchi }
1451bf50bab2SNaoya Horiguchi 
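/*
 * Sketch of the soft-offline style use mentioned above (the real logic
 * in mm/memory-failure.c does more checking than this): to migrate away
 * from a bad huge page 'p', a replacement is allocated near its node:
 *
 *	new = alloc_huge_page_node(page_hstate(compound_head(p)),
 *				   page_to_nid(p));
 *
 * No vma and no reservation are involved, which is the whole point of
 * this helper.
 */
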
1452bf50bab2SNaoya Horiguchi /*
145325985edcSLucas De Marchi  * Increase the hugetlb pool such that it can accommodate a reservation
1454e4e574b7SAdam Litke  * of size 'delta'.
1455e4e574b7SAdam Litke  */
1456a5516438SAndi Kleen static int gather_surplus_pages(struct hstate *h, int delta)
1457e4e574b7SAdam Litke {
1458e4e574b7SAdam Litke 	struct list_head surplus_list;
1459e4e574b7SAdam Litke 	struct page *page, *tmp;
1460e4e574b7SAdam Litke 	int ret, i;
1461e4e574b7SAdam Litke 	int needed, allocated;
146228073b02SHillf Danton 	bool alloc_ok = true;
1463e4e574b7SAdam Litke 
1464a5516438SAndi Kleen 	needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
1465ac09b3a1SAdam Litke 	if (needed <= 0) {
1466a5516438SAndi Kleen 		h->resv_huge_pages += delta;
1467e4e574b7SAdam Litke 		return 0;
1468ac09b3a1SAdam Litke 	}
1469e4e574b7SAdam Litke 
1470e4e574b7SAdam Litke 	allocated = 0;
1471e4e574b7SAdam Litke 	INIT_LIST_HEAD(&surplus_list);
1472e4e574b7SAdam Litke 
1473e4e574b7SAdam Litke 	ret = -ENOMEM;
1474e4e574b7SAdam Litke retry:
1475e4e574b7SAdam Litke 	spin_unlock(&hugetlb_lock);
1476e4e574b7SAdam Litke 	for (i = 0; i < needed; i++) {
1477bf50bab2SNaoya Horiguchi 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
147828073b02SHillf Danton 		if (!page) {
147928073b02SHillf Danton 			alloc_ok = false;
148028073b02SHillf Danton 			break;
148128073b02SHillf Danton 		}
1482e4e574b7SAdam Litke 		list_add(&page->lru, &surplus_list);
1483e4e574b7SAdam Litke 	}
148428073b02SHillf Danton 	allocated += i;
1485e4e574b7SAdam Litke 
1486e4e574b7SAdam Litke 	/*
1487e4e574b7SAdam Litke 	 * After retaking hugetlb_lock, we need to recalculate 'needed'
1488e4e574b7SAdam Litke 	 * because either resv_huge_pages or free_huge_pages may have changed.
1489e4e574b7SAdam Litke 	 */
1490e4e574b7SAdam Litke 	spin_lock(&hugetlb_lock);
1491a5516438SAndi Kleen 	needed = (h->resv_huge_pages + delta) -
1492a5516438SAndi Kleen 			(h->free_huge_pages + allocated);
149328073b02SHillf Danton 	if (needed > 0) {
149428073b02SHillf Danton 		if (alloc_ok)
1495e4e574b7SAdam Litke 			goto retry;
149628073b02SHillf Danton 		/*
149728073b02SHillf Danton 		 * We were not able to allocate enough pages to
149828073b02SHillf Danton 		 * satisfy the entire reservation so we free what
149928073b02SHillf Danton 		 * we've allocated so far.
150028073b02SHillf Danton 		 */
150128073b02SHillf Danton 		goto free;
150228073b02SHillf Danton 	}
1503e4e574b7SAdam Litke 	/*
1504e4e574b7SAdam Litke 	 * The surplus_list now contains _at_least_ the number of extra pages
150525985edcSLucas De Marchi 	 * needed to accommodate the reservation.  Add the appropriate number
1506e4e574b7SAdam Litke 	 * of pages to the hugetlb pool and free the extras back to the buddy
1507ac09b3a1SAdam Litke 	 * allocator.  Commit the entire reservation here to prevent another
1508ac09b3a1SAdam Litke 	 * process from stealing the pages as they are added to the pool but
1509ac09b3a1SAdam Litke 	 * before they are reserved.
1510e4e574b7SAdam Litke 	 */
1511e4e574b7SAdam Litke 	needed += allocated;
1512a5516438SAndi Kleen 	h->resv_huge_pages += delta;
1513e4e574b7SAdam Litke 	ret = 0;
1514a9869b83SNaoya Horiguchi 
151519fc3f0aSAdam Litke 	/* Free the needed pages to the hugetlb pool */
151619fc3f0aSAdam Litke 	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
151719fc3f0aSAdam Litke 		if ((--needed) < 0)
151819fc3f0aSAdam Litke 			break;
1519a9869b83SNaoya Horiguchi 		/*
1520a9869b83SNaoya Horiguchi 		 * This page is now managed by the hugetlb allocator and has
1521a9869b83SNaoya Horiguchi 		 * no users -- drop the buddy allocator's reference.
1522a9869b83SNaoya Horiguchi 		 */
1523a9869b83SNaoya Horiguchi 		put_page_testzero(page);
1524309381feSSasha Levin 		VM_BUG_ON_PAGE(page_count(page), page);
1525a5516438SAndi Kleen 		enqueue_huge_page(h, page);
152619fc3f0aSAdam Litke 	}
152728073b02SHillf Danton free:
1528b0365c8dSHillf Danton 	spin_unlock(&hugetlb_lock);
152919fc3f0aSAdam Litke 
153019fc3f0aSAdam Litke 	/* Free unnecessary surplus pages to the buddy allocator */
1531c0d934baSJoonsoo Kim 	list_for_each_entry_safe(page, tmp, &surplus_list, lru)
1532a9869b83SNaoya Horiguchi 		put_page(page);
153319fc3f0aSAdam Litke 	spin_lock(&hugetlb_lock);
1534e4e574b7SAdam Litke 
1535e4e574b7SAdam Litke 	return ret;
1536e4e574b7SAdam Litke }
1537e4e574b7SAdam Litke 
1538e4e574b7SAdam Litke /*
1539e4e574b7SAdam Litke  * When releasing a hugetlb pool reservation, any surplus pages that were
1540e4e574b7SAdam Litke  * allocated to satisfy the reservation must be explicitly freed if they were
1541e4e574b7SAdam Litke  * never used.
1542685f3457SLee Schermerhorn  * Called with hugetlb_lock held.
1543e4e574b7SAdam Litke  */
1544a5516438SAndi Kleen static void return_unused_surplus_pages(struct hstate *h,
1545a5516438SAndi Kleen 					unsigned long unused_resv_pages)
1546e4e574b7SAdam Litke {
1547e4e574b7SAdam Litke 	unsigned long nr_pages;
1548e4e574b7SAdam Litke 
1549ac09b3a1SAdam Litke 	/* Uncommit the reservation */
1550a5516438SAndi Kleen 	h->resv_huge_pages -= unused_resv_pages;
1551ac09b3a1SAdam Litke 
1552aa888a74SAndi Kleen 	/* Cannot return gigantic pages currently */
1553bae7f4aeSLuiz Capitulino 	if (hstate_is_gigantic(h))
1554aa888a74SAndi Kleen 		return;
1555aa888a74SAndi Kleen 
1556a5516438SAndi Kleen 	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
1557e4e574b7SAdam Litke 
1558685f3457SLee Schermerhorn 	/*
1559685f3457SLee Schermerhorn 	 * We want to release as many surplus pages as possible, spread
15609b5e5d0fSLee Schermerhorn 	 * evenly across all nodes with memory. Iterate across these nodes
15619b5e5d0fSLee Schermerhorn 	 * until we can no longer free unreserved surplus pages. This occurs
15629b5e5d0fSLee Schermerhorn 	 * when the nodes with surplus pages have no free pages.
15639b5e5d0fSLee Schermerhorn 	 * free_pool_huge_page() will balance the freed pages across the
15649b5e5d0fSLee Schermerhorn 	 * on-line nodes with memory and will handle the hstate accounting.
1565685f3457SLee Schermerhorn 	 */
1566685f3457SLee Schermerhorn 	while (nr_pages--) {
15678cebfcd0SLai Jiangshan 		if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
1568685f3457SLee Schermerhorn 			break;
15697848a4bfSMizuma, Masayoshi 		cond_resched_lock(&hugetlb_lock);
1570e4e574b7SAdam Litke 	}
1571e4e574b7SAdam Litke }
1572e4e574b7SAdam Litke 
15735e911373SMike Kravetz 
1574c37f9fb1SAndy Whitcroft /*
15755e911373SMike Kravetz  * vma_needs_reservation, vma_commit_reservation and vma_abort_reservation
15765e911373SMike Kravetz  * are used by the huge page allocation routines to manage reservations.
1577cf3ad20bSMike Kravetz  *
1578cf3ad20bSMike Kravetz  * vma_needs_reservation is called to determine if the huge page at addr
1579cf3ad20bSMike Kravetz  * within the vma has an associated reservation.  If a reservation is
1580cf3ad20bSMike Kravetz  * needed, the value 1 is returned.  The caller is then responsible for
1581cf3ad20bSMike Kravetz  * managing the global reservation and subpool usage counts.  After
1582cf3ad20bSMike Kravetz  * the huge page has been allocated, vma_commit_reservation is called
15835e911373SMike Kravetz  * to add the page to the reservation map.  If the reservation must be
15845e911373SMike Kravetz  * aborted instead of committed, vma_abort_reservation is called.
1585cf3ad20bSMike Kravetz  *
1586cf3ad20bSMike Kravetz  * In the normal case, vma_commit_reservation returns the same value
1587cf3ad20bSMike Kravetz  * as the preceding vma_needs_reservation call.  The only time this
1588cf3ad20bSMike Kravetz  * is not the case is if a reserve map was changed between calls.  It
1589cf3ad20bSMike Kravetz  * is the responsibility of the caller to notice the difference and
1590cf3ad20bSMike Kravetz  * take appropriate action.
1591c37f9fb1SAndy Whitcroft  */
15925e911373SMike Kravetz enum vma_resv_mode {
15935e911373SMike Kravetz 	VMA_NEEDS_RESV,
15945e911373SMike Kravetz 	VMA_COMMIT_RESV,
15955e911373SMike Kravetz 	VMA_ABORT_RESV,
15965e911373SMike Kravetz };
1597cf3ad20bSMike Kravetz static long __vma_reservation_common(struct hstate *h,
1598cf3ad20bSMike Kravetz 				struct vm_area_struct *vma, unsigned long addr,
15995e911373SMike Kravetz 				enum vma_resv_mode mode)
1600c37f9fb1SAndy Whitcroft {
16014e35f483SJoonsoo Kim 	struct resv_map *resv;
16024e35f483SJoonsoo Kim 	pgoff_t idx;
1603cf3ad20bSMike Kravetz 	long ret;
1604c37f9fb1SAndy Whitcroft 
16054e35f483SJoonsoo Kim 	resv = vma_resv_map(vma);
16064e35f483SJoonsoo Kim 	if (!resv)
1607c37f9fb1SAndy Whitcroft 		return 1;
1608c37f9fb1SAndy Whitcroft 
16094e35f483SJoonsoo Kim 	idx = vma_hugecache_offset(h, vma, addr);
16105e911373SMike Kravetz 	switch (mode) {
16115e911373SMike Kravetz 	case VMA_NEEDS_RESV:
1612cf3ad20bSMike Kravetz 		ret = region_chg(resv, idx, idx + 1);
16135e911373SMike Kravetz 		break;
16145e911373SMike Kravetz 	case VMA_COMMIT_RESV:
16155e911373SMike Kravetz 		ret = region_add(resv, idx, idx + 1);
16165e911373SMike Kravetz 		break;
16175e911373SMike Kravetz 	case VMA_ABORT_RESV:
16185e911373SMike Kravetz 		region_abort(resv, idx, idx + 1);
16195e911373SMike Kravetz 		ret = 0;
16205e911373SMike Kravetz 		break;
16215e911373SMike Kravetz 	default:
16225e911373SMike Kravetz 		BUG();
16235e911373SMike Kravetz 	}
162484afd99bSAndy Whitcroft 
16254e35f483SJoonsoo Kim 	if (vma->vm_flags & VM_MAYSHARE)
1626cf3ad20bSMike Kravetz 		return ret;
16274e35f483SJoonsoo Kim 	else
1628cf3ad20bSMike Kravetz 		return ret < 0 ? ret : 0;
162984afd99bSAndy Whitcroft }
1630cf3ad20bSMike Kravetz 
1631cf3ad20bSMike Kravetz static long vma_needs_reservation(struct hstate *h,
1632a5516438SAndi Kleen 			struct vm_area_struct *vma, unsigned long addr)
1633c37f9fb1SAndy Whitcroft {
16345e911373SMike Kravetz 	return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
1635cf3ad20bSMike Kravetz }
1636c37f9fb1SAndy Whitcroft 
1637cf3ad20bSMike Kravetz static long vma_commit_reservation(struct hstate *h,
1638cf3ad20bSMike Kravetz 			struct vm_area_struct *vma, unsigned long addr)
1639cf3ad20bSMike Kravetz {
16405e911373SMike Kravetz 	return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
16415e911373SMike Kravetz }
16425e911373SMike Kravetz 
16435e911373SMike Kravetz static void vma_abort_reservation(struct hstate *h,
16445e911373SMike Kravetz 			struct vm_area_struct *vma, unsigned long addr)
16455e911373SMike Kravetz {
16465e911373SMike Kravetz 	(void)__vma_reservation_common(h, vma, addr, VMA_ABORT_RESV);
1647c37f9fb1SAndy Whitcroft }
1648c37f9fb1SAndy Whitcroft 
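/*
 * Expected calling sequence for the helpers above, in sketch form
 * (alloc_huge_page() below is the canonical user):
 *
 *	chg = vma_needs_reservation(h, vma, addr);
 *	if (chg < 0)
 *		return ERR_PTR(-ENOMEM);
 *	...allocate the page, charge the subpool and cgroup...
 *	if (page)
 *		vma_commit_reservation(h, vma, addr);
 *	else
 *		vma_abort_reservation(h, vma, addr);
 *
 * Every successful vma_needs_reservation() is paired with exactly one
 * commit or abort so the reserve map stays consistent.
 */
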
1649348ea204SAdam Litke static struct page *alloc_huge_page(struct vm_area_struct *vma,
165004f2cbe3SMel Gorman 				    unsigned long addr, int avoid_reserve)
1651348ea204SAdam Litke {
165290481622SDavid Gibson 	struct hugepage_subpool *spool = subpool_vma(vma);
1653a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
1654348ea204SAdam Litke 	struct page *page;
165533039678SMike Kravetz 	long chg, commit;
16566d76dcf4SAneesh Kumar K.V 	int ret, idx;
16576d76dcf4SAneesh Kumar K.V 	struct hugetlb_cgroup *h_cg;
16582fc39cecSAdam Litke 
16596d76dcf4SAneesh Kumar K.V 	idx = hstate_index(h);
1660a1e78772SMel Gorman 	/*
166190481622SDavid Gibson 	 * Processes that did not create the mapping will have no
166290481622SDavid Gibson 	 * reserves and will not have accounted against subpool
166390481622SDavid Gibson 	 * limit. Check that the subpool limit can be made before
166490481622SDavid Gibson 	 * satisfying the allocation.  MAP_NORESERVE mappings may also
166590481622SDavid Gibson 	 * need pages and subpool limit allocated if no reserve
166690481622SDavid Gibson 	 * mapping overlaps.
1667a1e78772SMel Gorman 	 */
1668a5516438SAndi Kleen 	chg = vma_needs_reservation(h, vma, addr);
1669c37f9fb1SAndy Whitcroft 	if (chg < 0)
167076dcee75SAneesh Kumar K.V 		return ERR_PTR(-ENOMEM);
16718bb3f12eSJoonsoo Kim 	if (chg || avoid_reserve)
16725e911373SMike Kravetz 		if (hugepage_subpool_get_pages(spool, 1) < 0) {
16735e911373SMike Kravetz 			vma_abort_reservation(h, vma, addr);
167476dcee75SAneesh Kumar K.V 			return ERR_PTR(-ENOSPC);
16755e911373SMike Kravetz 		}
167690d8b7e6SAdam Litke 
16776d76dcf4SAneesh Kumar K.V 	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
16788f34af6fSJianyu Zhan 	if (ret)
16798f34af6fSJianyu Zhan 		goto out_subpool_put;
16808f34af6fSJianyu Zhan 
1681a1e78772SMel Gorman 	spin_lock(&hugetlb_lock);
1682af0ed73eSJoonsoo Kim 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
168381a6fcaeSJoonsoo Kim 	if (!page) {
168494ae8ba7SAneesh Kumar K.V 		spin_unlock(&hugetlb_lock);
1685bf50bab2SNaoya Horiguchi 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
16868f34af6fSJianyu Zhan 		if (!page)
16878f34af6fSJianyu Zhan 			goto out_uncharge_cgroup;
16888f34af6fSJianyu Zhan 
168979dbb236SAneesh Kumar K.V 		spin_lock(&hugetlb_lock);
169079dbb236SAneesh Kumar K.V 		list_move(&page->lru, &h->hugepage_activelist);
169181a6fcaeSJoonsoo Kim 		/* Fall through */
1692a1e78772SMel Gorman 	}
169381a6fcaeSJoonsoo Kim 	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
169481a6fcaeSJoonsoo Kim 	spin_unlock(&hugetlb_lock);
1695a1e78772SMel Gorman 
169690481622SDavid Gibson 	set_page_private(page, (unsigned long)spool);
1697a1e78772SMel Gorman 
169833039678SMike Kravetz 	commit = vma_commit_reservation(h, vma, addr);
169933039678SMike Kravetz 	if (unlikely(chg > commit)) {
170033039678SMike Kravetz 		/*
170133039678SMike Kravetz 		 * The page was added to the reservation map between
170233039678SMike Kravetz 		 * vma_needs_reservation and vma_commit_reservation.
170333039678SMike Kravetz 		 * This indicates a race with hugetlb_reserve_pages.
170433039678SMike Kravetz 		 * Adjust for the subpool count incremented above AND
170533039678SMike Kravetz 		 * in hugetlb_reserve_pages for the same page.  Also,
170633039678SMike Kravetz 		 * the reservation count added in hugetlb_reserve_pages
170733039678SMike Kravetz 		 * no longer applies.
170833039678SMike Kravetz 		 */
170933039678SMike Kravetz 		long rsv_adjust;
171033039678SMike Kravetz 
171133039678SMike Kravetz 		rsv_adjust = hugepage_subpool_put_pages(spool, 1);
171233039678SMike Kravetz 		hugetlb_acct_memory(h, -rsv_adjust);
171333039678SMike Kravetz 	}
17147893d1d5SAdam Litke 	return page;
17158f34af6fSJianyu Zhan 
17168f34af6fSJianyu Zhan out_uncharge_cgroup:
17178f34af6fSJianyu Zhan 	hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
17188f34af6fSJianyu Zhan out_subpool_put:
17198f34af6fSJianyu Zhan 	if (chg || avoid_reserve)
17208f34af6fSJianyu Zhan 		hugepage_subpool_put_pages(spool, 1);
17215e911373SMike Kravetz 	vma_abort_reservation(h, vma, addr);
17228f34af6fSJianyu Zhan 	return ERR_PTR(-ENOSPC);
1723b45b5bd6SDavid Gibson }
1724b45b5bd6SDavid Gibson 
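/*
 * Fault-path style usage, sketched (hugetlb_no_page()/hugetlb_cow() are
 * the real callers and map the error codes onto fault codes):
 *
 *	page = alloc_huge_page(vma, address, 0);
 *	if (IS_ERR(page))
 *		return PTR_ERR(page) == -ENOMEM ? VM_FAULT_OOM :
 *						  VM_FAULT_SIGBUS;
 *
 * -ENOMEM means the reservation bookkeeping itself failed, -ENOSPC means
 * no page could be obtained within the subpool/overcommit limits.
 */
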
172574060e4dSNaoya Horiguchi /*
172674060e4dSNaoya Horiguchi  * alloc_huge_page()'s wrapper which simply returns the page if allocation
172774060e4dSNaoya Horiguchi  * succeeds, otherwise NULL. This function is called from new_vma_page(),
172874060e4dSNaoya Horiguchi  * where no ERR_PTR() value is expected to be returned.
172974060e4dSNaoya Horiguchi  */
173074060e4dSNaoya Horiguchi struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
173174060e4dSNaoya Horiguchi 				unsigned long addr, int avoid_reserve)
173274060e4dSNaoya Horiguchi {
173374060e4dSNaoya Horiguchi 	struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
173474060e4dSNaoya Horiguchi 	if (IS_ERR(page))
173574060e4dSNaoya Horiguchi 		page = NULL;
173674060e4dSNaoya Horiguchi 	return page;
173774060e4dSNaoya Horiguchi }
173874060e4dSNaoya Horiguchi 
173991f47662SCyrill Gorcunov int __weak alloc_bootmem_huge_page(struct hstate *h)
1740aa888a74SAndi Kleen {
1741aa888a74SAndi Kleen 	struct huge_bootmem_page *m;
1742b2261026SJoonsoo Kim 	int nr_nodes, node;
1743aa888a74SAndi Kleen 
1744b2261026SJoonsoo Kim 	for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
1745aa888a74SAndi Kleen 		void *addr;
1746aa888a74SAndi Kleen 
17478b89a116SGrygorii Strashko 		addr = memblock_virt_alloc_try_nid_nopanic(
17488b89a116SGrygorii Strashko 				huge_page_size(h), huge_page_size(h),
17498b89a116SGrygorii Strashko 				0, BOOTMEM_ALLOC_ACCESSIBLE, node);
1750aa888a74SAndi Kleen 		if (addr) {
1751aa888a74SAndi Kleen 			/*
1752aa888a74SAndi Kleen 			 * Use the beginning of the huge page to store the
1753aa888a74SAndi Kleen 			 * huge_bootmem_page struct (until gather_bootmem
1754aa888a74SAndi Kleen 			 * puts them into the mem_map).
1755aa888a74SAndi Kleen 			 */
1756aa888a74SAndi Kleen 			m = addr;
1757aa888a74SAndi Kleen 			goto found;
1758aa888a74SAndi Kleen 		}
1759aa888a74SAndi Kleen 	}
1760aa888a74SAndi Kleen 	return 0;
1761aa888a74SAndi Kleen 
1762aa888a74SAndi Kleen found:
1763df994eadSLuiz Capitulino 	BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
1764aa888a74SAndi Kleen 	/* Put them into a private list first because mem_map is not up yet */
1765aa888a74SAndi Kleen 	list_add(&m->list, &huge_boot_pages);
1766aa888a74SAndi Kleen 	m->hstate = h;
1767aa888a74SAndi Kleen 	return 1;
1768aa888a74SAndi Kleen }
1769aa888a74SAndi Kleen 
1770f412c97aSDavid Rientjes static void __init prep_compound_huge_page(struct page *page, int order)
177118229df5SAndy Whitcroft {
177218229df5SAndy Whitcroft 	if (unlikely(order > (MAX_ORDER - 1)))
177318229df5SAndy Whitcroft 		prep_compound_gigantic_page(page, order);
177418229df5SAndy Whitcroft 	else
177518229df5SAndy Whitcroft 		prep_compound_page(page, order);
177618229df5SAndy Whitcroft }
177718229df5SAndy Whitcroft 
1778aa888a74SAndi Kleen /* Put bootmem huge pages into the standard lists after mem_map is up */
1779aa888a74SAndi Kleen static void __init gather_bootmem_prealloc(void)
1780aa888a74SAndi Kleen {
1781aa888a74SAndi Kleen 	struct huge_bootmem_page *m;
1782aa888a74SAndi Kleen 
1783aa888a74SAndi Kleen 	list_for_each_entry(m, &huge_boot_pages, list) {
1784aa888a74SAndi Kleen 		struct hstate *h = m->hstate;
1785ee8f248dSBecky Bruce 		struct page *page;
1786ee8f248dSBecky Bruce 
1787ee8f248dSBecky Bruce #ifdef CONFIG_HIGHMEM
1788ee8f248dSBecky Bruce 		page = pfn_to_page(m->phys >> PAGE_SHIFT);
17898b89a116SGrygorii Strashko 		memblock_free_late(__pa(m),
1790ee8f248dSBecky Bruce 				   sizeof(struct huge_bootmem_page));
1791ee8f248dSBecky Bruce #else
1792ee8f248dSBecky Bruce 		page = virt_to_page(m);
1793ee8f248dSBecky Bruce #endif
1794aa888a74SAndi Kleen 		WARN_ON(page_count(page) != 1);
179518229df5SAndy Whitcroft 		prep_compound_huge_page(page, h->order);
1796ef5a22beSAndrea Arcangeli 		WARN_ON(PageReserved(page));
1797aa888a74SAndi Kleen 		prep_new_huge_page(h, page, page_to_nid(page));
1798b0320c7bSRafael Aquini 		/*
1799b0320c7bSRafael Aquini 		 * If we had gigantic hugepages allocated at boot time, we need
1800b0320c7bSRafael Aquini 		 * to restore the 'stolen' pages to totalram_pages in order to
1801b0320c7bSRafael Aquini 		 * fix confusing memory reports from free(1) and another
1802b0320c7bSRafael Aquini 		 * fix confusing memory reports from free(1) and other
1803b0320c7bSRafael Aquini 		 */
1804bae7f4aeSLuiz Capitulino 		if (hstate_is_gigantic(h))
18053dcc0571SJiang Liu 			adjust_managed_page_count(page, 1 << h->order);
1806aa888a74SAndi Kleen 	}
1807aa888a74SAndi Kleen }
1808aa888a74SAndi Kleen 
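/*
 * Boot-time flow, roughly: a command line such as "hugepagesz=1G
 * hugepages=4" (on architectures that support gigantic pages) makes
 * hugetlb_hstate_alloc_pages() below call alloc_bootmem_huge_page()
 * once per requested page, and gather_bootmem_prealloc() later turns
 * those memblock allocations into proper compound huge pages once
 * mem_map is up.
 */
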
18098faa8b07SAndi Kleen static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
18101da177e4SLinus Torvalds {
18111da177e4SLinus Torvalds 	unsigned long i;
18121da177e4SLinus Torvalds 
1813e5ff2159SAndi Kleen 	for (i = 0; i < h->max_huge_pages; ++i) {
1814bae7f4aeSLuiz Capitulino 		if (hstate_is_gigantic(h)) {
1815aa888a74SAndi Kleen 			if (!alloc_bootmem_huge_page(h))
1816aa888a74SAndi Kleen 				break;
18179b5e5d0fSLee Schermerhorn 		} else if (!alloc_fresh_huge_page(h,
18188cebfcd0SLai Jiangshan 					 &node_states[N_MEMORY]))
18191da177e4SLinus Torvalds 			break;
18201da177e4SLinus Torvalds 	}
18218faa8b07SAndi Kleen 	h->max_huge_pages = i;
1822e5ff2159SAndi Kleen }
1823e5ff2159SAndi Kleen 
1824e5ff2159SAndi Kleen static void __init hugetlb_init_hstates(void)
1825e5ff2159SAndi Kleen {
1826e5ff2159SAndi Kleen 	struct hstate *h;
1827e5ff2159SAndi Kleen 
1828e5ff2159SAndi Kleen 	for_each_hstate(h) {
1829641844f5SNaoya Horiguchi 		if (minimum_order > huge_page_order(h))
1830641844f5SNaoya Horiguchi 			minimum_order = huge_page_order(h);
1831641844f5SNaoya Horiguchi 
18328faa8b07SAndi Kleen 		/* oversize hugepages were init'ed in early boot */
1833bae7f4aeSLuiz Capitulino 		if (!hstate_is_gigantic(h))
18348faa8b07SAndi Kleen 			hugetlb_hstate_alloc_pages(h);
1835e5ff2159SAndi Kleen 	}
1836641844f5SNaoya Horiguchi 	VM_BUG_ON(minimum_order == UINT_MAX);
1837e5ff2159SAndi Kleen }
1838e5ff2159SAndi Kleen 
18394abd32dbSAndi Kleen static char * __init memfmt(char *buf, unsigned long n)
18404abd32dbSAndi Kleen {
18414abd32dbSAndi Kleen 	if (n >= (1UL << 30))
18424abd32dbSAndi Kleen 		sprintf(buf, "%lu GB", n >> 30);
18434abd32dbSAndi Kleen 	else if (n >= (1UL << 20))
18444abd32dbSAndi Kleen 		sprintf(buf, "%lu MB", n >> 20);
18454abd32dbSAndi Kleen 	else
18464abd32dbSAndi Kleen 		sprintf(buf, "%lu KB", n >> 10);
18474abd32dbSAndi Kleen 	return buf;
18484abd32dbSAndi Kleen }
18494abd32dbSAndi Kleen 
1850e5ff2159SAndi Kleen static void __init report_hugepages(void)
1851e5ff2159SAndi Kleen {
1852e5ff2159SAndi Kleen 	struct hstate *h;
1853e5ff2159SAndi Kleen 
1854e5ff2159SAndi Kleen 	for_each_hstate(h) {
18554abd32dbSAndi Kleen 		char buf[32];
1856ffb22af5SAndrew Morton 		pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
18574abd32dbSAndi Kleen 			memfmt(buf, huge_page_size(h)),
18584abd32dbSAndi Kleen 			h->free_huge_pages);
1859e5ff2159SAndi Kleen 	}
1860e5ff2159SAndi Kleen }
1861e5ff2159SAndi Kleen 
18621da177e4SLinus Torvalds #ifdef CONFIG_HIGHMEM
18636ae11b27SLee Schermerhorn static void try_to_free_low(struct hstate *h, unsigned long count,
18646ae11b27SLee Schermerhorn 						nodemask_t *nodes_allowed)
18651da177e4SLinus Torvalds {
18664415cc8dSChristoph Lameter 	int i;
18674415cc8dSChristoph Lameter 
1868bae7f4aeSLuiz Capitulino 	if (hstate_is_gigantic(h))
1869aa888a74SAndi Kleen 		return;
1870aa888a74SAndi Kleen 
18716ae11b27SLee Schermerhorn 	for_each_node_mask(i, *nodes_allowed) {
18721da177e4SLinus Torvalds 		struct page *page, *next;
1873a5516438SAndi Kleen 		struct list_head *freel = &h->hugepage_freelists[i];
1874a5516438SAndi Kleen 		list_for_each_entry_safe(page, next, freel, lru) {
1875a5516438SAndi Kleen 			if (count >= h->nr_huge_pages)
18766b0c880dSAdam Litke 				return;
18771da177e4SLinus Torvalds 			if (PageHighMem(page))
18781da177e4SLinus Torvalds 				continue;
18791da177e4SLinus Torvalds 			list_del(&page->lru);
1880e5ff2159SAndi Kleen 			update_and_free_page(h, page);
1881a5516438SAndi Kleen 			h->free_huge_pages--;
1882a5516438SAndi Kleen 			h->free_huge_pages_node[page_to_nid(page)]--;
18831da177e4SLinus Torvalds 		}
18841da177e4SLinus Torvalds 	}
18851da177e4SLinus Torvalds }
18861da177e4SLinus Torvalds #else
18876ae11b27SLee Schermerhorn static inline void try_to_free_low(struct hstate *h, unsigned long count,
18886ae11b27SLee Schermerhorn 						nodemask_t *nodes_allowed)
18891da177e4SLinus Torvalds {
18901da177e4SLinus Torvalds }
18911da177e4SLinus Torvalds #endif
18921da177e4SLinus Torvalds 
189320a0307cSWu Fengguang /*
189420a0307cSWu Fengguang  * Increment or decrement surplus_huge_pages.  Keep node-specific counters
189520a0307cSWu Fengguang  * balanced by operating on them in a round-robin fashion.
189620a0307cSWu Fengguang  * Returns 1 if an adjustment was made.
189720a0307cSWu Fengguang  */
18986ae11b27SLee Schermerhorn static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
18996ae11b27SLee Schermerhorn 				int delta)
190020a0307cSWu Fengguang {
1901b2261026SJoonsoo Kim 	int nr_nodes, node;
190220a0307cSWu Fengguang 
190320a0307cSWu Fengguang 	VM_BUG_ON(delta != -1 && delta != 1);
190420a0307cSWu Fengguang 
1905e8c5c824SLee Schermerhorn 	if (delta < 0) {
1906b2261026SJoonsoo Kim 		for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1907b2261026SJoonsoo Kim 			if (h->surplus_huge_pages_node[node])
1908b2261026SJoonsoo Kim 				goto found;
1909b2261026SJoonsoo Kim 		}
1910b2261026SJoonsoo Kim 	} else {
1911b2261026SJoonsoo Kim 		for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
1912b2261026SJoonsoo Kim 			if (h->surplus_huge_pages_node[node] <
1913b2261026SJoonsoo Kim 					h->nr_huge_pages_node[node])
1914b2261026SJoonsoo Kim 				goto found;
1915e8c5c824SLee Schermerhorn 		}
19169a76db09SLee Schermerhorn 	}
1917b2261026SJoonsoo Kim 	return 0;
191820a0307cSWu Fengguang 
1919b2261026SJoonsoo Kim found:
192020a0307cSWu Fengguang 	h->surplus_huge_pages += delta;
1921b2261026SJoonsoo Kim 	h->surplus_huge_pages_node[node] += delta;
1922b2261026SJoonsoo Kim 	return 1;
192320a0307cSWu Fengguang }
192420a0307cSWu Fengguang 
1925a5516438SAndi Kleen #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
19266ae11b27SLee Schermerhorn static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
19276ae11b27SLee Schermerhorn 						nodemask_t *nodes_allowed)
19281da177e4SLinus Torvalds {
19297893d1d5SAdam Litke 	unsigned long min_count, ret;
19301da177e4SLinus Torvalds 
1931944d9fecSLuiz Capitulino 	if (hstate_is_gigantic(h) && !gigantic_page_supported())
1932aa888a74SAndi Kleen 		return h->max_huge_pages;
1933aa888a74SAndi Kleen 
19347893d1d5SAdam Litke 	/*
19357893d1d5SAdam Litke 	 * Increase the pool size
19367893d1d5SAdam Litke 	 * First take pages out of surplus state.  Then make up the
19377893d1d5SAdam Litke 	 * remaining difference by allocating fresh huge pages.
1938d1c3fb1fSNishanth Aravamudan 	 *
1939d1c3fb1fSNishanth Aravamudan 	 * We might race with alloc_buddy_huge_page() here and be unable
1940d1c3fb1fSNishanth Aravamudan 	 * to convert a surplus huge page to a normal huge page. That is
1941d1c3fb1fSNishanth Aravamudan 	 * not critical, though; it just means the overall size of the
1942d1c3fb1fSNishanth Aravamudan 	 * pool might be one hugepage larger than it needs to be, but
1943d1c3fb1fSNishanth Aravamudan 	 * within all the constraints specified by the sysctls.
19447893d1d5SAdam Litke 	 */
19451da177e4SLinus Torvalds 	spin_lock(&hugetlb_lock);
1946a5516438SAndi Kleen 	while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
19476ae11b27SLee Schermerhorn 		if (!adjust_pool_surplus(h, nodes_allowed, -1))
19487893d1d5SAdam Litke 			break;
19497893d1d5SAdam Litke 	}
19507893d1d5SAdam Litke 
1951a5516438SAndi Kleen 	while (count > persistent_huge_pages(h)) {
19527893d1d5SAdam Litke 		/*
19537893d1d5SAdam Litke 		 * If this allocation races such that we no longer need the
19547893d1d5SAdam Litke 		 * page, free_huge_page will handle it by freeing the page
19557893d1d5SAdam Litke 		 * and reducing the surplus.
19567893d1d5SAdam Litke 		 */
19577893d1d5SAdam Litke 		spin_unlock(&hugetlb_lock);
1958944d9fecSLuiz Capitulino 		if (hstate_is_gigantic(h))
1959944d9fecSLuiz Capitulino 			ret = alloc_fresh_gigantic_page(h, nodes_allowed);
1960944d9fecSLuiz Capitulino 		else
19616ae11b27SLee Schermerhorn 			ret = alloc_fresh_huge_page(h, nodes_allowed);
19627893d1d5SAdam Litke 		spin_lock(&hugetlb_lock);
19637893d1d5SAdam Litke 		if (!ret)
19647893d1d5SAdam Litke 			goto out;
19657893d1d5SAdam Litke 
1966536240f2SMel Gorman 		/* Bail for signals. Probably ctrl-c from user */
1967536240f2SMel Gorman 		if (signal_pending(current))
1968536240f2SMel Gorman 			goto out;
19697893d1d5SAdam Litke 	}
19707893d1d5SAdam Litke 
19717893d1d5SAdam Litke 	/*
19727893d1d5SAdam Litke 	 * Decrease the pool size
19737893d1d5SAdam Litke 	 * First return free pages to the buddy allocator (being careful
19747893d1d5SAdam Litke 	 * to keep enough around to satisfy reservations).  Then place
19757893d1d5SAdam Litke 	 * pages into surplus state as needed so the pool will shrink
19767893d1d5SAdam Litke 	 * to the desired size as pages become free.
1977d1c3fb1fSNishanth Aravamudan 	 *
1978d1c3fb1fSNishanth Aravamudan 	 * By placing pages into the surplus state independent of the
1979d1c3fb1fSNishanth Aravamudan 	 * overcommit value, we are allowing the surplus pool size to
1980d1c3fb1fSNishanth Aravamudan 	 * exceed overcommit. There are few sane options here. Since
1981d1c3fb1fSNishanth Aravamudan 	 * alloc_buddy_huge_page() is checking the global counter,
1982d1c3fb1fSNishanth Aravamudan 	 * though, we'll note that we're not allowed to exceed surplus
1983d1c3fb1fSNishanth Aravamudan 	 * and won't grow the pool anywhere else. Not until one of the
1984d1c3fb1fSNishanth Aravamudan 	 * sysctls are changed, or the surplus pages go out of use.
19857893d1d5SAdam Litke 	 */
1986a5516438SAndi Kleen 	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
19876b0c880dSAdam Litke 	min_count = max(count, min_count);
19886ae11b27SLee Schermerhorn 	try_to_free_low(h, min_count, nodes_allowed);
1989a5516438SAndi Kleen 	while (min_count < persistent_huge_pages(h)) {
19906ae11b27SLee Schermerhorn 		if (!free_pool_huge_page(h, nodes_allowed, 0))
19911da177e4SLinus Torvalds 			break;
199255f67141SMizuma, Masayoshi 		cond_resched_lock(&hugetlb_lock);
19931da177e4SLinus Torvalds 	}
1994a5516438SAndi Kleen 	while (count < persistent_huge_pages(h)) {
19956ae11b27SLee Schermerhorn 		if (!adjust_pool_surplus(h, nodes_allowed, 1))
19967893d1d5SAdam Litke 			break;
19977893d1d5SAdam Litke 	}
19987893d1d5SAdam Litke out:
1999a5516438SAndi Kleen 	ret = persistent_huge_pages(h);
20001da177e4SLinus Torvalds 	spin_unlock(&hugetlb_lock);
20017893d1d5SAdam Litke 	return ret;
20021da177e4SLinus Torvalds }
20031da177e4SLinus Torvalds 
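/*
 * All administrative paths funnel into set_max_huge_pages(); e.g. writing
 * "64" to /proc/sys/vm/nr_hugepages (or to a per-size sysfs file such as
 * /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages) ends up as
 * roughly
 *
 *	h->max_huge_pages = set_max_huge_pages(h, 64, nodes_allowed);
 *
 * with nodes_allowed covering all memory nodes, or just one node for the
 * per-node sysfs variants.
 */
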
2004a3437870SNishanth Aravamudan #define HSTATE_ATTR_RO(_name) \
2005a3437870SNishanth Aravamudan 	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
2006a3437870SNishanth Aravamudan 
2007a3437870SNishanth Aravamudan #define HSTATE_ATTR(_name) \
2008a3437870SNishanth Aravamudan 	static struct kobj_attribute _name##_attr = \
2009a3437870SNishanth Aravamudan 		__ATTR(_name, 0644, _name##_show, _name##_store)
2010a3437870SNishanth Aravamudan 
2011a3437870SNishanth Aravamudan static struct kobject *hugepages_kobj;
2012a3437870SNishanth Aravamudan static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
2013a3437870SNishanth Aravamudan 
20149a305230SLee Schermerhorn static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
20159a305230SLee Schermerhorn 
20169a305230SLee Schermerhorn static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
2017a3437870SNishanth Aravamudan {
2018a3437870SNishanth Aravamudan 	int i;
20199a305230SLee Schermerhorn 
2020a3437870SNishanth Aravamudan 	for (i = 0; i < HUGE_MAX_HSTATE; i++)
20219a305230SLee Schermerhorn 		if (hstate_kobjs[i] == kobj) {
20229a305230SLee Schermerhorn 			if (nidp)
20239a305230SLee Schermerhorn 				*nidp = NUMA_NO_NODE;
2024a3437870SNishanth Aravamudan 			return &hstates[i];
20259a305230SLee Schermerhorn 		}
20269a305230SLee Schermerhorn 
20279a305230SLee Schermerhorn 	return kobj_to_node_hstate(kobj, nidp);
2028a3437870SNishanth Aravamudan }
2029a3437870SNishanth Aravamudan 
203006808b08SLee Schermerhorn static ssize_t nr_hugepages_show_common(struct kobject *kobj,
2031a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
2032a3437870SNishanth Aravamudan {
20339a305230SLee Schermerhorn 	struct hstate *h;
20349a305230SLee Schermerhorn 	unsigned long nr_huge_pages;
20359a305230SLee Schermerhorn 	int nid;
20369a305230SLee Schermerhorn 
20379a305230SLee Schermerhorn 	h = kobj_to_hstate(kobj, &nid);
20389a305230SLee Schermerhorn 	if (nid == NUMA_NO_NODE)
20399a305230SLee Schermerhorn 		nr_huge_pages = h->nr_huge_pages;
20409a305230SLee Schermerhorn 	else
20419a305230SLee Schermerhorn 		nr_huge_pages = h->nr_huge_pages_node[nid];
20429a305230SLee Schermerhorn 
20439a305230SLee Schermerhorn 	return sprintf(buf, "%lu\n", nr_huge_pages);
2044a3437870SNishanth Aravamudan }
2045adbe8726SEric B Munson 
2046238d3c13SDavid Rientjes static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
2047238d3c13SDavid Rientjes 					   struct hstate *h, int nid,
2048238d3c13SDavid Rientjes 					   unsigned long count, size_t len)
2049a3437870SNishanth Aravamudan {
2050a3437870SNishanth Aravamudan 	int err;
2051bad44b5bSDavid Rientjes 	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
2052a3437870SNishanth Aravamudan 
2053944d9fecSLuiz Capitulino 	if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
2054adbe8726SEric B Munson 		err = -EINVAL;
2055adbe8726SEric B Munson 		goto out;
2056adbe8726SEric B Munson 	}
2057adbe8726SEric B Munson 
20589a305230SLee Schermerhorn 	if (nid == NUMA_NO_NODE) {
20599a305230SLee Schermerhorn 		/*
20609a305230SLee Schermerhorn 		 * global hstate attribute
20619a305230SLee Schermerhorn 		 */
20629a305230SLee Schermerhorn 		if (!(obey_mempolicy &&
20639a305230SLee Schermerhorn 				init_nodemask_of_mempolicy(nodes_allowed))) {
206406808b08SLee Schermerhorn 			NODEMASK_FREE(nodes_allowed);
20658cebfcd0SLai Jiangshan 			nodes_allowed = &node_states[N_MEMORY];
206606808b08SLee Schermerhorn 		}
20679a305230SLee Schermerhorn 	} else if (nodes_allowed) {
20689a305230SLee Schermerhorn 		/*
20699a305230SLee Schermerhorn 		 * per node hstate attribute: adjust count to global,
20709a305230SLee Schermerhorn 		 * but restrict alloc/free to the specified node.
20719a305230SLee Schermerhorn 		 */
20729a305230SLee Schermerhorn 		count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
20739a305230SLee Schermerhorn 		init_nodemask_of_node(nodes_allowed, nid);
20749a305230SLee Schermerhorn 	} else
20758cebfcd0SLai Jiangshan 		nodes_allowed = &node_states[N_MEMORY];
20769a305230SLee Schermerhorn 
207706808b08SLee Schermerhorn 	h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
2078a3437870SNishanth Aravamudan 
20798cebfcd0SLai Jiangshan 	if (nodes_allowed != &node_states[N_MEMORY])
208006808b08SLee Schermerhorn 		NODEMASK_FREE(nodes_allowed);
208106808b08SLee Schermerhorn 
208206808b08SLee Schermerhorn 	return len;
2083adbe8726SEric B Munson out:
2084adbe8726SEric B Munson 	NODEMASK_FREE(nodes_allowed);
2085adbe8726SEric B Munson 	return err;
208606808b08SLee Schermerhorn }
208706808b08SLee Schermerhorn 
2088238d3c13SDavid Rientjes static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
2089238d3c13SDavid Rientjes 					 struct kobject *kobj, const char *buf,
2090238d3c13SDavid Rientjes 					 size_t len)
2091238d3c13SDavid Rientjes {
2092238d3c13SDavid Rientjes 	struct hstate *h;
2093238d3c13SDavid Rientjes 	unsigned long count;
2094238d3c13SDavid Rientjes 	int nid;
2095238d3c13SDavid Rientjes 	int err;
2096238d3c13SDavid Rientjes 
2097238d3c13SDavid Rientjes 	err = kstrtoul(buf, 10, &count);
2098238d3c13SDavid Rientjes 	if (err)
2099238d3c13SDavid Rientjes 		return err;
2100238d3c13SDavid Rientjes 
2101238d3c13SDavid Rientjes 	h = kobj_to_hstate(kobj, &nid);
2102238d3c13SDavid Rientjes 	return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
2103238d3c13SDavid Rientjes }
2104238d3c13SDavid Rientjes 
210506808b08SLee Schermerhorn static ssize_t nr_hugepages_show(struct kobject *kobj,
210606808b08SLee Schermerhorn 				       struct kobj_attribute *attr, char *buf)
210706808b08SLee Schermerhorn {
210806808b08SLee Schermerhorn 	return nr_hugepages_show_common(kobj, attr, buf);
210906808b08SLee Schermerhorn }
211006808b08SLee Schermerhorn 
211106808b08SLee Schermerhorn static ssize_t nr_hugepages_store(struct kobject *kobj,
211206808b08SLee Schermerhorn 	       struct kobj_attribute *attr, const char *buf, size_t len)
211306808b08SLee Schermerhorn {
2114238d3c13SDavid Rientjes 	return nr_hugepages_store_common(false, kobj, buf, len);
2115a3437870SNishanth Aravamudan }
2116a3437870SNishanth Aravamudan HSTATE_ATTR(nr_hugepages);
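
/*
 * The attributes above appear under /sys/kernel/mm/hugepages/ in a
 * per-hstate directory named after h->name, e.g. hugepages-2048kB/ for
 * 2MB pages; writing 512 to hugepages-2048kB/nr_hugepages requests a
 * pool of 512 such pages.  The exact directories present depend on the
 * hstates the architecture registers.
 */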
2117a3437870SNishanth Aravamudan 
211806808b08SLee Schermerhorn #ifdef CONFIG_NUMA
211906808b08SLee Schermerhorn 
212006808b08SLee Schermerhorn /*
212106808b08SLee Schermerhorn  * hstate attribute for an optional mempolicy-based constraint on persistent
212206808b08SLee Schermerhorn  * huge page alloc/free.
212306808b08SLee Schermerhorn  */
212406808b08SLee Schermerhorn static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
212506808b08SLee Schermerhorn 				       struct kobj_attribute *attr, char *buf)
212606808b08SLee Schermerhorn {
212706808b08SLee Schermerhorn 	return nr_hugepages_show_common(kobj, attr, buf);
212806808b08SLee Schermerhorn }
212906808b08SLee Schermerhorn 
213006808b08SLee Schermerhorn static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
213106808b08SLee Schermerhorn 	       struct kobj_attribute *attr, const char *buf, size_t len)
213206808b08SLee Schermerhorn {
2133238d3c13SDavid Rientjes 	return nr_hugepages_store_common(true, kobj, buf, len);
213406808b08SLee Schermerhorn }
213506808b08SLee Schermerhorn HSTATE_ATTR(nr_hugepages_mempolicy);
213606808b08SLee Schermerhorn #endif
213706808b08SLee Schermerhorn 
213806808b08SLee Schermerhorn 
2139a3437870SNishanth Aravamudan static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
2140a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
2141a3437870SNishanth Aravamudan {
21429a305230SLee Schermerhorn 	struct hstate *h = kobj_to_hstate(kobj, NULL);
2143a3437870SNishanth Aravamudan 	return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
2144a3437870SNishanth Aravamudan }
2145adbe8726SEric B Munson 
2146a3437870SNishanth Aravamudan static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
2147a3437870SNishanth Aravamudan 		struct kobj_attribute *attr, const char *buf, size_t count)
2148a3437870SNishanth Aravamudan {
2149a3437870SNishanth Aravamudan 	int err;
2150a3437870SNishanth Aravamudan 	unsigned long input;
21519a305230SLee Schermerhorn 	struct hstate *h = kobj_to_hstate(kobj, NULL);
2152a3437870SNishanth Aravamudan 
2153bae7f4aeSLuiz Capitulino 	if (hstate_is_gigantic(h))
2154adbe8726SEric B Munson 		return -EINVAL;
2155adbe8726SEric B Munson 
21563dbb95f7SJingoo Han 	err = kstrtoul(buf, 10, &input);
2157a3437870SNishanth Aravamudan 	if (err)
215873ae31e5SEric B Munson 		return err;
2159a3437870SNishanth Aravamudan 
2160a3437870SNishanth Aravamudan 	spin_lock(&hugetlb_lock);
2161a3437870SNishanth Aravamudan 	h->nr_overcommit_huge_pages = input;
2162a3437870SNishanth Aravamudan 	spin_unlock(&hugetlb_lock);
2163a3437870SNishanth Aravamudan 
2164a3437870SNishanth Aravamudan 	return count;
2165a3437870SNishanth Aravamudan }
2166a3437870SNishanth Aravamudan HSTATE_ATTR(nr_overcommit_hugepages);
2167a3437870SNishanth Aravamudan 
2168a3437870SNishanth Aravamudan static ssize_t free_hugepages_show(struct kobject *kobj,
2169a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
2170a3437870SNishanth Aravamudan {
21719a305230SLee Schermerhorn 	struct hstate *h;
21729a305230SLee Schermerhorn 	unsigned long free_huge_pages;
21739a305230SLee Schermerhorn 	int nid;
21749a305230SLee Schermerhorn 
21759a305230SLee Schermerhorn 	h = kobj_to_hstate(kobj, &nid);
21769a305230SLee Schermerhorn 	if (nid == NUMA_NO_NODE)
21779a305230SLee Schermerhorn 		free_huge_pages = h->free_huge_pages;
21789a305230SLee Schermerhorn 	else
21799a305230SLee Schermerhorn 		free_huge_pages = h->free_huge_pages_node[nid];
21809a305230SLee Schermerhorn 
21819a305230SLee Schermerhorn 	return sprintf(buf, "%lu\n", free_huge_pages);
2182a3437870SNishanth Aravamudan }
2183a3437870SNishanth Aravamudan HSTATE_ATTR_RO(free_hugepages);
2184a3437870SNishanth Aravamudan 
2185a3437870SNishanth Aravamudan static ssize_t resv_hugepages_show(struct kobject *kobj,
2186a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
2187a3437870SNishanth Aravamudan {
21889a305230SLee Schermerhorn 	struct hstate *h = kobj_to_hstate(kobj, NULL);
2189a3437870SNishanth Aravamudan 	return sprintf(buf, "%lu\n", h->resv_huge_pages);
2190a3437870SNishanth Aravamudan }
2191a3437870SNishanth Aravamudan HSTATE_ATTR_RO(resv_hugepages);
2192a3437870SNishanth Aravamudan 
2193a3437870SNishanth Aravamudan static ssize_t surplus_hugepages_show(struct kobject *kobj,
2194a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
2195a3437870SNishanth Aravamudan {
21969a305230SLee Schermerhorn 	struct hstate *h;
21979a305230SLee Schermerhorn 	unsigned long surplus_huge_pages;
21989a305230SLee Schermerhorn 	int nid;
21999a305230SLee Schermerhorn 
22009a305230SLee Schermerhorn 	h = kobj_to_hstate(kobj, &nid);
22019a305230SLee Schermerhorn 	if (nid == NUMA_NO_NODE)
22029a305230SLee Schermerhorn 		surplus_huge_pages = h->surplus_huge_pages;
22039a305230SLee Schermerhorn 	else
22049a305230SLee Schermerhorn 		surplus_huge_pages = h->surplus_huge_pages_node[nid];
22059a305230SLee Schermerhorn 
22069a305230SLee Schermerhorn 	return sprintf(buf, "%lu\n", surplus_huge_pages);
2207a3437870SNishanth Aravamudan }
2208a3437870SNishanth Aravamudan HSTATE_ATTR_RO(surplus_hugepages);
2209a3437870SNishanth Aravamudan 
2210a3437870SNishanth Aravamudan static struct attribute *hstate_attrs[] = {
2211a3437870SNishanth Aravamudan 	&nr_hugepages_attr.attr,
2212a3437870SNishanth Aravamudan 	&nr_overcommit_hugepages_attr.attr,
2213a3437870SNishanth Aravamudan 	&free_hugepages_attr.attr,
2214a3437870SNishanth Aravamudan 	&resv_hugepages_attr.attr,
2215a3437870SNishanth Aravamudan 	&surplus_hugepages_attr.attr,
221606808b08SLee Schermerhorn #ifdef CONFIG_NUMA
221706808b08SLee Schermerhorn 	&nr_hugepages_mempolicy_attr.attr,
221806808b08SLee Schermerhorn #endif
2219a3437870SNishanth Aravamudan 	NULL,
2220a3437870SNishanth Aravamudan };
2221a3437870SNishanth Aravamudan 
2222a3437870SNishanth Aravamudan static struct attribute_group hstate_attr_group = {
2223a3437870SNishanth Aravamudan 	.attrs = hstate_attrs,
2224a3437870SNishanth Aravamudan };
2225a3437870SNishanth Aravamudan 
2226094e9539SJeff Mahoney static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
22279a305230SLee Schermerhorn 				    struct kobject **hstate_kobjs,
22289a305230SLee Schermerhorn 				    struct attribute_group *hstate_attr_group)
2229a3437870SNishanth Aravamudan {
2230a3437870SNishanth Aravamudan 	int retval;
2231972dc4deSAneesh Kumar K.V 	int hi = hstate_index(h);
2232a3437870SNishanth Aravamudan 
22339a305230SLee Schermerhorn 	hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
22349a305230SLee Schermerhorn 	if (!hstate_kobjs[hi])
2235a3437870SNishanth Aravamudan 		return -ENOMEM;
2236a3437870SNishanth Aravamudan 
22379a305230SLee Schermerhorn 	retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
2238a3437870SNishanth Aravamudan 	if (retval)
22399a305230SLee Schermerhorn 		kobject_put(hstate_kobjs[hi]);
2240a3437870SNishanth Aravamudan 
2241a3437870SNishanth Aravamudan 	return retval;
2242a3437870SNishanth Aravamudan }
2243a3437870SNishanth Aravamudan 
2244a3437870SNishanth Aravamudan static void __init hugetlb_sysfs_init(void)
2245a3437870SNishanth Aravamudan {
2246a3437870SNishanth Aravamudan 	struct hstate *h;
2247a3437870SNishanth Aravamudan 	int err;
2248a3437870SNishanth Aravamudan 
2249a3437870SNishanth Aravamudan 	hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
2250a3437870SNishanth Aravamudan 	if (!hugepages_kobj)
2251a3437870SNishanth Aravamudan 		return;
2252a3437870SNishanth Aravamudan 
2253a3437870SNishanth Aravamudan 	for_each_hstate(h) {
22549a305230SLee Schermerhorn 		err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
22559a305230SLee Schermerhorn 					 hstate_kobjs, &hstate_attr_group);
2256a3437870SNishanth Aravamudan 		if (err)
2257ffb22af5SAndrew Morton 			pr_err("Hugetlb: Unable to add hstate %s", h->name);
2258a3437870SNishanth Aravamudan 	}
2259a3437870SNishanth Aravamudan }
2260a3437870SNishanth Aravamudan 
22619a305230SLee Schermerhorn #ifdef CONFIG_NUMA
22629a305230SLee Schermerhorn 
22639a305230SLee Schermerhorn /*
22649a305230SLee Schermerhorn  * node_hstate/s - associate per node hstate attributes, via their kobjects,
226510fbcf4cSKay Sievers  * with node devices in node_devices[] using a parallel array.  The array
226610fbcf4cSKay Sievers  * index of a node device or node_hstate equals the node id.
226710fbcf4cSKay Sievers  * This is here to avoid any static dependency of the node device driver, in
22689a305230SLee Schermerhorn  * the base kernel, on the hugetlb module.
22699a305230SLee Schermerhorn  */
22709a305230SLee Schermerhorn struct node_hstate {
22719a305230SLee Schermerhorn 	struct kobject		*hugepages_kobj;
22729a305230SLee Schermerhorn 	struct kobject		*hstate_kobjs[HUGE_MAX_HSTATE];
22739a305230SLee Schermerhorn };
22749a305230SLee Schermerhorn struct node_hstate node_hstates[MAX_NUMNODES];
22759a305230SLee Schermerhorn 
22769a305230SLee Schermerhorn /*
227710fbcf4cSKay Sievers  * A subset of global hstate attributes for node devices
22789a305230SLee Schermerhorn  */
22799a305230SLee Schermerhorn static struct attribute *per_node_hstate_attrs[] = {
22809a305230SLee Schermerhorn 	&nr_hugepages_attr.attr,
22819a305230SLee Schermerhorn 	&free_hugepages_attr.attr,
22829a305230SLee Schermerhorn 	&surplus_hugepages_attr.attr,
22839a305230SLee Schermerhorn 	NULL,
22849a305230SLee Schermerhorn };
22859a305230SLee Schermerhorn 
22869a305230SLee Schermerhorn static struct attribute_group per_node_hstate_attr_group = {
22879a305230SLee Schermerhorn 	.attrs = per_node_hstate_attrs,
22889a305230SLee Schermerhorn };
22899a305230SLee Schermerhorn 
22909a305230SLee Schermerhorn /*
229110fbcf4cSKay Sievers  * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
22929a305230SLee Schermerhorn  * Returns node id via non-NULL nidp.
22939a305230SLee Schermerhorn  */
22949a305230SLee Schermerhorn static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
22959a305230SLee Schermerhorn {
22969a305230SLee Schermerhorn 	int nid;
22979a305230SLee Schermerhorn 
22989a305230SLee Schermerhorn 	for (nid = 0; nid < nr_node_ids; nid++) {
22999a305230SLee Schermerhorn 		struct node_hstate *nhs = &node_hstates[nid];
23009a305230SLee Schermerhorn 		int i;
23019a305230SLee Schermerhorn 		for (i = 0; i < HUGE_MAX_HSTATE; i++)
23029a305230SLee Schermerhorn 			if (nhs->hstate_kobjs[i] == kobj) {
23039a305230SLee Schermerhorn 				if (nidp)
23049a305230SLee Schermerhorn 					*nidp = nid;
23059a305230SLee Schermerhorn 				return &hstates[i];
23069a305230SLee Schermerhorn 			}
23079a305230SLee Schermerhorn 	}
23089a305230SLee Schermerhorn 
23099a305230SLee Schermerhorn 	BUG();
23109a305230SLee Schermerhorn 	return NULL;
23119a305230SLee Schermerhorn }
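
/*
 * The lookup above is a brute-force scan: every node's array of hstate
 * kobjects is compared against the kobject the attribute was accessed
 * through.  A miss ends in BUG() since, presumably, only kobjects
 * created by hugetlb_register_node() below can ever reach this helper.
 */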
23129a305230SLee Schermerhorn 
23139a305230SLee Schermerhorn /*
231410fbcf4cSKay Sievers  * Unregister hstate attributes from a single node device.
23159a305230SLee Schermerhorn  * No-op if no hstate attributes attached.
23169a305230SLee Schermerhorn  */
23173cd8b44fSClaudiu Ghioc static void hugetlb_unregister_node(struct node *node)
23189a305230SLee Schermerhorn {
23199a305230SLee Schermerhorn 	struct hstate *h;
232010fbcf4cSKay Sievers 	struct node_hstate *nhs = &node_hstates[node->dev.id];
23219a305230SLee Schermerhorn 
23229a305230SLee Schermerhorn 	if (!nhs->hugepages_kobj)
23239b5e5d0fSLee Schermerhorn 		return;		/* no hstate attributes */
23249a305230SLee Schermerhorn 
2325972dc4deSAneesh Kumar K.V 	for_each_hstate(h) {
2326972dc4deSAneesh Kumar K.V 		int idx = hstate_index(h);
2327972dc4deSAneesh Kumar K.V 		if (nhs->hstate_kobjs[idx]) {
2328972dc4deSAneesh Kumar K.V 			kobject_put(nhs->hstate_kobjs[idx]);
2329972dc4deSAneesh Kumar K.V 			nhs->hstate_kobjs[idx] = NULL;
2330972dc4deSAneesh Kumar K.V 		}
23319a305230SLee Schermerhorn 	}
23329a305230SLee Schermerhorn 
23339a305230SLee Schermerhorn 	kobject_put(nhs->hugepages_kobj);
23349a305230SLee Schermerhorn 	nhs->hugepages_kobj = NULL;
23359a305230SLee Schermerhorn }
23369a305230SLee Schermerhorn 
23379a305230SLee Schermerhorn /*
233810fbcf4cSKay Sievers  * hugetlb module exit:  unregister hstate attributes from node devices
23399a305230SLee Schermerhorn  * that have them.
23409a305230SLee Schermerhorn  */
23419a305230SLee Schermerhorn static void hugetlb_unregister_all_nodes(void)
23429a305230SLee Schermerhorn {
23439a305230SLee Schermerhorn 	int nid;
23449a305230SLee Schermerhorn 
23459a305230SLee Schermerhorn 	/*
234610fbcf4cSKay Sievers 	 * disable node device registrations.
23479a305230SLee Schermerhorn 	 */
23489a305230SLee Schermerhorn 	register_hugetlbfs_with_node(NULL, NULL);
23499a305230SLee Schermerhorn 
23509a305230SLee Schermerhorn 	/*
23519a305230SLee Schermerhorn 	 * remove hstate attributes from any nodes that have them.
23529a305230SLee Schermerhorn 	 */
23539a305230SLee Schermerhorn 	for (nid = 0; nid < nr_node_ids; nid++)
23548732794bSWen Congyang 		hugetlb_unregister_node(node_devices[nid]);
23559a305230SLee Schermerhorn }
23569a305230SLee Schermerhorn 
23579a305230SLee Schermerhorn /*
235810fbcf4cSKay Sievers  * Register hstate attributes for a single node device.
23599a305230SLee Schermerhorn  * No-op if attributes already registered.
23609a305230SLee Schermerhorn  */
23613cd8b44fSClaudiu Ghioc static void hugetlb_register_node(struct node *node)
23629a305230SLee Schermerhorn {
23639a305230SLee Schermerhorn 	struct hstate *h;
236410fbcf4cSKay Sievers 	struct node_hstate *nhs = &node_hstates[node->dev.id];
23659a305230SLee Schermerhorn 	int err;
23669a305230SLee Schermerhorn 
23679a305230SLee Schermerhorn 	if (nhs->hugepages_kobj)
23689a305230SLee Schermerhorn 		return;		/* already allocated */
23699a305230SLee Schermerhorn 
23709a305230SLee Schermerhorn 	nhs->hugepages_kobj = kobject_create_and_add("hugepages",
237110fbcf4cSKay Sievers 							&node->dev.kobj);
23729a305230SLee Schermerhorn 	if (!nhs->hugepages_kobj)
23739a305230SLee Schermerhorn 		return;
23749a305230SLee Schermerhorn 
23759a305230SLee Schermerhorn 	for_each_hstate(h) {
23769a305230SLee Schermerhorn 		err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
23779a305230SLee Schermerhorn 						nhs->hstate_kobjs,
23789a305230SLee Schermerhorn 						&per_node_hstate_attr_group);
23799a305230SLee Schermerhorn 		if (err) {
2380ffb22af5SAndrew Morton 			pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
238110fbcf4cSKay Sievers 				h->name, node->dev.id);
23829a305230SLee Schermerhorn 			hugetlb_unregister_node(node);
23839a305230SLee Schermerhorn 			break;
23849a305230SLee Schermerhorn 		}
23859a305230SLee Schermerhorn 	}
23869a305230SLee Schermerhorn }
23879a305230SLee Schermerhorn 
23889a305230SLee Schermerhorn /*
23899b5e5d0fSLee Schermerhorn  * hugetlb init time:  register hstate attributes for all registered node
239010fbcf4cSKay Sievers  * devices of nodes that have memory.  All on-line nodes should have
239110fbcf4cSKay Sievers  * registered their associated device by this time.
23929a305230SLee Schermerhorn  */
23937d9ca000SLuiz Capitulino static void __init hugetlb_register_all_nodes(void)
23949a305230SLee Schermerhorn {
23959a305230SLee Schermerhorn 	int nid;
23969a305230SLee Schermerhorn 
23978cebfcd0SLai Jiangshan 	for_each_node_state(nid, N_MEMORY) {
23988732794bSWen Congyang 		struct node *node = node_devices[nid];
239910fbcf4cSKay Sievers 		if (node->dev.id == nid)
24009a305230SLee Schermerhorn 			hugetlb_register_node(node);
24019a305230SLee Schermerhorn 	}
24029a305230SLee Schermerhorn 
24039a305230SLee Schermerhorn 	/*
240410fbcf4cSKay Sievers 	 * Let the node device driver know we're here so it can
24059a305230SLee Schermerhorn 	 * [un]register hstate attributes on node hotplug.
24069a305230SLee Schermerhorn 	 */
24079a305230SLee Schermerhorn 	register_hugetlbfs_with_node(hugetlb_register_node,
24089a305230SLee Schermerhorn 				     hugetlb_unregister_node);
24099a305230SLee Schermerhorn }
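
/*
 * Note that register_hugetlbfs_with_node() hands the register/unregister
 * callbacks to the node driver, so nodes hot-added after boot also get a
 * hugepages/ directory and hot-removed nodes lose theirs;
 * hugetlb_unregister_all_nodes() passes NULL callbacks to switch this
 * off again at module exit.
 */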
24109a305230SLee Schermerhorn #else	/* !CONFIG_NUMA */
24119a305230SLee Schermerhorn 
24129a305230SLee Schermerhorn static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
24139a305230SLee Schermerhorn {
24149a305230SLee Schermerhorn 	BUG();
24159a305230SLee Schermerhorn 	if (nidp)
24169a305230SLee Schermerhorn 		*nidp = -1;
24179a305230SLee Schermerhorn 	return NULL;
24189a305230SLee Schermerhorn }
24199a305230SLee Schermerhorn 
24209a305230SLee Schermerhorn static void hugetlb_unregister_all_nodes(void) { }
24219a305230SLee Schermerhorn 
24229a305230SLee Schermerhorn static void hugetlb_register_all_nodes(void) { }
24239a305230SLee Schermerhorn 
24249a305230SLee Schermerhorn #endif
24259a305230SLee Schermerhorn 
2426a3437870SNishanth Aravamudan static void __exit hugetlb_exit(void)
2427a3437870SNishanth Aravamudan {
2428a3437870SNishanth Aravamudan 	struct hstate *h;
2429a3437870SNishanth Aravamudan 
24309a305230SLee Schermerhorn 	hugetlb_unregister_all_nodes();
24319a305230SLee Schermerhorn 
2432a3437870SNishanth Aravamudan 	for_each_hstate(h) {
2433972dc4deSAneesh Kumar K.V 		kobject_put(hstate_kobjs[hstate_index(h)]);
2434a3437870SNishanth Aravamudan 	}
2435a3437870SNishanth Aravamudan 
2436a3437870SNishanth Aravamudan 	kobject_put(hugepages_kobj);
24378382d914SDavidlohr Bueso 	kfree(htlb_fault_mutex_table);
2438a3437870SNishanth Aravamudan }
2439a3437870SNishanth Aravamudan module_exit(hugetlb_exit);
2440a3437870SNishanth Aravamudan 
2441a3437870SNishanth Aravamudan static int __init hugetlb_init(void)
2442a3437870SNishanth Aravamudan {
24438382d914SDavidlohr Bueso 	int i;
24448382d914SDavidlohr Bueso 
2445457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
24460ef89d25SBenjamin Herrenschmidt 		return 0;
2447a3437870SNishanth Aravamudan 
2448e11bfbfcSNick Piggin 	if (!size_to_hstate(default_hstate_size)) {
2449e11bfbfcSNick Piggin 		default_hstate_size = HPAGE_SIZE;
2450e11bfbfcSNick Piggin 		if (!size_to_hstate(default_hstate_size))
2451a3437870SNishanth Aravamudan 			hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
2452a3437870SNishanth Aravamudan 	}
2453972dc4deSAneesh Kumar K.V 	default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
2454e11bfbfcSNick Piggin 	if (default_hstate_max_huge_pages)
2455e11bfbfcSNick Piggin 		default_hstate.max_huge_pages = default_hstate_max_huge_pages;
2456a3437870SNishanth Aravamudan 
2457a3437870SNishanth Aravamudan 	hugetlb_init_hstates();
2458aa888a74SAndi Kleen 	gather_bootmem_prealloc();
2459a3437870SNishanth Aravamudan 	report_hugepages();
2460a3437870SNishanth Aravamudan 
2461a3437870SNishanth Aravamudan 	hugetlb_sysfs_init();
24629a305230SLee Schermerhorn 	hugetlb_register_all_nodes();
24637179e7bfSJianguo Wu 	hugetlb_cgroup_file_init();
24649a305230SLee Schermerhorn 
24658382d914SDavidlohr Bueso #ifdef CONFIG_SMP
24668382d914SDavidlohr Bueso 	num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
24678382d914SDavidlohr Bueso #else
24688382d914SDavidlohr Bueso 	num_fault_mutexes = 1;
24698382d914SDavidlohr Bueso #endif
24708382d914SDavidlohr Bueso 	htlb_fault_mutex_table =
24718382d914SDavidlohr Bueso 		kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
24728382d914SDavidlohr Bueso 	BUG_ON(!htlb_fault_mutex_table);
24738382d914SDavidlohr Bueso 
24748382d914SDavidlohr Bueso 	for (i = 0; i < num_fault_mutexes; i++)
24758382d914SDavidlohr Bueso 		mutex_init(&htlb_fault_mutex_table[i]);
2476a3437870SNishanth Aravamudan 	return 0;
2477a3437870SNishanth Aravamudan }
2478a3437870SNishanth Aravamudan module_init(hugetlb_init);
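
/*
 * Illustrative sizing of the fault mutex table set up above: with 16
 * possible CPUs, 8 * 16 = 128 mutexes are allocated; with 6 CPUs,
 * 8 * 6 = 48 is rounded up to 64.  Keeping num_fault_mutexes a power of
 * two presumably allows a hash of the faulting address to be reduced to
 * a table index with a simple mask rather than a division.
 */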
2479a3437870SNishanth Aravamudan 
2480a3437870SNishanth Aravamudan /* Should be called on processing a hugepagesz=... option */
2481a3437870SNishanth Aravamudan void __init hugetlb_add_hstate(unsigned order)
2482a3437870SNishanth Aravamudan {
2483a3437870SNishanth Aravamudan 	struct hstate *h;
24848faa8b07SAndi Kleen 	unsigned long i;
24858faa8b07SAndi Kleen 
2486a3437870SNishanth Aravamudan 	if (size_to_hstate(PAGE_SIZE << order)) {
2487ffb22af5SAndrew Morton 		pr_warning("hugepagesz= specified twice, ignoring\n");
2488a3437870SNishanth Aravamudan 		return;
2489a3437870SNishanth Aravamudan 	}
249047d38344SAneesh Kumar K.V 	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
2491a3437870SNishanth Aravamudan 	BUG_ON(order == 0);
249247d38344SAneesh Kumar K.V 	h = &hstates[hugetlb_max_hstate++];
2493a3437870SNishanth Aravamudan 	h->order = order;
2494a3437870SNishanth Aravamudan 	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
24958faa8b07SAndi Kleen 	h->nr_huge_pages = 0;
24968faa8b07SAndi Kleen 	h->free_huge_pages = 0;
24978faa8b07SAndi Kleen 	for (i = 0; i < MAX_NUMNODES; ++i)
24988faa8b07SAndi Kleen 		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
24990edaecfaSAneesh Kumar K.V 	INIT_LIST_HEAD(&h->hugepage_activelist);
25008cebfcd0SLai Jiangshan 	h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
25018cebfcd0SLai Jiangshan 	h->next_nid_to_free = first_node(node_states[N_MEMORY]);
2502a3437870SNishanth Aravamudan 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
2503a3437870SNishanth Aravamudan 					huge_page_size(h)/1024);
25048faa8b07SAndi Kleen 
2505a3437870SNishanth Aravamudan 	parsed_hstate = h;
2506a3437870SNishanth Aravamudan }
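
/*
 * Worked example for the fields set above, assuming 4KB base pages
 * (PAGE_SHIFT == 12) and a 2MB huge page size, i.e. order == 9:
 *
 *	huge_page_size(h) = PAGE_SIZE << 9 = 0x200000 (2MB)
 *	h->mask           = ~(0x200000ULL - 1) = 0x...ffe00000
 *	h->name           = "hugepages-2048kB" (0x200000 / 1024 == 2048)
 *
 * so "addr & huge_page_mask(h)" rounds an address down to the start of
 * the huge page containing it.
 */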
2507a3437870SNishanth Aravamudan 
2508e11bfbfcSNick Piggin static int __init hugetlb_nrpages_setup(char *s)
2509a3437870SNishanth Aravamudan {
2510a3437870SNishanth Aravamudan 	unsigned long *mhp;
25118faa8b07SAndi Kleen 	static unsigned long *last_mhp;
2512a3437870SNishanth Aravamudan 
2513a3437870SNishanth Aravamudan 	/*
251447d38344SAneesh Kumar K.V 	 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
2515a3437870SNishanth Aravamudan 	 * so this hugepages= parameter goes to the "default hstate".
2516a3437870SNishanth Aravamudan 	 */
251747d38344SAneesh Kumar K.V 	if (!hugetlb_max_hstate)
2518a3437870SNishanth Aravamudan 		mhp = &default_hstate_max_huge_pages;
2519a3437870SNishanth Aravamudan 	else
2520a3437870SNishanth Aravamudan 		mhp = &parsed_hstate->max_huge_pages;
2521a3437870SNishanth Aravamudan 
25228faa8b07SAndi Kleen 	if (mhp == last_mhp) {
2523ffb22af5SAndrew Morton 		pr_warning("hugepages= specified twice without "
25248faa8b07SAndi Kleen 			   "interleaving hugepagesz=, ignoring\n");
25258faa8b07SAndi Kleen 		return 1;
25268faa8b07SAndi Kleen 	}
25278faa8b07SAndi Kleen 
2528a3437870SNishanth Aravamudan 	if (sscanf(s, "%lu", mhp) <= 0)
2529a3437870SNishanth Aravamudan 		*mhp = 0;
2530a3437870SNishanth Aravamudan 
25318faa8b07SAndi Kleen 	/*
25328faa8b07SAndi Kleen 	 * Global state is always initialized later in hugetlb_init.
25338faa8b07SAndi Kleen 	 * But we need to allocate pages for hstates of order >= MAX_ORDER
25348faa8b07SAndi Kleen 	 * here, early enough to still use the bootmem allocator.
25358faa8b07SAndi Kleen 	 */
253647d38344SAneesh Kumar K.V 	if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
25378faa8b07SAndi Kleen 		hugetlb_hstate_alloc_pages(parsed_hstate);
25388faa8b07SAndi Kleen 
25398faa8b07SAndi Kleen 	last_mhp = mhp;
25408faa8b07SAndi Kleen 
2541a3437870SNishanth Aravamudan 	return 1;
2542a3437870SNishanth Aravamudan }
2543e11bfbfcSNick Piggin __setup("hugepages=", hugetlb_nrpages_setup);
2544e11bfbfcSNick Piggin 
2545e11bfbfcSNick Piggin static int __init hugetlb_default_setup(char *s)
2546e11bfbfcSNick Piggin {
2547e11bfbfcSNick Piggin 	default_hstate_size = memparse(s, &s);
2548e11bfbfcSNick Piggin 	return 1;
2549e11bfbfcSNick Piggin }
2550e11bfbfcSNick Piggin __setup("default_hugepagesz=", hugetlb_default_setup);
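
/*
 * Example boot command line (hypothetical sizes) exercising the options
 * handled above:
 *
 *	default_hugepagesz=1G hugepagesz=1G hugepages=4 hugepagesz=2M hugepages=512
 *
 * Each hugepages= applies to the hstate created by the preceding
 * hugepagesz= (or to the default hstate if none has been seen yet);
 * last_mhp is what rejects two consecutive hugepages= for the same
 * hstate.
 */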
2551a3437870SNishanth Aravamudan 
25528a213460SNishanth Aravamudan static unsigned int cpuset_mems_nr(unsigned int *array)
25538a213460SNishanth Aravamudan {
25548a213460SNishanth Aravamudan 	int node;
25558a213460SNishanth Aravamudan 	unsigned int nr = 0;
25568a213460SNishanth Aravamudan 
25578a213460SNishanth Aravamudan 	for_each_node_mask(node, cpuset_current_mems_allowed)
25588a213460SNishanth Aravamudan 		nr += array[node];
25598a213460SNishanth Aravamudan 
25608a213460SNishanth Aravamudan 	return nr;
25618a213460SNishanth Aravamudan }
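
/*
 * cpuset_mems_nr() above sums one of the per-node counter arrays
 * (e.g. h->free_huge_pages_node[]) over the nodes in the current task's
 * cpuset; hugetlb_acct_memory() below uses it as a best-effort check
 * that a new reservation could actually be satisfied from those nodes.
 */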
25628a213460SNishanth Aravamudan 
25638a213460SNishanth Aravamudan #ifdef CONFIG_SYSCTL
256406808b08SLee Schermerhorn static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
256506808b08SLee Schermerhorn 			 struct ctl_table *table, int write,
256606808b08SLee Schermerhorn 			 void __user *buffer, size_t *length, loff_t *ppos)
25671da177e4SLinus Torvalds {
2568e5ff2159SAndi Kleen 	struct hstate *h = &default_hstate;
2569238d3c13SDavid Rientjes 	unsigned long tmp = h->max_huge_pages;
257008d4a246SMichal Hocko 	int ret;
2571e5ff2159SAndi Kleen 
2572457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
2573457c1b27SNishanth Aravamudan 		return -ENOTSUPP;
2574457c1b27SNishanth Aravamudan 
2575e5ff2159SAndi Kleen 	table->data = &tmp;
2576e5ff2159SAndi Kleen 	table->maxlen = sizeof(unsigned long);
257708d4a246SMichal Hocko 	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
257808d4a246SMichal Hocko 	if (ret)
257908d4a246SMichal Hocko 		goto out;
2580e5ff2159SAndi Kleen 
2581238d3c13SDavid Rientjes 	if (write)
2582238d3c13SDavid Rientjes 		ret = __nr_hugepages_store_common(obey_mempolicy, h,
2583238d3c13SDavid Rientjes 						  NUMA_NO_NODE, tmp, *length);
258408d4a246SMichal Hocko out:
258508d4a246SMichal Hocko 	return ret;
25861da177e4SLinus Torvalds }
2587396faf03SMel Gorman 
258806808b08SLee Schermerhorn int hugetlb_sysctl_handler(struct ctl_table *table, int write,
258906808b08SLee Schermerhorn 			  void __user *buffer, size_t *length, loff_t *ppos)
259006808b08SLee Schermerhorn {
259106808b08SLee Schermerhorn 
259206808b08SLee Schermerhorn 	return hugetlb_sysctl_handler_common(false, table, write,
259306808b08SLee Schermerhorn 							buffer, length, ppos);
259406808b08SLee Schermerhorn }
259506808b08SLee Schermerhorn 
259606808b08SLee Schermerhorn #ifdef CONFIG_NUMA
259706808b08SLee Schermerhorn int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
259806808b08SLee Schermerhorn 			  void __user *buffer, size_t *length, loff_t *ppos)
259906808b08SLee Schermerhorn {
260006808b08SLee Schermerhorn 	return hugetlb_sysctl_handler_common(true, table, write,
260106808b08SLee Schermerhorn 							buffer, length, ppos);
260206808b08SLee Schermerhorn }
260306808b08SLee Schermerhorn #endif /* CONFIG_NUMA */
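
/*
 * The handlers above back the vm.nr_hugepages and, with CONFIG_NUMA,
 * vm.nr_hugepages_mempolicy sysctls (the table entries live outside this
 * file).  Both funnel into the same __nr_hugepages_store_common() used
 * by the sysfs attributes, passing NUMA_NO_NODE since a sysctl write is
 * never node-specific.
 */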
260406808b08SLee Schermerhorn 
2605a3d0c6aaSNishanth Aravamudan int hugetlb_overcommit_handler(struct ctl_table *table, int write,
26068d65af78SAlexey Dobriyan 			void __user *buffer,
2607a3d0c6aaSNishanth Aravamudan 			size_t *length, loff_t *ppos)
2608a3d0c6aaSNishanth Aravamudan {
2609a5516438SAndi Kleen 	struct hstate *h = &default_hstate;
2610e5ff2159SAndi Kleen 	unsigned long tmp;
261108d4a246SMichal Hocko 	int ret;
2612e5ff2159SAndi Kleen 
2613457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
2614457c1b27SNishanth Aravamudan 		return -ENOTSUPP;
2615457c1b27SNishanth Aravamudan 
2616e5ff2159SAndi Kleen 	tmp = h->nr_overcommit_huge_pages;
2617e5ff2159SAndi Kleen 
2618bae7f4aeSLuiz Capitulino 	if (write && hstate_is_gigantic(h))
2619adbe8726SEric B Munson 		return -EINVAL;
2620adbe8726SEric B Munson 
2621e5ff2159SAndi Kleen 	table->data = &tmp;
2622e5ff2159SAndi Kleen 	table->maxlen = sizeof(unsigned long);
262308d4a246SMichal Hocko 	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
262408d4a246SMichal Hocko 	if (ret)
262508d4a246SMichal Hocko 		goto out;
2626e5ff2159SAndi Kleen 
2627e5ff2159SAndi Kleen 	if (write) {
2628064d9efeSNishanth Aravamudan 		spin_lock(&hugetlb_lock);
2629e5ff2159SAndi Kleen 		h->nr_overcommit_huge_pages = tmp;
2630a3d0c6aaSNishanth Aravamudan 		spin_unlock(&hugetlb_lock);
2631e5ff2159SAndi Kleen 	}
263208d4a246SMichal Hocko out:
263308d4a246SMichal Hocko 	return ret;
2634a3d0c6aaSNishanth Aravamudan }
2635a3d0c6aaSNishanth Aravamudan 
26361da177e4SLinus Torvalds #endif /* CONFIG_SYSCTL */
26371da177e4SLinus Torvalds 
2638e1759c21SAlexey Dobriyan void hugetlb_report_meminfo(struct seq_file *m)
26391da177e4SLinus Torvalds {
2640a5516438SAndi Kleen 	struct hstate *h = &default_hstate;
2641457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
2642457c1b27SNishanth Aravamudan 		return;
2643e1759c21SAlexey Dobriyan 	seq_printf(m,
26441da177e4SLinus Torvalds 			"HugePages_Total:   %5lu\n"
26451da177e4SLinus Torvalds 			"HugePages_Free:    %5lu\n"
2646b45b5bd6SDavid Gibson 			"HugePages_Rsvd:    %5lu\n"
26477893d1d5SAdam Litke 			"HugePages_Surp:    %5lu\n"
26484f98a2feSRik van Riel 			"Hugepagesize:   %8lu kB\n",
2649a5516438SAndi Kleen 			h->nr_huge_pages,
2650a5516438SAndi Kleen 			h->free_huge_pages,
2651a5516438SAndi Kleen 			h->resv_huge_pages,
2652a5516438SAndi Kleen 			h->surplus_huge_pages,
2653a5516438SAndi Kleen 			1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
26541da177e4SLinus Torvalds }
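
/*
 * Example of the output produced above (values are made up, and only the
 * default hstate is reported):
 *
 *	HugePages_Total:     512
 *	HugePages_Free:      300
 *	HugePages_Rsvd:       12
 *	HugePages_Surp:        0
 *	Hugepagesize:       2048 kB
 *
 * The last line shifts by huge_page_order + PAGE_SHIFT - 10, i.e. the
 * huge page size in kB: 1 << (9 + 12 - 10) = 2048 for 2MB pages on a
 * 4KB base page size.
 */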
26551da177e4SLinus Torvalds 
26561da177e4SLinus Torvalds int hugetlb_report_node_meminfo(int nid, char *buf)
26571da177e4SLinus Torvalds {
2658a5516438SAndi Kleen 	struct hstate *h = &default_hstate;
2659457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
2660457c1b27SNishanth Aravamudan 		return 0;
26611da177e4SLinus Torvalds 	return sprintf(buf,
26621da177e4SLinus Torvalds 		"Node %d HugePages_Total: %5u\n"
2663a1de0919SNishanth Aravamudan 		"Node %d HugePages_Free:  %5u\n"
2664a1de0919SNishanth Aravamudan 		"Node %d HugePages_Surp:  %5u\n",
2665a5516438SAndi Kleen 		nid, h->nr_huge_pages_node[nid],
2666a5516438SAndi Kleen 		nid, h->free_huge_pages_node[nid],
2667a5516438SAndi Kleen 		nid, h->surplus_huge_pages_node[nid]);
26681da177e4SLinus Torvalds }
26691da177e4SLinus Torvalds 
2670949f7ec5SDavid Rientjes void hugetlb_show_meminfo(void)
2671949f7ec5SDavid Rientjes {
2672949f7ec5SDavid Rientjes 	struct hstate *h;
2673949f7ec5SDavid Rientjes 	int nid;
2674949f7ec5SDavid Rientjes 
2675457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
2676457c1b27SNishanth Aravamudan 		return;
2677457c1b27SNishanth Aravamudan 
2678949f7ec5SDavid Rientjes 	for_each_node_state(nid, N_MEMORY)
2679949f7ec5SDavid Rientjes 		for_each_hstate(h)
2680949f7ec5SDavid Rientjes 			pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
2681949f7ec5SDavid Rientjes 				nid,
2682949f7ec5SDavid Rientjes 				h->nr_huge_pages_node[nid],
2683949f7ec5SDavid Rientjes 				h->free_huge_pages_node[nid],
2684949f7ec5SDavid Rientjes 				h->surplus_huge_pages_node[nid],
2685949f7ec5SDavid Rientjes 				1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
2686949f7ec5SDavid Rientjes }
2687949f7ec5SDavid Rientjes 
26881da177e4SLinus Torvalds /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
26891da177e4SLinus Torvalds unsigned long hugetlb_total_pages(void)
26901da177e4SLinus Torvalds {
2691d0028588SWanpeng Li 	struct hstate *h;
2692d0028588SWanpeng Li 	unsigned long nr_total_pages = 0;
2693d0028588SWanpeng Li 
2694d0028588SWanpeng Li 	for_each_hstate(h)
2695d0028588SWanpeng Li 		nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
2696d0028588SWanpeng Li 	return nr_total_pages;
26971da177e4SLinus Torvalds }
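
/*
 * Example (hypothetical pools, 4KB base pages): 512 2MB huge pages
 * contribute 512 * 512 = 262144 base pages and four 1GB pages contribute
 * 4 * 262144 = 1048576, so hugetlb_total_pages() returns 1310720.
 */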
26981da177e4SLinus Torvalds 
2699a5516438SAndi Kleen static int hugetlb_acct_memory(struct hstate *h, long delta)
2700fc1b8a73SMel Gorman {
2701fc1b8a73SMel Gorman 	int ret = -ENOMEM;
2702fc1b8a73SMel Gorman 
2703fc1b8a73SMel Gorman 	spin_lock(&hugetlb_lock);
2704fc1b8a73SMel Gorman 	/*
2705fc1b8a73SMel Gorman 	 * When cpuset is configured, it breaks the strict hugetlb page
2706fc1b8a73SMel Gorman 	 * reservation as the accounting is done on a global variable. Such
2707fc1b8a73SMel Gorman 	 * reservation is completely rubbish in the presence of cpuset because
2708fc1b8a73SMel Gorman 	 * the reservation is not checked against page availability for the
2709fc1b8a73SMel Gorman 	 * current cpuset. The application can still potentially be OOM'ed by
2710fc1b8a73SMel Gorman 	 * the kernel for lack of a free htlb page in the cpuset the task is in.
2711fc1b8a73SMel Gorman 	 * Attempting to enforce strict accounting with cpuset is almost
2712fc1b8a73SMel Gorman 	 * impossible (or too ugly) because cpuset is so fluid that a
2713fc1b8a73SMel Gorman 	 * task or memory node can be dynamically moved between cpusets.
2714fc1b8a73SMel Gorman 	 *
2715fc1b8a73SMel Gorman 	 * The change of semantics for shared hugetlb mapping with cpuset is
2716fc1b8a73SMel Gorman 	 * undesirable. However, in order to preserve some of the semantics,
2717fc1b8a73SMel Gorman 	 * we fall back to check against current free page availability as
2718fc1b8a73SMel Gorman 	 * a best attempt and hopefully to minimize the impact of changing
2719fc1b8a73SMel Gorman 	 * semantics that cpuset has.
2720fc1b8a73SMel Gorman 	 */
2721fc1b8a73SMel Gorman 	if (delta > 0) {
2722a5516438SAndi Kleen 		if (gather_surplus_pages(h, delta) < 0)
2723fc1b8a73SMel Gorman 			goto out;
2724fc1b8a73SMel Gorman 
2725a5516438SAndi Kleen 		if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
2726a5516438SAndi Kleen 			return_unused_surplus_pages(h, delta);
2727fc1b8a73SMel Gorman 			goto out;
2728fc1b8a73SMel Gorman 		}
2729fc1b8a73SMel Gorman 	}
2730fc1b8a73SMel Gorman 
2731fc1b8a73SMel Gorman 	ret = 0;
2732fc1b8a73SMel Gorman 	if (delta < 0)
2733a5516438SAndi Kleen 		return_unused_surplus_pages(h, (unsigned long) -delta);
2734fc1b8a73SMel Gorman 
2735fc1b8a73SMel Gorman out:
2736fc1b8a73SMel Gorman 	spin_unlock(&hugetlb_lock);
2737fc1b8a73SMel Gorman 	return ret;
2738fc1b8a73SMel Gorman }
2739fc1b8a73SMel Gorman 
274084afd99bSAndy Whitcroft static void hugetlb_vm_op_open(struct vm_area_struct *vma)
274184afd99bSAndy Whitcroft {
2742f522c3acSJoonsoo Kim 	struct resv_map *resv = vma_resv_map(vma);
274384afd99bSAndy Whitcroft 
274484afd99bSAndy Whitcroft 	/*
274584afd99bSAndy Whitcroft 	 * This new VMA should share its siblings reservation map if present.
274684afd99bSAndy Whitcroft 	 * The VMA will only ever have a valid reservation map pointer where
274784afd99bSAndy Whitcroft 	 * it is being copied for another still existing VMA.  As that VMA
274825985edcSLucas De Marchi 	 * has a reference to the reservation map it cannot disappear until
274984afd99bSAndy Whitcroft 	 * after this open call completes.  It is therefore safe to take a
275084afd99bSAndy Whitcroft 	 * new reference here without additional locking.
275184afd99bSAndy Whitcroft 	 */
27524e35f483SJoonsoo Kim 	if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
2753f522c3acSJoonsoo Kim 		kref_get(&resv->refs);
275484afd99bSAndy Whitcroft }
275584afd99bSAndy Whitcroft 
2756a1e78772SMel Gorman static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2757a1e78772SMel Gorman {
2758a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
2759f522c3acSJoonsoo Kim 	struct resv_map *resv = vma_resv_map(vma);
276090481622SDavid Gibson 	struct hugepage_subpool *spool = subpool_vma(vma);
27614e35f483SJoonsoo Kim 	unsigned long reserve, start, end;
27621c5ecae3SMike Kravetz 	long gbl_reserve;
276384afd99bSAndy Whitcroft 
27644e35f483SJoonsoo Kim 	if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
27654e35f483SJoonsoo Kim 		return;
27664e35f483SJoonsoo Kim 
2767a5516438SAndi Kleen 	start = vma_hugecache_offset(h, vma, vma->vm_start);
2768a5516438SAndi Kleen 	end = vma_hugecache_offset(h, vma, vma->vm_end);
276984afd99bSAndy Whitcroft 
27704e35f483SJoonsoo Kim 	reserve = (end - start) - region_count(resv, start, end);
277184afd99bSAndy Whitcroft 
2772f031dd27SJoonsoo Kim 	kref_put(&resv->refs, resv_map_release);
277384afd99bSAndy Whitcroft 
27747251ff78SAdam Litke 	if (reserve) {
27751c5ecae3SMike Kravetz 		/*
27761c5ecae3SMike Kravetz 		 * Decrement reserve counts.  The global reserve count may be
27771c5ecae3SMike Kravetz 		 * adjusted if the subpool has a minimum size.
27781c5ecae3SMike Kravetz 		 */
27791c5ecae3SMike Kravetz 		gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
27801c5ecae3SMike Kravetz 		hugetlb_acct_memory(h, -gbl_reserve);
27817251ff78SAdam Litke 	}
2782a1e78772SMel Gorman }
2783a1e78772SMel Gorman 
27841da177e4SLinus Torvalds /*
27851da177e4SLinus Torvalds  * We cannot handle pagefaults against hugetlb pages at all.  They cause
27861da177e4SLinus Torvalds  * handle_mm_fault() to try to instantiate regular-sized pages in the
27871da177e4SLinus Torvalds  * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
27881da177e4SLinus Torvalds  * this far.
27891da177e4SLinus Torvalds  */
2790d0217ac0SNick Piggin static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
27911da177e4SLinus Torvalds {
27921da177e4SLinus Torvalds 	BUG();
2793d0217ac0SNick Piggin 	return 0;
27941da177e4SLinus Torvalds }
27951da177e4SLinus Torvalds 
2796f0f37e2fSAlexey Dobriyan const struct vm_operations_struct hugetlb_vm_ops = {
2797d0217ac0SNick Piggin 	.fault = hugetlb_vm_op_fault,
279884afd99bSAndy Whitcroft 	.open = hugetlb_vm_op_open,
2799a1e78772SMel Gorman 	.close = hugetlb_vm_op_close,
28001da177e4SLinus Torvalds };
28011da177e4SLinus Torvalds 
28021e8f889bSDavid Gibson static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
28031e8f889bSDavid Gibson 				int writable)
280463551ae0SDavid Gibson {
280563551ae0SDavid Gibson 	pte_t entry;
280663551ae0SDavid Gibson 
28071e8f889bSDavid Gibson 	if (writable) {
2808106c992aSGerald Schaefer 		entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
2809106c992aSGerald Schaefer 					 vma->vm_page_prot)));
281063551ae0SDavid Gibson 	} else {
2811106c992aSGerald Schaefer 		entry = huge_pte_wrprotect(mk_huge_pte(page,
2812106c992aSGerald Schaefer 					   vma->vm_page_prot));
281363551ae0SDavid Gibson 	}
281463551ae0SDavid Gibson 	entry = pte_mkyoung(entry);
281563551ae0SDavid Gibson 	entry = pte_mkhuge(entry);
2816d9ed9faaSChris Metcalf 	entry = arch_make_huge_pte(entry, vma, page, writable);
281763551ae0SDavid Gibson 
281863551ae0SDavid Gibson 	return entry;
281963551ae0SDavid Gibson }
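
/*
 * Summary of make_huge_pte() above: the entry is built from the page and
 * the VMA's protection bits, marked dirty and writable for writable
 * mappings and write-protected otherwise, always made young and huge,
 * and finally passed through arch_make_huge_pte() so an architecture can
 * adjust the hardware encoding.
 */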
282063551ae0SDavid Gibson 
28211e8f889bSDavid Gibson static void set_huge_ptep_writable(struct vm_area_struct *vma,
28221e8f889bSDavid Gibson 				   unsigned long address, pte_t *ptep)
28231e8f889bSDavid Gibson {
28241e8f889bSDavid Gibson 	pte_t entry;
28251e8f889bSDavid Gibson 
2826106c992aSGerald Schaefer 	entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
282732f84528SChris Forbes 	if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
28284b3073e1SRussell King 		update_mmu_cache(vma, address, ptep);
28291e8f889bSDavid Gibson }
28301e8f889bSDavid Gibson 
28314a705fefSNaoya Horiguchi static int is_hugetlb_entry_migration(pte_t pte)
28324a705fefSNaoya Horiguchi {
28334a705fefSNaoya Horiguchi 	swp_entry_t swp;
28344a705fefSNaoya Horiguchi 
28354a705fefSNaoya Horiguchi 	if (huge_pte_none(pte) || pte_present(pte))
28364a705fefSNaoya Horiguchi 		return 0;
28374a705fefSNaoya Horiguchi 	swp = pte_to_swp_entry(pte);
28384a705fefSNaoya Horiguchi 	if (non_swap_entry(swp) && is_migration_entry(swp))
28394a705fefSNaoya Horiguchi 		return 1;
28404a705fefSNaoya Horiguchi 	else
28414a705fefSNaoya Horiguchi 		return 0;
28424a705fefSNaoya Horiguchi }
28434a705fefSNaoya Horiguchi 
28444a705fefSNaoya Horiguchi static int is_hugetlb_entry_hwpoisoned(pte_t pte)
28454a705fefSNaoya Horiguchi {
28464a705fefSNaoya Horiguchi 	swp_entry_t swp;
28474a705fefSNaoya Horiguchi 
28484a705fefSNaoya Horiguchi 	if (huge_pte_none(pte) || pte_present(pte))
28494a705fefSNaoya Horiguchi 		return 0;
28504a705fefSNaoya Horiguchi 	swp = pte_to_swp_entry(pte);
28514a705fefSNaoya Horiguchi 	if (non_swap_entry(swp) && is_hwpoison_entry(swp))
28524a705fefSNaoya Horiguchi 		return 1;
28534a705fefSNaoya Horiguchi 	else
28544a705fefSNaoya Horiguchi 		return 0;
28554a705fefSNaoya Horiguchi }
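
/*
 * Both helpers above treat a PTE that is neither none nor present as a
 * swap-format entry and check its type: one detects huge pages in the
 * middle of migration, the other huge pages poisoned by a memory error.
 * copy_hugetlb_page_range() below uses them to copy such entries
 * verbatim instead of taking page references.
 */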
28561e8f889bSDavid Gibson 
285763551ae0SDavid Gibson int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
285863551ae0SDavid Gibson 			    struct vm_area_struct *vma)
285963551ae0SDavid Gibson {
286063551ae0SDavid Gibson 	pte_t *src_pte, *dst_pte, entry;
286163551ae0SDavid Gibson 	struct page *ptepage;
28621c59827dSHugh Dickins 	unsigned long addr;
28631e8f889bSDavid Gibson 	int cow;
2864a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
2865a5516438SAndi Kleen 	unsigned long sz = huge_page_size(h);
2866e8569dd2SAndreas Sandberg 	unsigned long mmun_start;	/* For mmu_notifiers */
2867e8569dd2SAndreas Sandberg 	unsigned long mmun_end;		/* For mmu_notifiers */
2868e8569dd2SAndreas Sandberg 	int ret = 0;
28691e8f889bSDavid Gibson 
28701e8f889bSDavid Gibson 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
287163551ae0SDavid Gibson 
2872e8569dd2SAndreas Sandberg 	mmun_start = vma->vm_start;
2873e8569dd2SAndreas Sandberg 	mmun_end = vma->vm_end;
2874e8569dd2SAndreas Sandberg 	if (cow)
2875e8569dd2SAndreas Sandberg 		mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
2876e8569dd2SAndreas Sandberg 
2877a5516438SAndi Kleen 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
2878cb900f41SKirill A. Shutemov 		spinlock_t *src_ptl, *dst_ptl;
2879c74df32cSHugh Dickins 		src_pte = huge_pte_offset(src, addr);
2880c74df32cSHugh Dickins 		if (!src_pte)
2881c74df32cSHugh Dickins 			continue;
2882a5516438SAndi Kleen 		dst_pte = huge_pte_alloc(dst, addr, sz);
2883e8569dd2SAndreas Sandberg 		if (!dst_pte) {
2884e8569dd2SAndreas Sandberg 			ret = -ENOMEM;
2885e8569dd2SAndreas Sandberg 			break;
2886e8569dd2SAndreas Sandberg 		}
2887c5c99429SLarry Woodman 
2888c5c99429SLarry Woodman 		/* If the pagetables are shared don't copy or take references */
2889c5c99429SLarry Woodman 		if (dst_pte == src_pte)
2890c5c99429SLarry Woodman 			continue;
2891c5c99429SLarry Woodman 
2892cb900f41SKirill A. Shutemov 		dst_ptl = huge_pte_lock(h, dst, dst_pte);
2893cb900f41SKirill A. Shutemov 		src_ptl = huge_pte_lockptr(h, src, src_pte);
2894cb900f41SKirill A. Shutemov 		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
28954a705fefSNaoya Horiguchi 		entry = huge_ptep_get(src_pte);
28964a705fefSNaoya Horiguchi 		if (huge_pte_none(entry)) { /* skip none entry */
28974a705fefSNaoya Horiguchi 			;
28984a705fefSNaoya Horiguchi 		} else if (unlikely(is_hugetlb_entry_migration(entry) ||
28994a705fefSNaoya Horiguchi 				    is_hugetlb_entry_hwpoisoned(entry))) {
29004a705fefSNaoya Horiguchi 			swp_entry_t swp_entry = pte_to_swp_entry(entry);
29014a705fefSNaoya Horiguchi 
29024a705fefSNaoya Horiguchi 			if (is_write_migration_entry(swp_entry) && cow) {
29034a705fefSNaoya Horiguchi 				/*
29044a705fefSNaoya Horiguchi 				 * COW mappings require pages in both
29054a705fefSNaoya Horiguchi 				 * parent and child to be set to read.
29064a705fefSNaoya Horiguchi 				 */
29074a705fefSNaoya Horiguchi 				make_migration_entry_read(&swp_entry);
29084a705fefSNaoya Horiguchi 				entry = swp_entry_to_pte(swp_entry);
29094a705fefSNaoya Horiguchi 				set_huge_pte_at(src, addr, src_pte, entry);
29104a705fefSNaoya Horiguchi 			}
29114a705fefSNaoya Horiguchi 			set_huge_pte_at(dst, addr, dst_pte, entry);
29124a705fefSNaoya Horiguchi 		} else {
291334ee645eSJoerg Roedel 			if (cow) {
29147f2e9525SGerald Schaefer 				huge_ptep_set_wrprotect(src, addr, src_pte);
291534ee645eSJoerg Roedel 				mmu_notifier_invalidate_range(src, mmun_start,
291634ee645eSJoerg Roedel 								   mmun_end);
291734ee645eSJoerg Roedel 			}
29180253d634SNaoya Horiguchi 			entry = huge_ptep_get(src_pte);
291963551ae0SDavid Gibson 			ptepage = pte_page(entry);
292063551ae0SDavid Gibson 			get_page(ptepage);
29210fe6e20bSNaoya Horiguchi 			page_dup_rmap(ptepage);
292263551ae0SDavid Gibson 			set_huge_pte_at(dst, addr, dst_pte, entry);
29231c59827dSHugh Dickins 		}
2924cb900f41SKirill A. Shutemov 		spin_unlock(src_ptl);
2925cb900f41SKirill A. Shutemov 		spin_unlock(dst_ptl);
292663551ae0SDavid Gibson 	}
292763551ae0SDavid Gibson 
2928e8569dd2SAndreas Sandberg 	if (cow)
2929e8569dd2SAndreas Sandberg 		mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
2930e8569dd2SAndreas Sandberg 
2931e8569dd2SAndreas Sandberg 	return ret;
293263551ae0SDavid Gibson }
293363551ae0SDavid Gibson 
293424669e58SAneesh Kumar K.V void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
293524669e58SAneesh Kumar K.V 			    unsigned long start, unsigned long end,
293624669e58SAneesh Kumar K.V 			    struct page *ref_page)
293763551ae0SDavid Gibson {
293824669e58SAneesh Kumar K.V 	int force_flush = 0;
293963551ae0SDavid Gibson 	struct mm_struct *mm = vma->vm_mm;
294063551ae0SDavid Gibson 	unsigned long address;
2941c7546f8fSDavid Gibson 	pte_t *ptep;
294263551ae0SDavid Gibson 	pte_t pte;
2943cb900f41SKirill A. Shutemov 	spinlock_t *ptl;
294463551ae0SDavid Gibson 	struct page *page;
2945a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
2946a5516438SAndi Kleen 	unsigned long sz = huge_page_size(h);
29472ec74c3eSSagi Grimberg 	const unsigned long mmun_start = start;	/* For mmu_notifiers */
29482ec74c3eSSagi Grimberg 	const unsigned long mmun_end   = end;	/* For mmu_notifiers */
2949a5516438SAndi Kleen 
295063551ae0SDavid Gibson 	WARN_ON(!is_vm_hugetlb_page(vma));
2951a5516438SAndi Kleen 	BUG_ON(start & ~huge_page_mask(h));
2952a5516438SAndi Kleen 	BUG_ON(end & ~huge_page_mask(h));
295363551ae0SDavid Gibson 
295424669e58SAneesh Kumar K.V 	tlb_start_vma(tlb, vma);
29552ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2956569f48b8SHillf Danton 	address = start;
295724669e58SAneesh Kumar K.V again:
2958569f48b8SHillf Danton 	for (; address < end; address += sz) {
2959c7546f8fSDavid Gibson 		ptep = huge_pte_offset(mm, address);
2960c7546f8fSDavid Gibson 		if (!ptep)
2961c7546f8fSDavid Gibson 			continue;
2962c7546f8fSDavid Gibson 
2963cb900f41SKirill A. Shutemov 		ptl = huge_pte_lock(h, mm, ptep);
296439dde65cSChen, Kenneth W 		if (huge_pmd_unshare(mm, &address, ptep))
2965cb900f41SKirill A. Shutemov 			goto unlock;
296639dde65cSChen, Kenneth W 
29676629326bSHillf Danton 		pte = huge_ptep_get(ptep);
29686629326bSHillf Danton 		if (huge_pte_none(pte))
2969cb900f41SKirill A. Shutemov 			goto unlock;
29706629326bSHillf Danton 
29716629326bSHillf Danton 		/*
29729fbc1f63SNaoya Horiguchi 		 * A migrating or HWPoisoned hugepage is already
29739fbc1f63SNaoya Horiguchi 		 * unmapped and its refcount has been dropped, so just clear the pte here.
29746629326bSHillf Danton 		 */
29759fbc1f63SNaoya Horiguchi 		if (unlikely(!pte_present(pte))) {
2976106c992aSGerald Schaefer 			huge_pte_clear(mm, address, ptep);
2977cb900f41SKirill A. Shutemov 			goto unlock;
29788c4894c6SNaoya Horiguchi 		}
29796629326bSHillf Danton 
29806629326bSHillf Danton 		page = pte_page(pte);
298104f2cbe3SMel Gorman 		/*
298204f2cbe3SMel Gorman 		 * If a reference page is supplied, it is because a specific
298304f2cbe3SMel Gorman 		 * page is being unmapped, not a range. Ensure the page we
298404f2cbe3SMel Gorman 		 * are about to unmap is the actual page of interest.
298504f2cbe3SMel Gorman 		 */
298604f2cbe3SMel Gorman 		if (ref_page) {
298704f2cbe3SMel Gorman 			if (page != ref_page)
2988cb900f41SKirill A. Shutemov 				goto unlock;
298904f2cbe3SMel Gorman 
299004f2cbe3SMel Gorman 			/*
299104f2cbe3SMel Gorman 			 * Mark the VMA as having unmapped its page so that
299204f2cbe3SMel Gorman 			 * future faults in this VMA will fail rather than
299304f2cbe3SMel Gorman 			 * looking like data was lost
299404f2cbe3SMel Gorman 			 */
299504f2cbe3SMel Gorman 			set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
299604f2cbe3SMel Gorman 		}
299704f2cbe3SMel Gorman 
2998c7546f8fSDavid Gibson 		pte = huge_ptep_get_and_clear(mm, address, ptep);
299924669e58SAneesh Kumar K.V 		tlb_remove_tlb_entry(tlb, ptep, address);
3000106c992aSGerald Schaefer 		if (huge_pte_dirty(pte))
30016649a386SKen Chen 			set_page_dirty(page);
30029e81130bSHillf Danton 
300324669e58SAneesh Kumar K.V 		page_remove_rmap(page);
300424669e58SAneesh Kumar K.V 		force_flush = !__tlb_remove_page(tlb, page);
3005cb900f41SKirill A. Shutemov 		if (force_flush) {
3006569f48b8SHillf Danton 			address += sz;
3007cb900f41SKirill A. Shutemov 			spin_unlock(ptl);
30089e81130bSHillf Danton 			break;
300963551ae0SDavid Gibson 		}
3010cb900f41SKirill A. Shutemov 		/* Bail out after unmapping reference page if supplied */
3011cb900f41SKirill A. Shutemov 		if (ref_page) {
3012cb900f41SKirill A. Shutemov 			spin_unlock(ptl);
3013cb900f41SKirill A. Shutemov 			break;
3014cb900f41SKirill A. Shutemov 		}
3015cb900f41SKirill A. Shutemov unlock:
3016cb900f41SKirill A. Shutemov 		spin_unlock(ptl);
3017cb900f41SKirill A. Shutemov 	}
301824669e58SAneesh Kumar K.V 	/*
301924669e58SAneesh Kumar K.V 	 * mmu_gather ran out of room to batch pages; we break out of
302024669e58SAneesh Kumar K.V 	 * the PTE lock to avoid doing the potentially expensive TLB invalidate
302124669e58SAneesh Kumar K.V 	 * and page free while holding it.
302224669e58SAneesh Kumar K.V 	 */
302324669e58SAneesh Kumar K.V 	if (force_flush) {
302424669e58SAneesh Kumar K.V 		force_flush = 0;
302524669e58SAneesh Kumar K.V 		tlb_flush_mmu(tlb);
302624669e58SAneesh Kumar K.V 		if (address < end && !ref_page)
302724669e58SAneesh Kumar K.V 			goto again;
3028fe1668aeSChen, Kenneth W 	}
30292ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
303024669e58SAneesh Kumar K.V 	tlb_end_vma(tlb, vma);
30311da177e4SLinus Torvalds }
303263551ae0SDavid Gibson 
3033d833352aSMel Gorman void __unmap_hugepage_range_final(struct mmu_gather *tlb,
3034d833352aSMel Gorman 			  struct vm_area_struct *vma, unsigned long start,
3035d833352aSMel Gorman 			  unsigned long end, struct page *ref_page)
3036d833352aSMel Gorman {
3037d833352aSMel Gorman 	__unmap_hugepage_range(tlb, vma, start, end, ref_page);
3038d833352aSMel Gorman 
3039d833352aSMel Gorman 	/*
3040d833352aSMel Gorman 	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
3041d833352aSMel Gorman 	 * test will fail on a vma being torn down, and not grab a page table
3042d833352aSMel Gorman 	 * on its way out.  We're lucky that the flag has such an appropriate
3043d833352aSMel Gorman 	 * name, and can in fact be safely cleared here. We could clear it
3044d833352aSMel Gorman 	 * before the __unmap_hugepage_range above, but all that's necessary
3045c8c06efaSDavidlohr Bueso 	 * is to clear it before releasing the i_mmap_rwsem. This works
3046d833352aSMel Gorman 	 * because in the context in which this is called, the VMA is about to be
3047c8c06efaSDavidlohr Bueso 	 * destroyed and the i_mmap_rwsem is held.
3048d833352aSMel Gorman 	 */
3049d833352aSMel Gorman 	vma->vm_flags &= ~VM_MAYSHARE;
3050d833352aSMel Gorman }
3051d833352aSMel Gorman 
3052502717f4SChen, Kenneth W void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
305304f2cbe3SMel Gorman 			  unsigned long end, struct page *ref_page)
3054502717f4SChen, Kenneth W {
305524669e58SAneesh Kumar K.V 	struct mm_struct *mm;
305624669e58SAneesh Kumar K.V 	struct mmu_gather tlb;
305724669e58SAneesh Kumar K.V 
305824669e58SAneesh Kumar K.V 	mm = vma->vm_mm;
305924669e58SAneesh Kumar K.V 
30602b047252SLinus Torvalds 	tlb_gather_mmu(&tlb, mm, start, end);
306124669e58SAneesh Kumar K.V 	__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
306224669e58SAneesh Kumar K.V 	tlb_finish_mmu(&tlb, start, end);
3063502717f4SChen, Kenneth W }
3064502717f4SChen, Kenneth W 
306504f2cbe3SMel Gorman /*
306604f2cbe3SMel Gorman  * This is called when the original mapper is failing to COW a MAP_PRIVATE
306704f2cbe3SMel Gorman  * mapping it owns the reserve page for. The intention is to unmap the page
306804f2cbe3SMel Gorman  * from other VMAs and let the children be SIGKILLed if they are faulting the
306904f2cbe3SMel Gorman  * same region.
307004f2cbe3SMel Gorman  */
30712f4612afSDavidlohr Bueso static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
30722a4b3dedSHarvey Harrison 			      struct page *page, unsigned long address)
307304f2cbe3SMel Gorman {
30747526674dSAdam Litke 	struct hstate *h = hstate_vma(vma);
307504f2cbe3SMel Gorman 	struct vm_area_struct *iter_vma;
307604f2cbe3SMel Gorman 	struct address_space *mapping;
307704f2cbe3SMel Gorman 	pgoff_t pgoff;
307804f2cbe3SMel Gorman 
307904f2cbe3SMel Gorman 	/*
308004f2cbe3SMel Gorman 	 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
308104f2cbe3SMel Gorman 	 * from page cache lookup which is in HPAGE_SIZE units.
308204f2cbe3SMel Gorman 	 */
30837526674dSAdam Litke 	address = address & huge_page_mask(h);
308436e4f20aSMichal Hocko 	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
308536e4f20aSMichal Hocko 			vma->vm_pgoff;
3086496ad9aaSAl Viro 	mapping = file_inode(vma->vm_file)->i_mapping;
308704f2cbe3SMel Gorman 
30884eb2b1dcSMel Gorman 	/*
30894eb2b1dcSMel Gorman 	 * Take the mapping lock for the duration of the table walk. As
30904eb2b1dcSMel Gorman 	 * this mapping should be shared between all the VMAs,
30914eb2b1dcSMel Gorman 	 * __unmap_hugepage_range() is called as the lock is already held
30924eb2b1dcSMel Gorman 	 */
309383cde9e8SDavidlohr Bueso 	i_mmap_lock_write(mapping);
30946b2dbba8SMichel Lespinasse 	vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
309504f2cbe3SMel Gorman 		/* Do not unmap the current VMA */
309604f2cbe3SMel Gorman 		if (iter_vma == vma)
309704f2cbe3SMel Gorman 			continue;
309804f2cbe3SMel Gorman 
309904f2cbe3SMel Gorman 		/*
310004f2cbe3SMel Gorman 		 * Unmap the page from other VMAs without their own reserves.
310104f2cbe3SMel Gorman 		 * They get marked to be SIGKILLed if they fault in these
310204f2cbe3SMel Gorman 		 * areas. This is because a future no-page fault on this VMA
310304f2cbe3SMel Gorman 		 * could insert a zeroed page instead of the data existing
310404f2cbe3SMel Gorman 		 * from the time of fork. This would look like data corruption.
310504f2cbe3SMel Gorman 		 */
310604f2cbe3SMel Gorman 		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
310724669e58SAneesh Kumar K.V 			unmap_hugepage_range(iter_vma, address,
310824669e58SAneesh Kumar K.V 					     address + huge_page_size(h), page);
310904f2cbe3SMel Gorman 	}
311083cde9e8SDavidlohr Bueso 	i_mmap_unlock_write(mapping);
311104f2cbe3SMel Gorman }
311204f2cbe3SMel Gorman 
31130fe6e20bSNaoya Horiguchi /*
31140fe6e20bSNaoya Horiguchi  * hugetlb_cow() should be called with the page lock of the original hugepage held.
3115ef009b25SMichal Hocko  * Called with hugetlb_instantiation_mutex held and pte_page locked so we
3116ef009b25SMichal Hocko  * cannot race with other handlers or page migration.
3117ef009b25SMichal Hocko  * Keep the pte_same checks anyway to make transition from the mutex easier.
31180fe6e20bSNaoya Horiguchi  */
31191e8f889bSDavid Gibson static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
312004f2cbe3SMel Gorman 			unsigned long address, pte_t *ptep, pte_t pte,
3121cb900f41SKirill A. Shutemov 			struct page *pagecache_page, spinlock_t *ptl)
31221e8f889bSDavid Gibson {
3123a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
31241e8f889bSDavid Gibson 	struct page *old_page, *new_page;
3125ad4404a2SDavidlohr Bueso 	int ret = 0, outside_reserve = 0;
31262ec74c3eSSagi Grimberg 	unsigned long mmun_start;	/* For mmu_notifiers */
31272ec74c3eSSagi Grimberg 	unsigned long mmun_end;		/* For mmu_notifiers */
31281e8f889bSDavid Gibson 
31291e8f889bSDavid Gibson 	old_page = pte_page(pte);
31301e8f889bSDavid Gibson 
313104f2cbe3SMel Gorman retry_avoidcopy:
31321e8f889bSDavid Gibson 	/* If no-one else is actually using this page, avoid the copy
31331e8f889bSDavid Gibson 	 * and just make the page writable */
313437a2140dSJoonsoo Kim 	if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
31350fe6e20bSNaoya Horiguchi 		page_move_anon_rmap(old_page, vma, address);
31361e8f889bSDavid Gibson 		set_huge_ptep_writable(vma, address, ptep);
313783c54070SNick Piggin 		return 0;
31381e8f889bSDavid Gibson 	}
31391e8f889bSDavid Gibson 
314004f2cbe3SMel Gorman 	/*
314104f2cbe3SMel Gorman 	 * If the process that created a MAP_PRIVATE mapping is about to
314204f2cbe3SMel Gorman 	 * perform a COW due to a shared page count, attempt to satisfy
314304f2cbe3SMel Gorman 	 * the allocation without using the existing reserves. The pagecache
314404f2cbe3SMel Gorman 	 * page is used to determine if the reserve at this address was
314504f2cbe3SMel Gorman 	 * consumed or not. If reserves were used, a partial faulted mapping
314604f2cbe3SMel Gorman 	 * at the time of fork() could consume its reserves on COW instead
314704f2cbe3SMel Gorman 	 * of the full address range.
314804f2cbe3SMel Gorman 	 */
31495944d011SJoonsoo Kim 	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
315004f2cbe3SMel Gorman 			old_page != pagecache_page)
315104f2cbe3SMel Gorman 		outside_reserve = 1;
315204f2cbe3SMel Gorman 
31531e8f889bSDavid Gibson 	page_cache_get(old_page);
3154b76c8cfbSLarry Woodman 
3155ad4404a2SDavidlohr Bueso 	/*
3156ad4404a2SDavidlohr Bueso 	 * Drop page table lock as buddy allocator may be called. It will
3157ad4404a2SDavidlohr Bueso 	 * be acquired again before returning to the caller, as expected.
3158ad4404a2SDavidlohr Bueso 	 */
3159cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
316004f2cbe3SMel Gorman 	new_page = alloc_huge_page(vma, address, outside_reserve);
31611e8f889bSDavid Gibson 
31622fc39cecSAdam Litke 	if (IS_ERR(new_page)) {
316304f2cbe3SMel Gorman 		/*
316404f2cbe3SMel Gorman 		 * If a process owning a MAP_PRIVATE mapping fails to COW,
316504f2cbe3SMel Gorman 		 * it is due to references held by a child and an insufficient
316604f2cbe3SMel Gorman 		 * huge page pool. To guarantee the original mapper's
316704f2cbe3SMel Gorman 		 * reliability, unmap the page from child processes. The child
316804f2cbe3SMel Gorman 		 * may get SIGKILLed if it later faults.
316904f2cbe3SMel Gorman 		 */
317004f2cbe3SMel Gorman 		if (outside_reserve) {
3171ad4404a2SDavidlohr Bueso 			page_cache_release(old_page);
317204f2cbe3SMel Gorman 			BUG_ON(huge_pte_none(pte));
31732f4612afSDavidlohr Bueso 			unmap_ref_private(mm, vma, old_page, address);
317404f2cbe3SMel Gorman 			BUG_ON(huge_pte_none(pte));
3175cb900f41SKirill A. Shutemov 			spin_lock(ptl);
3176a734bcc8SHillf Danton 			ptep = huge_pte_offset(mm, address & huge_page_mask(h));
3177a9af0c5dSNaoya Horiguchi 			if (likely(ptep &&
3178a9af0c5dSNaoya Horiguchi 				   pte_same(huge_ptep_get(ptep), pte)))
317904f2cbe3SMel Gorman 				goto retry_avoidcopy;
3180a734bcc8SHillf Danton 			/*
3181cb900f41SKirill A. Shutemov 			 * A race occurred while re-acquiring the page
3182cb900f41SKirill A. Shutemov 			 * table lock, and our job is done.
3183a734bcc8SHillf Danton 			 */
3184a734bcc8SHillf Danton 			return 0;
318504f2cbe3SMel Gorman 		}
318604f2cbe3SMel Gorman 
3187ad4404a2SDavidlohr Bueso 		ret = (PTR_ERR(new_page) == -ENOMEM) ?
3188ad4404a2SDavidlohr Bueso 			VM_FAULT_OOM : VM_FAULT_SIGBUS;
3189ad4404a2SDavidlohr Bueso 		goto out_release_old;
31901e8f889bSDavid Gibson 	}
31911e8f889bSDavid Gibson 
31920fe6e20bSNaoya Horiguchi 	/*
31930fe6e20bSNaoya Horiguchi 	 * When the original hugepage is a shared one, it does not have
31940fe6e20bSNaoya Horiguchi 	 * an anon_vma prepared.
31950fe6e20bSNaoya Horiguchi 	 */
319644e2aa93SDean Nelson 	if (unlikely(anon_vma_prepare(vma))) {
3197ad4404a2SDavidlohr Bueso 		ret = VM_FAULT_OOM;
3198ad4404a2SDavidlohr Bueso 		goto out_release_all;
319944e2aa93SDean Nelson 	}
32000fe6e20bSNaoya Horiguchi 
320147ad8475SAndrea Arcangeli 	copy_user_huge_page(new_page, old_page, address, vma,
320247ad8475SAndrea Arcangeli 			    pages_per_huge_page(h));
32030ed361deSNick Piggin 	__SetPageUptodate(new_page);
3204bcc54222SNaoya Horiguchi 	set_page_huge_active(new_page);
32051e8f889bSDavid Gibson 
32062ec74c3eSSagi Grimberg 	mmun_start = address & huge_page_mask(h);
32072ec74c3eSSagi Grimberg 	mmun_end = mmun_start + huge_page_size(h);
32082ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
3209ad4404a2SDavidlohr Bueso 
3210b76c8cfbSLarry Woodman 	/*
3211cb900f41SKirill A. Shutemov 	 * Retake the page table lock to check for racing updates
3212b76c8cfbSLarry Woodman 	 * before the page tables are altered
3213b76c8cfbSLarry Woodman 	 */
3214cb900f41SKirill A. Shutemov 	spin_lock(ptl);
3215a5516438SAndi Kleen 	ptep = huge_pte_offset(mm, address & huge_page_mask(h));
3216a9af0c5dSNaoya Horiguchi 	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
321707443a85SJoonsoo Kim 		ClearPagePrivate(new_page);
321807443a85SJoonsoo Kim 
32191e8f889bSDavid Gibson 		/* Break COW */
32208fe627ecSGerald Schaefer 		huge_ptep_clear_flush(vma, address, ptep);
322134ee645eSJoerg Roedel 		mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
32221e8f889bSDavid Gibson 		set_huge_pte_at(mm, address, ptep,
32231e8f889bSDavid Gibson 				make_huge_pte(vma, new_page, 1));
32240fe6e20bSNaoya Horiguchi 		page_remove_rmap(old_page);
3225cd67f0d2SNaoya Horiguchi 		hugepage_add_new_anon_rmap(new_page, vma, address);
32261e8f889bSDavid Gibson 		/* Make the old page be freed below */
32271e8f889bSDavid Gibson 		new_page = old_page;
32281e8f889bSDavid Gibson 	}
3229cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
32302ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
3231ad4404a2SDavidlohr Bueso out_release_all:
32321e8f889bSDavid Gibson 	page_cache_release(new_page);
3233ad4404a2SDavidlohr Bueso out_release_old:
32341e8f889bSDavid Gibson 	page_cache_release(old_page);
32358312034fSJoonsoo Kim 
3236ad4404a2SDavidlohr Bueso 	spin_lock(ptl); /* Caller expects lock to be held */
3237ad4404a2SDavidlohr Bueso 	return ret;
32381e8f889bSDavid Gibson }
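/*
 * Illustrative user-space sketch (not part of this file) of the path that
 * hugetlb_cow() above services: a MAP_PRIVATE hugetlb mapping is written by
 * the parent, then by the child after fork(), which forces a copy of the
 * huge page.  The 2MB page size, MAP_HUGETLB availability and a non-empty
 * pool (vm.nr_hugepages) are assumptions; with an exhausted pool the child
 * may be killed instead, matching the outside_reserve handling above.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#define LEN (2UL * 1024 * 1024)		/* one huge page, assuming a 2MB default */

int main(void)
{
	char *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");		/* empty pool or no MAP_HUGETLB support */
		return 1;
	}
	memset(p, 'a', LEN);		/* parent faults the page in and owns the reserve */

	if (fork() == 0) {
		/*
		 * The child's first store is a write fault on a read-only
		 * huge PTE, i.e. a call into hugetlb_cow(): a second huge
		 * page must be allocated and the old contents copied.
		 */
		p[0] = 'b';
		_exit(0);
	}
	wait(NULL);
	printf("parent still sees '%c'\n", p[0]);	/* prints 'a' */
	return 0;
}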
32391e8f889bSDavid Gibson 
324004f2cbe3SMel Gorman /* Return the pagecache page at a given address within a VMA */
3241a5516438SAndi Kleen static struct page *hugetlbfs_pagecache_page(struct hstate *h,
3242a5516438SAndi Kleen 			struct vm_area_struct *vma, unsigned long address)
324304f2cbe3SMel Gorman {
324404f2cbe3SMel Gorman 	struct address_space *mapping;
3245e7c4b0bfSAndy Whitcroft 	pgoff_t idx;
324604f2cbe3SMel Gorman 
324704f2cbe3SMel Gorman 	mapping = vma->vm_file->f_mapping;
3248a5516438SAndi Kleen 	idx = vma_hugecache_offset(h, vma, address);
324904f2cbe3SMel Gorman 
325004f2cbe3SMel Gorman 	return find_lock_page(mapping, idx);
325104f2cbe3SMel Gorman }
325204f2cbe3SMel Gorman 
32533ae77f43SHugh Dickins /*
32543ae77f43SHugh Dickins  * Return whether there is a pagecache page to back the given address within the VMA.
32553ae77f43SHugh Dickins  * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
32563ae77f43SHugh Dickins  */
32573ae77f43SHugh Dickins static bool hugetlbfs_pagecache_present(struct hstate *h,
32582a15efc9SHugh Dickins 			struct vm_area_struct *vma, unsigned long address)
32592a15efc9SHugh Dickins {
32602a15efc9SHugh Dickins 	struct address_space *mapping;
32612a15efc9SHugh Dickins 	pgoff_t idx;
32622a15efc9SHugh Dickins 	struct page *page;
32632a15efc9SHugh Dickins 
32642a15efc9SHugh Dickins 	mapping = vma->vm_file->f_mapping;
32652a15efc9SHugh Dickins 	idx = vma_hugecache_offset(h, vma, address);
32662a15efc9SHugh Dickins 
32672a15efc9SHugh Dickins 	page = find_get_page(mapping, idx);
32682a15efc9SHugh Dickins 	if (page)
32692a15efc9SHugh Dickins 		put_page(page);
32702a15efc9SHugh Dickins 	return page != NULL;
32712a15efc9SHugh Dickins }
32722a15efc9SHugh Dickins 
3273a1ed3ddaSRobert P. J. Day static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
32748382d914SDavidlohr Bueso 			   struct address_space *mapping, pgoff_t idx,
3275788c7df4SHugh Dickins 			   unsigned long address, pte_t *ptep, unsigned int flags)
3276ac9b9c66SHugh Dickins {
3277a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
3278ac9b9c66SHugh Dickins 	int ret = VM_FAULT_SIGBUS;
3279409eb8c2SHillf Danton 	int anon_rmap = 0;
32804c887265SAdam Litke 	unsigned long size;
32814c887265SAdam Litke 	struct page *page;
32821e8f889bSDavid Gibson 	pte_t new_pte;
3283cb900f41SKirill A. Shutemov 	spinlock_t *ptl;
32844c887265SAdam Litke 
328504f2cbe3SMel Gorman 	/*
328604f2cbe3SMel Gorman 	 * Currently, we are forced to kill the process in the event the
328704f2cbe3SMel Gorman 	 * original mapper has unmapped pages from the child due to a failed
328825985edcSLucas De Marchi 	 * COW. Warn that such a situation has occurred as it may not be obvious.
328904f2cbe3SMel Gorman 	 */
329004f2cbe3SMel Gorman 	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
3291ffb22af5SAndrew Morton 		pr_warning("PID %d killed due to inadequate hugepage pool\n",
329204f2cbe3SMel Gorman 			   current->pid);
329304f2cbe3SMel Gorman 		return ret;
329404f2cbe3SMel Gorman 	}
329504f2cbe3SMel Gorman 
32964c887265SAdam Litke 	/*
32974c887265SAdam Litke 	 * Use page lock to guard against racing truncation
32984c887265SAdam Litke 	 * before we get page_table_lock.
32994c887265SAdam Litke 	 */
33006bda666aSChristoph Lameter retry:
33016bda666aSChristoph Lameter 	page = find_lock_page(mapping, idx);
33026bda666aSChristoph Lameter 	if (!page) {
3303a5516438SAndi Kleen 		size = i_size_read(mapping->host) >> huge_page_shift(h);
3304ebed4bfcSHugh Dickins 		if (idx >= size)
3305ebed4bfcSHugh Dickins 			goto out;
330604f2cbe3SMel Gorman 		page = alloc_huge_page(vma, address, 0);
33072fc39cecSAdam Litke 		if (IS_ERR(page)) {
330876dcee75SAneesh Kumar K.V 			ret = PTR_ERR(page);
330976dcee75SAneesh Kumar K.V 			if (ret == -ENOMEM)
331076dcee75SAneesh Kumar K.V 				ret = VM_FAULT_OOM;
331176dcee75SAneesh Kumar K.V 			else
331276dcee75SAneesh Kumar K.V 				ret = VM_FAULT_SIGBUS;
33136bda666aSChristoph Lameter 			goto out;
33146bda666aSChristoph Lameter 		}
331547ad8475SAndrea Arcangeli 		clear_huge_page(page, address, pages_per_huge_page(h));
33160ed361deSNick Piggin 		__SetPageUptodate(page);
3317bcc54222SNaoya Horiguchi 		set_page_huge_active(page);
3318ac9b9c66SHugh Dickins 
3319f83a275dSMel Gorman 		if (vma->vm_flags & VM_MAYSHARE) {
33206bda666aSChristoph Lameter 			int err;
332145c682a6SKen Chen 			struct inode *inode = mapping->host;
33226bda666aSChristoph Lameter 
33236bda666aSChristoph Lameter 			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
33246bda666aSChristoph Lameter 			if (err) {
33256bda666aSChristoph Lameter 				put_page(page);
33266bda666aSChristoph Lameter 				if (err == -EEXIST)
33276bda666aSChristoph Lameter 					goto retry;
33286bda666aSChristoph Lameter 				goto out;
33296bda666aSChristoph Lameter 			}
333007443a85SJoonsoo Kim 			ClearPagePrivate(page);
333145c682a6SKen Chen 
333245c682a6SKen Chen 			spin_lock(&inode->i_lock);
3333a5516438SAndi Kleen 			inode->i_blocks += blocks_per_huge_page(h);
333445c682a6SKen Chen 			spin_unlock(&inode->i_lock);
333523be7468SMel Gorman 		} else {
33366bda666aSChristoph Lameter 			lock_page(page);
33370fe6e20bSNaoya Horiguchi 			if (unlikely(anon_vma_prepare(vma))) {
33380fe6e20bSNaoya Horiguchi 				ret = VM_FAULT_OOM;
33390fe6e20bSNaoya Horiguchi 				goto backout_unlocked;
334023be7468SMel Gorman 			}
3341409eb8c2SHillf Danton 			anon_rmap = 1;
33420fe6e20bSNaoya Horiguchi 		}
33430fe6e20bSNaoya Horiguchi 	} else {
334457303d80SAndy Whitcroft 		/*
3345998b4382SNaoya Horiguchi 		 * If a memory error occurs between mmap() and fault, some processes
3346998b4382SNaoya Horiguchi 		 * don't have a hwpoisoned swap entry for the errored virtual address.
3347998b4382SNaoya Horiguchi 		 * So we need to block the hugepage fault with a PG_hwpoison bit check.
3348fd6a03edSNaoya Horiguchi 		 */
3349fd6a03edSNaoya Horiguchi 		if (unlikely(PageHWPoison(page))) {
3350aa50d3a7SAndi Kleen 			ret = VM_FAULT_HWPOISON |
3351972dc4deSAneesh Kumar K.V 				VM_FAULT_SET_HINDEX(hstate_index(h));
3352fd6a03edSNaoya Horiguchi 			goto backout_unlocked;
33536bda666aSChristoph Lameter 		}
3354998b4382SNaoya Horiguchi 	}
33551e8f889bSDavid Gibson 
335657303d80SAndy Whitcroft 	/*
335757303d80SAndy Whitcroft 	 * If we are going to COW a private mapping later, we examine the
335857303d80SAndy Whitcroft 	 * pending reservations for this page now. This will ensure that
335957303d80SAndy Whitcroft 	 * any allocations necessary to record that reservation occur outside
336057303d80SAndy Whitcroft 	 * the spinlock.
336157303d80SAndy Whitcroft 	 */
33625e911373SMike Kravetz 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
33632b26736cSAndy Whitcroft 		if (vma_needs_reservation(h, vma, address) < 0) {
33642b26736cSAndy Whitcroft 			ret = VM_FAULT_OOM;
33652b26736cSAndy Whitcroft 			goto backout_unlocked;
33662b26736cSAndy Whitcroft 		}
33675e911373SMike Kravetz 		/* Just decrements count, does not deallocate */
33685e911373SMike Kravetz 		vma_abort_reservation(h, vma, address);
33695e911373SMike Kravetz 	}
337057303d80SAndy Whitcroft 
3371cb900f41SKirill A. Shutemov 	ptl = huge_pte_lockptr(h, mm, ptep);
3372cb900f41SKirill A. Shutemov 	spin_lock(ptl);
3373a5516438SAndi Kleen 	size = i_size_read(mapping->host) >> huge_page_shift(h);
33744c887265SAdam Litke 	if (idx >= size)
33754c887265SAdam Litke 		goto backout;
33764c887265SAdam Litke 
337783c54070SNick Piggin 	ret = 0;
33787f2e9525SGerald Schaefer 	if (!huge_pte_none(huge_ptep_get(ptep)))
33794c887265SAdam Litke 		goto backout;
33804c887265SAdam Litke 
338107443a85SJoonsoo Kim 	if (anon_rmap) {
338207443a85SJoonsoo Kim 		ClearPagePrivate(page);
3383409eb8c2SHillf Danton 		hugepage_add_new_anon_rmap(page, vma, address);
3384ac714904SChoi Gi-yong 	} else
3385409eb8c2SHillf Danton 		page_dup_rmap(page);
33861e8f889bSDavid Gibson 	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
33871e8f889bSDavid Gibson 				&& (vma->vm_flags & VM_SHARED)));
33881e8f889bSDavid Gibson 	set_huge_pte_at(mm, address, ptep, new_pte);
33891e8f889bSDavid Gibson 
3390788c7df4SHugh Dickins 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
33911e8f889bSDavid Gibson 		/* Optimization, do the COW without a second fault */
3392cb900f41SKirill A. Shutemov 		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
33931e8f889bSDavid Gibson 	}
33941e8f889bSDavid Gibson 
3395cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
33964c887265SAdam Litke 	unlock_page(page);
33974c887265SAdam Litke out:
3398ac9b9c66SHugh Dickins 	return ret;
33994c887265SAdam Litke 
34004c887265SAdam Litke backout:
3401cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
34022b26736cSAndy Whitcroft backout_unlocked:
34034c887265SAdam Litke 	unlock_page(page);
34044c887265SAdam Litke 	put_page(page);
34054c887265SAdam Litke 	goto out;
3406ac9b9c66SHugh Dickins }
3407ac9b9c66SHugh Dickins 
34088382d914SDavidlohr Bueso #ifdef CONFIG_SMP
34098382d914SDavidlohr Bueso static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
34108382d914SDavidlohr Bueso 			    struct vm_area_struct *vma,
34118382d914SDavidlohr Bueso 			    struct address_space *mapping,
34128382d914SDavidlohr Bueso 			    pgoff_t idx, unsigned long address)
34138382d914SDavidlohr Bueso {
34148382d914SDavidlohr Bueso 	unsigned long key[2];
34158382d914SDavidlohr Bueso 	u32 hash;
34168382d914SDavidlohr Bueso 
34178382d914SDavidlohr Bueso 	if (vma->vm_flags & VM_SHARED) {
34188382d914SDavidlohr Bueso 		key[0] = (unsigned long) mapping;
34198382d914SDavidlohr Bueso 		key[1] = idx;
34208382d914SDavidlohr Bueso 	} else {
34218382d914SDavidlohr Bueso 		key[0] = (unsigned long) mm;
34228382d914SDavidlohr Bueso 		key[1] = address >> huge_page_shift(h);
34238382d914SDavidlohr Bueso 	}
34248382d914SDavidlohr Bueso 
34258382d914SDavidlohr Bueso 	hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
34268382d914SDavidlohr Bueso 
34278382d914SDavidlohr Bueso 	return hash & (num_fault_mutexes - 1);
34288382d914SDavidlohr Bueso }
34298382d914SDavidlohr Bueso #else
34308382d914SDavidlohr Bueso /*
34318382d914SDavidlohr Bueso  * For uniprocessor systems we always use a single mutex, so just
34328382d914SDavidlohr Bueso  * return 0 and avoid the hashing overhead.
34338382d914SDavidlohr Bueso  */
34348382d914SDavidlohr Bueso static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
34358382d914SDavidlohr Bueso 			    struct vm_area_struct *vma,
34368382d914SDavidlohr Bueso 			    struct address_space *mapping,
34378382d914SDavidlohr Bueso 			    pgoff_t idx, unsigned long address)
34388382d914SDavidlohr Bueso {
34398382d914SDavidlohr Bueso 	return 0;
34408382d914SDavidlohr Bueso }
34418382d914SDavidlohr Bueso #endif
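/*
 * Minimal user-space sketch of the two-key hashing scheme used by
 * fault_mutex_hash() above: shared mappings key on (mapping, page index),
 * private ones on (mm, huge-page number), and the result is masked with a
 * power-of-two table size.  jhash2() is kernel-internal, so a toy hash and
 * an assumed 64-entry table stand in here purely for illustration.
 */
#include <stdint.h>
#include <stdio.h>

#define NUM_FAULT_MUTEXES 64U		/* assumed power-of-two table size */

/* toy stand-in for jhash2(); not the kernel function */
static uint32_t toy_hash(const uint32_t *k, unsigned int n)
{
	uint32_t h = 0x9e3779b9u;

	while (n--)
		h = (h ^ *k++) * 0x01000193u;
	return h;
}

/*
 * Pass (mapping pointer, page index) for a VM_SHARED vma, or
 * (mm pointer, address >> huge_page_shift) for a private one.
 */
static uint32_t pick_fault_mutex(unsigned long key0, unsigned long key1)
{
	unsigned long key[2] = { key0, key1 };

	return toy_hash((uint32_t *)key, sizeof(key) / sizeof(uint32_t)) &
	       (NUM_FAULT_MUTEXES - 1);
}

int main(void)
{
	/* two faulting threads on the same (mapping, idx) pick the same slot */
	printf("slot=%u slot=%u\n",
	       pick_fault_mutex(0xffff888012345000UL, 42),
	       pick_fault_mutex(0xffff888012345000UL, 42));
	return 0;
}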
34428382d914SDavidlohr Bueso 
344386e5216fSAdam Litke int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3444788c7df4SHugh Dickins 			unsigned long address, unsigned int flags)
344586e5216fSAdam Litke {
34468382d914SDavidlohr Bueso 	pte_t *ptep, entry;
3447cb900f41SKirill A. Shutemov 	spinlock_t *ptl;
34481e8f889bSDavid Gibson 	int ret;
34498382d914SDavidlohr Bueso 	u32 hash;
34508382d914SDavidlohr Bueso 	pgoff_t idx;
34510fe6e20bSNaoya Horiguchi 	struct page *page = NULL;
345257303d80SAndy Whitcroft 	struct page *pagecache_page = NULL;
3453a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
34548382d914SDavidlohr Bueso 	struct address_space *mapping;
34550f792cf9SNaoya Horiguchi 	int need_wait_lock = 0;
345686e5216fSAdam Litke 
34571e16a539SKAMEZAWA Hiroyuki 	address &= huge_page_mask(h);
34581e16a539SKAMEZAWA Hiroyuki 
3459fd6a03edSNaoya Horiguchi 	ptep = huge_pte_offset(mm, address);
3460fd6a03edSNaoya Horiguchi 	if (ptep) {
3461fd6a03edSNaoya Horiguchi 		entry = huge_ptep_get(ptep);
3462290408d4SNaoya Horiguchi 		if (unlikely(is_hugetlb_entry_migration(entry))) {
3463cb900f41SKirill A. Shutemov 			migration_entry_wait_huge(vma, mm, ptep);
3464290408d4SNaoya Horiguchi 			return 0;
3465290408d4SNaoya Horiguchi 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
3466aa50d3a7SAndi Kleen 			return VM_FAULT_HWPOISON_LARGE |
3467972dc4deSAneesh Kumar K.V 				VM_FAULT_SET_HINDEX(hstate_index(h));
3468fd6a03edSNaoya Horiguchi 	}
3469fd6a03edSNaoya Horiguchi 
3470a5516438SAndi Kleen 	ptep = huge_pte_alloc(mm, address, huge_page_size(h));
347186e5216fSAdam Litke 	if (!ptep)
347286e5216fSAdam Litke 		return VM_FAULT_OOM;
347386e5216fSAdam Litke 
34748382d914SDavidlohr Bueso 	mapping = vma->vm_file->f_mapping;
34758382d914SDavidlohr Bueso 	idx = vma_hugecache_offset(h, vma, address);
34768382d914SDavidlohr Bueso 
34773935baa9SDavid Gibson 	/*
34783935baa9SDavid Gibson 	 * Serialize hugepage allocation and instantiation, so that we don't
34793935baa9SDavid Gibson 	 * get spurious allocation failures if two CPUs race to instantiate
34803935baa9SDavid Gibson 	 * the same page in the page cache.
34813935baa9SDavid Gibson 	 */
34828382d914SDavidlohr Bueso 	hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
34838382d914SDavidlohr Bueso 	mutex_lock(&htlb_fault_mutex_table[hash]);
34848382d914SDavidlohr Bueso 
34857f2e9525SGerald Schaefer 	entry = huge_ptep_get(ptep);
34867f2e9525SGerald Schaefer 	if (huge_pte_none(entry)) {
34878382d914SDavidlohr Bueso 		ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
3488b4d1d99fSDavid Gibson 		goto out_mutex;
34893935baa9SDavid Gibson 	}
349086e5216fSAdam Litke 
349183c54070SNick Piggin 	ret = 0;
34921e8f889bSDavid Gibson 
349357303d80SAndy Whitcroft 	/*
34940f792cf9SNaoya Horiguchi 	 * entry could be a migration/hwpoison entry at this point, so this
34950f792cf9SNaoya Horiguchi 	 * check prevents the kernel from going below assuming that we have
34960f792cf9SNaoya Horiguchi 	 * an active hugepage in pagecache. This goto expects the 2nd page fault,
34970f792cf9SNaoya Horiguchi 	 * and the is_hugetlb_entry_(migration|hwpoisoned) check will properly
34980f792cf9SNaoya Horiguchi 	 * handle it.
34990f792cf9SNaoya Horiguchi 	 */
35000f792cf9SNaoya Horiguchi 	if (!pte_present(entry))
35010f792cf9SNaoya Horiguchi 		goto out_mutex;
35020f792cf9SNaoya Horiguchi 
35030f792cf9SNaoya Horiguchi 	/*
350457303d80SAndy Whitcroft 	 * If we are going to COW the mapping later, we examine the pending
350557303d80SAndy Whitcroft 	 * reservations for this page now. This will ensure that any
350657303d80SAndy Whitcroft 	 * allocations necessary to record that reservation occur outside the
350757303d80SAndy Whitcroft 	 * spinlock. For private mappings, we also lookup the pagecache
350857303d80SAndy Whitcroft 	 * page now as it is used to determine if a reservation has been
350957303d80SAndy Whitcroft 	 * consumed.
351057303d80SAndy Whitcroft 	 */
3511106c992aSGerald Schaefer 	if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
35122b26736cSAndy Whitcroft 		if (vma_needs_reservation(h, vma, address) < 0) {
35132b26736cSAndy Whitcroft 			ret = VM_FAULT_OOM;
3514b4d1d99fSDavid Gibson 			goto out_mutex;
35152b26736cSAndy Whitcroft 		}
35165e911373SMike Kravetz 		/* Just decrements count, does not deallocate */
35175e911373SMike Kravetz 		vma_abort_reservation(h, vma, address);
351857303d80SAndy Whitcroft 
3519f83a275dSMel Gorman 		if (!(vma->vm_flags & VM_MAYSHARE))
352057303d80SAndy Whitcroft 			pagecache_page = hugetlbfs_pagecache_page(h,
352157303d80SAndy Whitcroft 								vma, address);
352257303d80SAndy Whitcroft 	}
352357303d80SAndy Whitcroft 
35240f792cf9SNaoya Horiguchi 	ptl = huge_pte_lock(h, mm, ptep);
35250fe6e20bSNaoya Horiguchi 
35261e8f889bSDavid Gibson 	/* Check for a racing update before calling hugetlb_cow */
3527b4d1d99fSDavid Gibson 	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
3528cb900f41SKirill A. Shutemov 		goto out_ptl;
3529b4d1d99fSDavid Gibson 
35300f792cf9SNaoya Horiguchi 	/*
35310f792cf9SNaoya Horiguchi 	 * hugetlb_cow() requires the page locks of pte_page(entry) and
35320f792cf9SNaoya Horiguchi 	 * pagecache_page, so here we need to take the former one
35330f792cf9SNaoya Horiguchi 	 * when page != pagecache_page or !pagecache_page.
35340f792cf9SNaoya Horiguchi 	 */
35350f792cf9SNaoya Horiguchi 	page = pte_page(entry);
35360f792cf9SNaoya Horiguchi 	if (page != pagecache_page)
35370f792cf9SNaoya Horiguchi 		if (!trylock_page(page)) {
35380f792cf9SNaoya Horiguchi 			need_wait_lock = 1;
35390f792cf9SNaoya Horiguchi 			goto out_ptl;
35400f792cf9SNaoya Horiguchi 		}
35410f792cf9SNaoya Horiguchi 
35420f792cf9SNaoya Horiguchi 	get_page(page);
3543b4d1d99fSDavid Gibson 
3544788c7df4SHugh Dickins 	if (flags & FAULT_FLAG_WRITE) {
3545106c992aSGerald Schaefer 		if (!huge_pte_write(entry)) {
354657303d80SAndy Whitcroft 			ret = hugetlb_cow(mm, vma, address, ptep, entry,
3547cb900f41SKirill A. Shutemov 					pagecache_page, ptl);
35480f792cf9SNaoya Horiguchi 			goto out_put_page;
3549b4d1d99fSDavid Gibson 		}
3550106c992aSGerald Schaefer 		entry = huge_pte_mkdirty(entry);
3551b4d1d99fSDavid Gibson 	}
3552b4d1d99fSDavid Gibson 	entry = pte_mkyoung(entry);
3553788c7df4SHugh Dickins 	if (huge_ptep_set_access_flags(vma, address, ptep, entry,
3554788c7df4SHugh Dickins 						flags & FAULT_FLAG_WRITE))
35554b3073e1SRussell King 		update_mmu_cache(vma, address, ptep);
35560f792cf9SNaoya Horiguchi out_put_page:
35570f792cf9SNaoya Horiguchi 	if (page != pagecache_page)
35580f792cf9SNaoya Horiguchi 		unlock_page(page);
35590f792cf9SNaoya Horiguchi 	put_page(page);
3560cb900f41SKirill A. Shutemov out_ptl:
3561cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
356257303d80SAndy Whitcroft 
356357303d80SAndy Whitcroft 	if (pagecache_page) {
356457303d80SAndy Whitcroft 		unlock_page(pagecache_page);
356557303d80SAndy Whitcroft 		put_page(pagecache_page);
356657303d80SAndy Whitcroft 	}
3567b4d1d99fSDavid Gibson out_mutex:
35688382d914SDavidlohr Bueso 	mutex_unlock(&htlb_fault_mutex_table[hash]);
35690f792cf9SNaoya Horiguchi 	/*
35700f792cf9SNaoya Horiguchi 	 * Generally it's safe to hold a refcount while waiting for the page lock.
35710f792cf9SNaoya Horiguchi 	 * But here we only wait to defer the next page fault and avoid a busy
35720f792cf9SNaoya Horiguchi 	 * loop; the page is not touched after being unlocked and before the
35730f792cf9SNaoya Horiguchi 	 * current page fault returns, so we are safe from accessing a freed
35740f792cf9SNaoya Horiguchi 	 * page even if we wait here without taking a refcount.
35750f792cf9SNaoya Horiguchi 	 */
35760f792cf9SNaoya Horiguchi 	if (need_wait_lock)
35770f792cf9SNaoya Horiguchi 		wait_on_page_locked(page);
35781e8f889bSDavid Gibson 	return ret;
357986e5216fSAdam Litke }
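/*
 * Illustrative user-space sketch (not part of this file) of what drives
 * hugetlb_fault()/hugetlb_no_page() on the shared path: the first touch of
 * each huge page of a hugetlbfs file allocates the page, inserts it into the
 * file's page cache and maps it; a second mapper would find it there via
 * find_lock_page() instead of allocating.  The "/dev/hugepages" mount point
 * and the 2MB page size are assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE (2UL * 1024 * 1024)	/* assumed default huge page size */

int main(void)
{
	int fd = open("/dev/hugepages/demo", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0 || ftruncate(fd, 2 * HPAGE) < 0) {
		perror("hugetlbfs");
		return 1;
	}
	p = mmap(NULL, 2 * HPAGE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	p[0] = 1;			/* first fault on huge page 0 */
	p[HPAGE] = 1;			/* first fault on huge page 1 */
	printf("faulted two shared huge pages\n");
	unlink("/dev/hugepages/demo");
	return 0;
}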
358086e5216fSAdam Litke 
358128a35716SMichel Lespinasse long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
358263551ae0SDavid Gibson 			 struct page **pages, struct vm_area_struct **vmas,
358328a35716SMichel Lespinasse 			 unsigned long *position, unsigned long *nr_pages,
358428a35716SMichel Lespinasse 			 long i, unsigned int flags)
358563551ae0SDavid Gibson {
3586d5d4b0aaSChen, Kenneth W 	unsigned long pfn_offset;
3587d5d4b0aaSChen, Kenneth W 	unsigned long vaddr = *position;
358828a35716SMichel Lespinasse 	unsigned long remainder = *nr_pages;
3589a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
359063551ae0SDavid Gibson 
359163551ae0SDavid Gibson 	while (vaddr < vma->vm_end && remainder) {
359263551ae0SDavid Gibson 		pte_t *pte;
3593cb900f41SKirill A. Shutemov 		spinlock_t *ptl = NULL;
35942a15efc9SHugh Dickins 		int absent;
359563551ae0SDavid Gibson 		struct page *page;
359663551ae0SDavid Gibson 
35974c887265SAdam Litke 		/*
359802057967SDavid Rientjes 		 * If we have a pending SIGKILL, don't keep faulting pages and
359902057967SDavid Rientjes 		 * potentially allocating memory.
360002057967SDavid Rientjes 		 */
360102057967SDavid Rientjes 		if (unlikely(fatal_signal_pending(current))) {
360202057967SDavid Rientjes 			remainder = 0;
360302057967SDavid Rientjes 			break;
360402057967SDavid Rientjes 		}
360502057967SDavid Rientjes 
360602057967SDavid Rientjes 		/*
36074c887265SAdam Litke 		 * Some archs (sparc64, sh*) have multiple pte_ts per
36082a15efc9SHugh Dickins 		 * hugepage.  We have to make sure we get the first one,
36094c887265SAdam Litke 		 * for the page indexing below to work.
3610cb900f41SKirill A. Shutemov 		 *
3611cb900f41SKirill A. Shutemov 		 * Note that the page table lock is not held when pte is null.
36124c887265SAdam Litke 		 */
3613a5516438SAndi Kleen 		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
3614cb900f41SKirill A. Shutemov 		if (pte)
3615cb900f41SKirill A. Shutemov 			ptl = huge_pte_lock(h, mm, pte);
36162a15efc9SHugh Dickins 		absent = !pte || huge_pte_none(huge_ptep_get(pte));
361763551ae0SDavid Gibson 
36182a15efc9SHugh Dickins 		/*
36192a15efc9SHugh Dickins 		 * When coredumping, it suits get_dump_page if we just return
36203ae77f43SHugh Dickins 		 * an error where there's an empty slot with no huge pagecache
36213ae77f43SHugh Dickins 		 * to back it.  This way, we avoid allocating a hugepage, and
36223ae77f43SHugh Dickins 		 * the sparse dumpfile avoids allocating disk blocks, but its
36233ae77f43SHugh Dickins 		 * huge holes still show up with zeroes where they need to be.
36242a15efc9SHugh Dickins 		 */
36253ae77f43SHugh Dickins 		if (absent && (flags & FOLL_DUMP) &&
36263ae77f43SHugh Dickins 		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
3627cb900f41SKirill A. Shutemov 			if (pte)
3628cb900f41SKirill A. Shutemov 				spin_unlock(ptl);
36292a15efc9SHugh Dickins 			remainder = 0;
36302a15efc9SHugh Dickins 			break;
36312a15efc9SHugh Dickins 		}
36322a15efc9SHugh Dickins 
36339cc3a5bdSNaoya Horiguchi 		/*
36349cc3a5bdSNaoya Horiguchi 		 * We need to call hugetlb_fault for both hugepages under migration
36359cc3a5bdSNaoya Horiguchi 		 * (in which case hugetlb_fault waits for the migration) and
36369cc3a5bdSNaoya Horiguchi 		 * hwpoisoned hugepages (in which case we need to prevent the
36379cc3a5bdSNaoya Horiguchi 		 * caller from accessing them). In order to do this, we use
36389cc3a5bdSNaoya Horiguchi 		 * is_swap_pte here instead of is_hugetlb_entry_migration and
36399cc3a5bdSNaoya Horiguchi 		 * is_hugetlb_entry_hwpoisoned. This is because it simply covers
36409cc3a5bdSNaoya Horiguchi 		 * both cases, and because we can't follow correct pages
36419cc3a5bdSNaoya Horiguchi 		 * directly from any kind of swap entry.
36429cc3a5bdSNaoya Horiguchi 		 */
36439cc3a5bdSNaoya Horiguchi 		if (absent || is_swap_pte(huge_ptep_get(pte)) ||
3644106c992aSGerald Schaefer 		    ((flags & FOLL_WRITE) &&
3645106c992aSGerald Schaefer 		      !huge_pte_write(huge_ptep_get(pte)))) {
36464c887265SAdam Litke 			int ret;
36474c887265SAdam Litke 
3648cb900f41SKirill A. Shutemov 			if (pte)
3649cb900f41SKirill A. Shutemov 				spin_unlock(ptl);
36502a15efc9SHugh Dickins 			ret = hugetlb_fault(mm, vma, vaddr,
36512a15efc9SHugh Dickins 				(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
3652a89182c7SAdam Litke 			if (!(ret & VM_FAULT_ERROR))
36534c887265SAdam Litke 				continue;
36544c887265SAdam Litke 
36551c59827dSHugh Dickins 			remainder = 0;
36561c59827dSHugh Dickins 			break;
36571c59827dSHugh Dickins 		}
365863551ae0SDavid Gibson 
3659a5516438SAndi Kleen 		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
36607f2e9525SGerald Schaefer 		page = pte_page(huge_ptep_get(pte));
3661d5d4b0aaSChen, Kenneth W same_page:
3662d6692183SChen, Kenneth W 		if (pages) {
366369d177c2SAndy Whitcroft 			pages[i] = mem_map_offset(page, pfn_offset);
3664a0368d4eSAndrea Arcangeli 			get_page_foll(pages[i]);
3665d6692183SChen, Kenneth W 		}
366663551ae0SDavid Gibson 
366763551ae0SDavid Gibson 		if (vmas)
366863551ae0SDavid Gibson 			vmas[i] = vma;
366963551ae0SDavid Gibson 
367063551ae0SDavid Gibson 		vaddr += PAGE_SIZE;
3671d5d4b0aaSChen, Kenneth W 		++pfn_offset;
367263551ae0SDavid Gibson 		--remainder;
367363551ae0SDavid Gibson 		++i;
3674d5d4b0aaSChen, Kenneth W 		if (vaddr < vma->vm_end && remainder &&
3675a5516438SAndi Kleen 				pfn_offset < pages_per_huge_page(h)) {
3676d5d4b0aaSChen, Kenneth W 			/*
3677d5d4b0aaSChen, Kenneth W 			 * We use pfn_offset to avoid touching the pageframes
3678d5d4b0aaSChen, Kenneth W 			 * of this compound page.
3679d5d4b0aaSChen, Kenneth W 			 */
3680d5d4b0aaSChen, Kenneth W 			goto same_page;
3681d5d4b0aaSChen, Kenneth W 		}
3682cb900f41SKirill A. Shutemov 		spin_unlock(ptl);
368363551ae0SDavid Gibson 	}
368428a35716SMichel Lespinasse 	*nr_pages = remainder;
368563551ae0SDavid Gibson 	*position = vaddr;
368663551ae0SDavid Gibson 
36872a15efc9SHugh Dickins 	return i ? i : -EFAULT;
368863551ae0SDavid Gibson }
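/*
 * Small arithmetic sketch of the subpage indexing follow_hugetlb_page() does
 * above: a single huge PTE backs many base pages, and pfn_offset selects
 * which tail page of the compound page a given user address corresponds to.
 * The 4KB base page and 2MB huge page sizes are assumptions.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define HPAGE_SHIFT	21				/* assumed 2MB huge pages */
#define HPAGE_MASK	(~((1UL << HPAGE_SHIFT) - 1))

int main(void)
{
	unsigned long vaddr = 0x7f0000212000UL;		/* arbitrary example address */
	unsigned long pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;

	/*
	 * follow_hugetlb_page() hands out pages[i] = head + pfn_offset and
	 * advances vaddr/pfn_offset one base page at a time, so one huge PTE
	 * can satisfy up to 512 GUP slots without another lookup.
	 */
	printf("subpage %lu of %lu within its huge page\n",
	       pfn_offset, 1UL << (HPAGE_SHIFT - PAGE_SHIFT));
	return 0;
}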
36898f860591SZhang, Yanmin 
36907da4d641SPeter Zijlstra unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
36918f860591SZhang, Yanmin 		unsigned long address, unsigned long end, pgprot_t newprot)
36928f860591SZhang, Yanmin {
36938f860591SZhang, Yanmin 	struct mm_struct *mm = vma->vm_mm;
36948f860591SZhang, Yanmin 	unsigned long start = address;
36958f860591SZhang, Yanmin 	pte_t *ptep;
36968f860591SZhang, Yanmin 	pte_t pte;
3697a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
36987da4d641SPeter Zijlstra 	unsigned long pages = 0;
36998f860591SZhang, Yanmin 
37008f860591SZhang, Yanmin 	BUG_ON(address >= end);
37018f860591SZhang, Yanmin 	flush_cache_range(vma, address, end);
37028f860591SZhang, Yanmin 
3703a5338093SRik van Riel 	mmu_notifier_invalidate_range_start(mm, start, end);
370483cde9e8SDavidlohr Bueso 	i_mmap_lock_write(vma->vm_file->f_mapping);
3705a5516438SAndi Kleen 	for (; address < end; address += huge_page_size(h)) {
3706cb900f41SKirill A. Shutemov 		spinlock_t *ptl;
37078f860591SZhang, Yanmin 		ptep = huge_pte_offset(mm, address);
37088f860591SZhang, Yanmin 		if (!ptep)
37098f860591SZhang, Yanmin 			continue;
3710cb900f41SKirill A. Shutemov 		ptl = huge_pte_lock(h, mm, ptep);
37117da4d641SPeter Zijlstra 		if (huge_pmd_unshare(mm, &address, ptep)) {
37127da4d641SPeter Zijlstra 			pages++;
3713cb900f41SKirill A. Shutemov 			spin_unlock(ptl);
371439dde65cSChen, Kenneth W 			continue;
37157da4d641SPeter Zijlstra 		}
3716a8bda28dSNaoya Horiguchi 		pte = huge_ptep_get(ptep);
3717a8bda28dSNaoya Horiguchi 		if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
3718a8bda28dSNaoya Horiguchi 			spin_unlock(ptl);
3719a8bda28dSNaoya Horiguchi 			continue;
3720a8bda28dSNaoya Horiguchi 		}
3721a8bda28dSNaoya Horiguchi 		if (unlikely(is_hugetlb_entry_migration(pte))) {
3722a8bda28dSNaoya Horiguchi 			swp_entry_t entry = pte_to_swp_entry(pte);
3723a8bda28dSNaoya Horiguchi 
3724a8bda28dSNaoya Horiguchi 			if (is_write_migration_entry(entry)) {
3725a8bda28dSNaoya Horiguchi 				pte_t newpte;
3726a8bda28dSNaoya Horiguchi 
3727a8bda28dSNaoya Horiguchi 				make_migration_entry_read(&entry);
3728a8bda28dSNaoya Horiguchi 				newpte = swp_entry_to_pte(entry);
3729a8bda28dSNaoya Horiguchi 				set_huge_pte_at(mm, address, ptep, newpte);
3730a8bda28dSNaoya Horiguchi 				pages++;
3731a8bda28dSNaoya Horiguchi 			}
3732a8bda28dSNaoya Horiguchi 			spin_unlock(ptl);
3733a8bda28dSNaoya Horiguchi 			continue;
3734a8bda28dSNaoya Horiguchi 		}
3735a8bda28dSNaoya Horiguchi 		if (!huge_pte_none(pte)) {
37368f860591SZhang, Yanmin 			pte = huge_ptep_get_and_clear(mm, address, ptep);
3737106c992aSGerald Schaefer 			pte = pte_mkhuge(huge_pte_modify(pte, newprot));
3738be7517d6STony Lu 			pte = arch_make_huge_pte(pte, vma, NULL, 0);
37398f860591SZhang, Yanmin 			set_huge_pte_at(mm, address, ptep, pte);
37407da4d641SPeter Zijlstra 			pages++;
37418f860591SZhang, Yanmin 		}
3742cb900f41SKirill A. Shutemov 		spin_unlock(ptl);
37438f860591SZhang, Yanmin 	}
3744d833352aSMel Gorman 	/*
3745c8c06efaSDavidlohr Bueso 	 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
3746d833352aSMel Gorman 	 * may have cleared our pud entry and done put_page on the page table:
3747c8c06efaSDavidlohr Bueso 	 * once we release i_mmap_rwsem, another task can do the final put_page
3748d833352aSMel Gorman 	 * and that page table can be reused and filled with junk.
3749d833352aSMel Gorman 	 */
37508f860591SZhang, Yanmin 	flush_tlb_range(vma, start, end);
375134ee645eSJoerg Roedel 	mmu_notifier_invalidate_range(mm, start, end);
375283cde9e8SDavidlohr Bueso 	i_mmap_unlock_write(vma->vm_file->f_mapping);
3753a5338093SRik van Riel 	mmu_notifier_invalidate_range_end(mm, start, end);
37547da4d641SPeter Zijlstra 
37557da4d641SPeter Zijlstra 	return pages << h->order;
37568f860591SZhang, Yanmin }
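/*
 * Illustrative user-space sketch (not part of this file): an mprotect() call
 * on a populated hugetlb mapping is what lands in hugetlb_change_protection()
 * above, which rewrites the range one huge PTE at a time and reports the
 * number of changed base pages (pages << h->order).  The 2MB page size and a
 * non-empty pool are assumptions.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

#define HPAGE (2UL * 1024 * 1024)	/* assumed default huge page size */

int main(void)
{
	char *p = mmap(NULL, HPAGE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	p[0] = 1;				/* populate the huge PTE */

	if (mprotect(p, HPAGE, PROT_READ) < 0) {	/* -> hugetlb_change_protection() */
		perror("mprotect");
		return 1;
	}
	printf("huge mapping is now read-only\n");
	return 0;
}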
37578f860591SZhang, Yanmin 
3758a1e78772SMel Gorman int hugetlb_reserve_pages(struct inode *inode,
3759a1e78772SMel Gorman 					long from, long to,
37605a6fe125SMel Gorman 					struct vm_area_struct *vma,
3761ca16d140SKOSAKI Motohiro 					vm_flags_t vm_flags)
3762e4e574b7SAdam Litke {
376317c9d12eSMel Gorman 	long ret, chg;
3764a5516438SAndi Kleen 	struct hstate *h = hstate_inode(inode);
376590481622SDavid Gibson 	struct hugepage_subpool *spool = subpool_inode(inode);
37669119a41eSJoonsoo Kim 	struct resv_map *resv_map;
37671c5ecae3SMike Kravetz 	long gbl_reserve;
3768e4e574b7SAdam Litke 
3769a1e78772SMel Gorman 	/*
377017c9d12eSMel Gorman 	 * Only apply hugepage reservation if asked. At fault time, an
377117c9d12eSMel Gorman 	 * attempt will be made for VM_NORESERVE to allocate a page
377290481622SDavid Gibson 	 * without using reserves.
377317c9d12eSMel Gorman 	 */
3774ca16d140SKOSAKI Motohiro 	if (vm_flags & VM_NORESERVE)
377517c9d12eSMel Gorman 		return 0;
377617c9d12eSMel Gorman 
377717c9d12eSMel Gorman 	/*
3778a1e78772SMel Gorman 	 * Shared mappings base their reservation on the number of pages that
3779a1e78772SMel Gorman 	 * are already allocated on behalf of the file. Private mappings need
3780a1e78772SMel Gorman 	 * to reserve the full area even if read-only, as mprotect() may be
3781a1e78772SMel Gorman 	 * called to make the mapping read-write. Assume !vma is a shm mapping.
3782a1e78772SMel Gorman 	 */
37839119a41eSJoonsoo Kim 	if (!vma || vma->vm_flags & VM_MAYSHARE) {
37844e35f483SJoonsoo Kim 		resv_map = inode_resv_map(inode);
37859119a41eSJoonsoo Kim 
37861406ec9bSJoonsoo Kim 		chg = region_chg(resv_map, from, to);
37879119a41eSJoonsoo Kim 
37889119a41eSJoonsoo Kim 	} else {
37899119a41eSJoonsoo Kim 		resv_map = resv_map_alloc();
37905a6fe125SMel Gorman 		if (!resv_map)
37915a6fe125SMel Gorman 			return -ENOMEM;
37925a6fe125SMel Gorman 
379317c9d12eSMel Gorman 		chg = to - from;
379417c9d12eSMel Gorman 
37955a6fe125SMel Gorman 		set_vma_resv_map(vma, resv_map);
37965a6fe125SMel Gorman 		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
37975a6fe125SMel Gorman 	}
37985a6fe125SMel Gorman 
3799c50ac050SDave Hansen 	if (chg < 0) {
3800c50ac050SDave Hansen 		ret = chg;
3801c50ac050SDave Hansen 		goto out_err;
3802c50ac050SDave Hansen 	}
380317c9d12eSMel Gorman 
38041c5ecae3SMike Kravetz 	/*
38051c5ecae3SMike Kravetz 	 * There must be enough pages in the subpool for the mapping. If
38061c5ecae3SMike Kravetz 	 * the subpool has a minimum size, there may be some global
38071c5ecae3SMike Kravetz 	 * reservations already in place (gbl_reserve).
38081c5ecae3SMike Kravetz 	 */
38091c5ecae3SMike Kravetz 	gbl_reserve = hugepage_subpool_get_pages(spool, chg);
38101c5ecae3SMike Kravetz 	if (gbl_reserve < 0) {
3811c50ac050SDave Hansen 		ret = -ENOSPC;
3812c50ac050SDave Hansen 		goto out_err;
3813c50ac050SDave Hansen 	}
381417c9d12eSMel Gorman 
381517c9d12eSMel Gorman 	/*
381617c9d12eSMel Gorman 	 * Check that enough hugepages are available for the reservation.
381790481622SDavid Gibson 	 * Hand the pages back to the subpool if there are not.
381817c9d12eSMel Gorman 	 */
38191c5ecae3SMike Kravetz 	ret = hugetlb_acct_memory(h, gbl_reserve);
382017c9d12eSMel Gorman 	if (ret < 0) {
38211c5ecae3SMike Kravetz 		/* put back original number of pages, chg */
38221c5ecae3SMike Kravetz 		(void)hugepage_subpool_put_pages(spool, chg);
3823c50ac050SDave Hansen 		goto out_err;
382417c9d12eSMel Gorman 	}
382517c9d12eSMel Gorman 
382617c9d12eSMel Gorman 	/*
382717c9d12eSMel Gorman 	 * Account for the reservations made. Shared mappings record regions
382817c9d12eSMel Gorman 	 * that have reservations as they are shared by multiple VMAs.
382917c9d12eSMel Gorman 	 * When the last VMA disappears, the region map says how much
383017c9d12eSMel Gorman 	 * the reservation was and the page cache tells how much of
383117c9d12eSMel Gorman 	 * the reservation was consumed. Private mappings are per-VMA and
383217c9d12eSMel Gorman 	 * only the consumed reservations are tracked. When the VMA
383317c9d12eSMel Gorman 	 * disappears, the original reservation is the VMA size and the
383417c9d12eSMel Gorman 	 * consumed reservations are stored in the map. Hence, nothing
383517c9d12eSMel Gorman 	 * else has to be done for private mappings here
383617c9d12eSMel Gorman 	 */
383733039678SMike Kravetz 	if (!vma || vma->vm_flags & VM_MAYSHARE) {
383833039678SMike Kravetz 		long add = region_add(resv_map, from, to);
383933039678SMike Kravetz 
384033039678SMike Kravetz 		if (unlikely(chg > add)) {
384133039678SMike Kravetz 			/*
384233039678SMike Kravetz 			 * pages in this range were added to the reserve
384333039678SMike Kravetz 			 * map between region_chg and region_add.  This
384433039678SMike Kravetz 			 * indicates a race with alloc_huge_page.  Adjust
384533039678SMike Kravetz 			 * the subpool and reserve counts modified above
384633039678SMike Kravetz 			 * based on the difference.
384733039678SMike Kravetz 			 */
384833039678SMike Kravetz 			long rsv_adjust;
384933039678SMike Kravetz 
385033039678SMike Kravetz 			rsv_adjust = hugepage_subpool_put_pages(spool,
385133039678SMike Kravetz 								chg - add);
385233039678SMike Kravetz 			hugetlb_acct_memory(h, -rsv_adjust);
385333039678SMike Kravetz 		}
385433039678SMike Kravetz 	}
3855a43a8c39SChen, Kenneth W 	return 0;
3856c50ac050SDave Hansen out_err:
38575e911373SMike Kravetz 	if (!vma || vma->vm_flags & VM_MAYSHARE)
38585e911373SMike Kravetz 		region_abort(resv_map, from, to);
3859f031dd27SJoonsoo Kim 	if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
3860f031dd27SJoonsoo Kim 		kref_put(&resv_map->refs, resv_map_release);
3861c50ac050SDave Hansen 	return ret;
3862a43a8c39SChen, Kenneth W }
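/*
 * Illustrative user-space sketch (not part of this file) of the reservation
 * made by hugetlb_reserve_pages() at mmap() time: while the mapping below
 * exists, HugePages_Rsvd in /proc/meminfo should grow by four, even though
 * nothing has been faulted in yet; MAP_NORESERVE would skip the charge.
 * Assumes a 2MB page size and at least four free huge pages in the pool.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

#define HPAGE (2UL * 1024 * 1024)	/* assumed default huge page size */

int main(void)
{
	char *p = mmap(NULL, 4 * HPAGE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");		/* not enough pages to reserve */
		return 1;
	}
	/* only for illustration: show the reserve charged for this mapping */
	system("grep HugePages_Rsvd /proc/meminfo");
	munmap(p, 4 * HPAGE);
	return 0;
}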
3863a43a8c39SChen, Kenneth W 
3864a43a8c39SChen, Kenneth W void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
3865a43a8c39SChen, Kenneth W {
3866a5516438SAndi Kleen 	struct hstate *h = hstate_inode(inode);
38674e35f483SJoonsoo Kim 	struct resv_map *resv_map = inode_resv_map(inode);
38689119a41eSJoonsoo Kim 	long chg = 0;
386990481622SDavid Gibson 	struct hugepage_subpool *spool = subpool_inode(inode);
38701c5ecae3SMike Kravetz 	long gbl_reserve;
387145c682a6SKen Chen 
38729119a41eSJoonsoo Kim 	if (resv_map)
38731406ec9bSJoonsoo Kim 		chg = region_truncate(resv_map, offset);
387445c682a6SKen Chen 	spin_lock(&inode->i_lock);
3875e4c6f8beSEric Sandeen 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
387645c682a6SKen Chen 	spin_unlock(&inode->i_lock);
387745c682a6SKen Chen 
38781c5ecae3SMike Kravetz 	/*
38791c5ecae3SMike Kravetz 	 * If the subpool has a minimum size, the number of global
38801c5ecae3SMike Kravetz 	 * reservations to be released may be adjusted.
38811c5ecae3SMike Kravetz 	 */
38821c5ecae3SMike Kravetz 	gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
38831c5ecae3SMike Kravetz 	hugetlb_acct_memory(h, -gbl_reserve);
3884a43a8c39SChen, Kenneth W }
388593f70f90SNaoya Horiguchi 
38863212b535SSteve Capper #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
38873212b535SSteve Capper static unsigned long page_table_shareable(struct vm_area_struct *svma,
38883212b535SSteve Capper 				struct vm_area_struct *vma,
38893212b535SSteve Capper 				unsigned long addr, pgoff_t idx)
38903212b535SSteve Capper {
38913212b535SSteve Capper 	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
38923212b535SSteve Capper 				svma->vm_start;
38933212b535SSteve Capper 	unsigned long sbase = saddr & PUD_MASK;
38943212b535SSteve Capper 	unsigned long s_end = sbase + PUD_SIZE;
38953212b535SSteve Capper 
38963212b535SSteve Capper 	/* Allow segments to share if only one is marked locked */
38973212b535SSteve Capper 	unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
38983212b535SSteve Capper 	unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
38993212b535SSteve Capper 
39003212b535SSteve Capper 	/*
39013212b535SSteve Capper 	 * Match the virtual addresses, permissions and the alignment of the
39023212b535SSteve Capper 	 * page table page.
39033212b535SSteve Capper 	 */
39043212b535SSteve Capper 	if (pmd_index(addr) != pmd_index(saddr) ||
39053212b535SSteve Capper 	    vm_flags != svm_flags ||
39063212b535SSteve Capper 	    sbase < svma->vm_start || svma->vm_end < s_end)
39073212b535SSteve Capper 		return 0;
39083212b535SSteve Capper 
39093212b535SSteve Capper 	return saddr;
39103212b535SSteve Capper }
39113212b535SSteve Capper 
391231aafb45SNicholas Krause static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
39133212b535SSteve Capper {
39143212b535SSteve Capper 	unsigned long base = addr & PUD_MASK;
39153212b535SSteve Capper 	unsigned long end = base + PUD_SIZE;
39163212b535SSteve Capper 
39173212b535SSteve Capper 	/*
39183212b535SSteve Capper 	 * Check for proper vm_flags and page table alignment.
39193212b535SSteve Capper 	 */
39203212b535SSteve Capper 	if (vma->vm_flags & VM_MAYSHARE &&
39213212b535SSteve Capper 	    vma->vm_start <= base && end <= vma->vm_end)
392231aafb45SNicholas Krause 		return true;
392331aafb45SNicholas Krause 	return false;
39243212b535SSteve Capper }
39253212b535SSteve Capper 
39263212b535SSteve Capper /*
39273212b535SSteve Capper  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
39283212b535SSteve Capper  * and returns the corresponding pte. While this is not necessary for the
39293212b535SSteve Capper  * !shared pmd case because we can allocate the pmd later as well, it makes the
39303212b535SSteve Capper  * code much cleaner. pmd allocation is essential for the shared case because
3931c8c06efaSDavidlohr Bueso  * pud has to be populated inside the same i_mmap_rwsem section - otherwise
39323212b535SSteve Capper  * racing tasks could either miss the sharing (see huge_pte_offset) or select a
39333212b535SSteve Capper  * bad pmd for sharing.
39343212b535SSteve Capper  */
39353212b535SSteve Capper pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
39363212b535SSteve Capper {
39373212b535SSteve Capper 	struct vm_area_struct *vma = find_vma(mm, addr);
39383212b535SSteve Capper 	struct address_space *mapping = vma->vm_file->f_mapping;
39393212b535SSteve Capper 	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
39403212b535SSteve Capper 			vma->vm_pgoff;
39413212b535SSteve Capper 	struct vm_area_struct *svma;
39423212b535SSteve Capper 	unsigned long saddr;
39433212b535SSteve Capper 	pte_t *spte = NULL;
39443212b535SSteve Capper 	pte_t *pte;
3945cb900f41SKirill A. Shutemov 	spinlock_t *ptl;
39463212b535SSteve Capper 
39473212b535SSteve Capper 	if (!vma_shareable(vma, addr))
39483212b535SSteve Capper 		return (pte_t *)pmd_alloc(mm, pud, addr);
39493212b535SSteve Capper 
395083cde9e8SDavidlohr Bueso 	i_mmap_lock_write(mapping);
39513212b535SSteve Capper 	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
39523212b535SSteve Capper 		if (svma == vma)
39533212b535SSteve Capper 			continue;
39543212b535SSteve Capper 
39553212b535SSteve Capper 		saddr = page_table_shareable(svma, vma, addr, idx);
39563212b535SSteve Capper 		if (saddr) {
39573212b535SSteve Capper 			spte = huge_pte_offset(svma->vm_mm, saddr);
39583212b535SSteve Capper 			if (spte) {
3959dc6c9a35SKirill A. Shutemov 				mm_inc_nr_pmds(mm);
39603212b535SSteve Capper 				get_page(virt_to_page(spte));
39613212b535SSteve Capper 				break;
39623212b535SSteve Capper 			}
39633212b535SSteve Capper 		}
39643212b535SSteve Capper 	}
39653212b535SSteve Capper 
39663212b535SSteve Capper 	if (!spte)
39673212b535SSteve Capper 		goto out;
39683212b535SSteve Capper 
3969cb900f41SKirill A. Shutemov 	ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
3970cb900f41SKirill A. Shutemov 	spin_lock(ptl);
3971dc6c9a35SKirill A. Shutemov 	if (pud_none(*pud)) {
39723212b535SSteve Capper 		pud_populate(mm, pud,
39733212b535SSteve Capper 				(pmd_t *)((unsigned long)spte & PAGE_MASK));
3974dc6c9a35SKirill A. Shutemov 	} else {
39753212b535SSteve Capper 		put_page(virt_to_page(spte));
3976dc6c9a35SKirill A. Shutemov 		mm_inc_nr_pmds(mm);
3977dc6c9a35SKirill A. Shutemov 	}
3978cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
39793212b535SSteve Capper out:
39803212b535SSteve Capper 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
398183cde9e8SDavidlohr Bueso 	i_mmap_unlock_write(mapping);
39823212b535SSteve Capper 	return pte;
39833212b535SSteve Capper }
39843212b535SSteve Capper 
39853212b535SSteve Capper /*
39863212b535SSteve Capper  * unmap huge page backed by shared pte.
39873212b535SSteve Capper  * Unmap a huge page backed by a shared pte.
39883212b535SSteve Capper  *
39893212b535SSteve Capper  * The hugetlb pte page is ref counted at the time of mapping.  If the pte is
39903212b535SSteve Capper  * shared, as indicated by page_count > 1, unmap is achieved by clearing the pud and
39913212b535SSteve Capper  *
3992cb900f41SKirill A. Shutemov  * called with page table lock held.
39933212b535SSteve Capper  *
39943212b535SSteve Capper  * returns: 1 successfully unmapped a shared pte page
39953212b535SSteve Capper  *	    0 the underlying pte page is not shared, or it is the last user
39963212b535SSteve Capper  */
39973212b535SSteve Capper int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
39983212b535SSteve Capper {
39993212b535SSteve Capper 	pgd_t *pgd = pgd_offset(mm, *addr);
40003212b535SSteve Capper 	pud_t *pud = pud_offset(pgd, *addr);
40013212b535SSteve Capper 
40023212b535SSteve Capper 	BUG_ON(page_count(virt_to_page(ptep)) == 0);
40033212b535SSteve Capper 	if (page_count(virt_to_page(ptep)) == 1)
40043212b535SSteve Capper 		return 0;
40053212b535SSteve Capper 
40063212b535SSteve Capper 	pud_clear(pud);
40073212b535SSteve Capper 	put_page(virt_to_page(ptep));
4008dc6c9a35SKirill A. Shutemov 	mm_dec_nr_pmds(mm);
40093212b535SSteve Capper 	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
40103212b535SSteve Capper 	return 1;
40113212b535SSteve Capper }
40129e5fc74cSSteve Capper #define want_pmd_share()	(1)
40139e5fc74cSSteve Capper #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
40149e5fc74cSSteve Capper pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
40159e5fc74cSSteve Capper {
40169e5fc74cSSteve Capper 	return NULL;
40179e5fc74cSSteve Capper }
4018e81f2d22SZhang Zhen 
4019e81f2d22SZhang Zhen int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
4020e81f2d22SZhang Zhen {
4021e81f2d22SZhang Zhen 	return 0;
4022e81f2d22SZhang Zhen }
40239e5fc74cSSteve Capper #define want_pmd_share()	(0)
40243212b535SSteve Capper #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
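/*
 * Arithmetic-only sketch of the alignment part of the sharing test used by
 * page_table_shareable()/vma_shareable() above: a PUD-sized, PUD-aligned
 * window around the faulting address must lie entirely inside the vma before
 * a pmd page can be shared.  The 1GB PUD_SIZE is an assumption (x86-64 with
 * 2MB huge pages); the real test also requires VM_MAYSHARE and matching flags.
 */
#include <stdio.h>

#define PUD_SIZE (1UL << 30)		/* assumed 1GB pud coverage */
#define PUD_MASK (~(PUD_SIZE - 1))

static int range_shareable(unsigned long vm_start, unsigned long vm_end,
			   unsigned long addr)
{
	unsigned long base = addr & PUD_MASK;
	unsigned long end = base + PUD_SIZE;

	/* mirrors vma_shareable(): the whole pud range must sit inside the vma */
	return vm_start <= base && end <= vm_end;
}

int main(void)
{
	/* 2GB mapping starting on a 1GB boundary: shareable */
	printf("%d\n", range_shareable(0x40000000UL, 0xc0000000UL, 0x50000000UL));
	/* same size shifted by 2MB: the pud window spills outside the vma */
	printf("%d\n", range_shareable(0x40200000UL, 0xc0200000UL, 0x50000000UL));
	return 0;
}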
40253212b535SSteve Capper 
40269e5fc74cSSteve Capper #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
40279e5fc74cSSteve Capper pte_t *huge_pte_alloc(struct mm_struct *mm,
40289e5fc74cSSteve Capper 			unsigned long addr, unsigned long sz)
40299e5fc74cSSteve Capper {
40309e5fc74cSSteve Capper 	pgd_t *pgd;
40319e5fc74cSSteve Capper 	pud_t *pud;
40329e5fc74cSSteve Capper 	pte_t *pte = NULL;
40339e5fc74cSSteve Capper 
40349e5fc74cSSteve Capper 	pgd = pgd_offset(mm, addr);
40359e5fc74cSSteve Capper 	pud = pud_alloc(mm, pgd, addr);
40369e5fc74cSSteve Capper 	if (pud) {
40379e5fc74cSSteve Capper 		if (sz == PUD_SIZE) {
40389e5fc74cSSteve Capper 			pte = (pte_t *)pud;
40399e5fc74cSSteve Capper 		} else {
40409e5fc74cSSteve Capper 			BUG_ON(sz != PMD_SIZE);
40419e5fc74cSSteve Capper 			if (want_pmd_share() && pud_none(*pud))
40429e5fc74cSSteve Capper 				pte = huge_pmd_share(mm, addr, pud);
40439e5fc74cSSteve Capper 			else
40449e5fc74cSSteve Capper 				pte = (pte_t *)pmd_alloc(mm, pud, addr);
40459e5fc74cSSteve Capper 		}
40469e5fc74cSSteve Capper 	}
40479e5fc74cSSteve Capper 	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
40489e5fc74cSSteve Capper 
40499e5fc74cSSteve Capper 	return pte;
40509e5fc74cSSteve Capper }
40519e5fc74cSSteve Capper 
40529e5fc74cSSteve Capper pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
40539e5fc74cSSteve Capper {
40549e5fc74cSSteve Capper 	pgd_t *pgd;
40559e5fc74cSSteve Capper 	pud_t *pud;
40569e5fc74cSSteve Capper 	pmd_t *pmd = NULL;
40579e5fc74cSSteve Capper 
40589e5fc74cSSteve Capper 	pgd = pgd_offset(mm, addr);
40599e5fc74cSSteve Capper 	if (pgd_present(*pgd)) {
40609e5fc74cSSteve Capper 		pud = pud_offset(pgd, addr);
40619e5fc74cSSteve Capper 		if (pud_present(*pud)) {
40629e5fc74cSSteve Capper 			if (pud_huge(*pud))
40639e5fc74cSSteve Capper 				return (pte_t *)pud;
40649e5fc74cSSteve Capper 			pmd = pmd_offset(pud, addr);
40659e5fc74cSSteve Capper 		}
40669e5fc74cSSteve Capper 	}
40679e5fc74cSSteve Capper 	return (pte_t *) pmd;
40689e5fc74cSSteve Capper }
40699e5fc74cSSteve Capper 
407061f77edaSNaoya Horiguchi #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
407161f77edaSNaoya Horiguchi 
407261f77edaSNaoya Horiguchi /*
407361f77edaSNaoya Horiguchi  * These functions are overridable if your architecture needs its own
407461f77edaSNaoya Horiguchi  * behavior.
407561f77edaSNaoya Horiguchi  */
407661f77edaSNaoya Horiguchi struct page * __weak
407761f77edaSNaoya Horiguchi follow_huge_addr(struct mm_struct *mm, unsigned long address,
407861f77edaSNaoya Horiguchi 			      int write)
407961f77edaSNaoya Horiguchi {
408061f77edaSNaoya Horiguchi 	return ERR_PTR(-EINVAL);
408161f77edaSNaoya Horiguchi }
408261f77edaSNaoya Horiguchi 
408361f77edaSNaoya Horiguchi struct page * __weak
40849e5fc74cSSteve Capper follow_huge_pmd(struct mm_struct *mm, unsigned long address,
4085e66f17ffSNaoya Horiguchi 		pmd_t *pmd, int flags)
40869e5fc74cSSteve Capper {
4087e66f17ffSNaoya Horiguchi 	struct page *page = NULL;
4088e66f17ffSNaoya Horiguchi 	spinlock_t *ptl;
4089e66f17ffSNaoya Horiguchi retry:
4090e66f17ffSNaoya Horiguchi 	ptl = pmd_lockptr(mm, pmd);
4091e66f17ffSNaoya Horiguchi 	spin_lock(ptl);
4092e66f17ffSNaoya Horiguchi 	/*
4093e66f17ffSNaoya Horiguchi 	 * Make sure that the address range covered by this pmd is not
4094e66f17ffSNaoya Horiguchi 	 * unmapped by other threads.
4095e66f17ffSNaoya Horiguchi 	 */
4096e66f17ffSNaoya Horiguchi 	if (!pmd_huge(*pmd))
4097e66f17ffSNaoya Horiguchi 		goto out;
4098e66f17ffSNaoya Horiguchi 	if (pmd_present(*pmd)) {
409997534127SGerald Schaefer 		page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
4100e66f17ffSNaoya Horiguchi 		if (flags & FOLL_GET)
4101e66f17ffSNaoya Horiguchi 			get_page(page);
4102e66f17ffSNaoya Horiguchi 	} else {
4103e66f17ffSNaoya Horiguchi 		if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) {
4104e66f17ffSNaoya Horiguchi 			spin_unlock(ptl);
4105e66f17ffSNaoya Horiguchi 			__migration_entry_wait(mm, (pte_t *)pmd, ptl);
4106e66f17ffSNaoya Horiguchi 			goto retry;
4107e66f17ffSNaoya Horiguchi 		}
4108e66f17ffSNaoya Horiguchi 		/*
4109e66f17ffSNaoya Horiguchi 		 * A hwpoisoned entry is treated as no_page_table in
4110e66f17ffSNaoya Horiguchi 		 * follow_page_mask().
4111e66f17ffSNaoya Horiguchi 		 */
4112e66f17ffSNaoya Horiguchi 	}
4113e66f17ffSNaoya Horiguchi out:
4114e66f17ffSNaoya Horiguchi 	spin_unlock(ptl);
41159e5fc74cSSteve Capper 	return page;
41169e5fc74cSSteve Capper }
41179e5fc74cSSteve Capper 
411861f77edaSNaoya Horiguchi struct page * __weak
41199e5fc74cSSteve Capper follow_huge_pud(struct mm_struct *mm, unsigned long address,
4120e66f17ffSNaoya Horiguchi 		pud_t *pud, int flags)
41219e5fc74cSSteve Capper {
4122e66f17ffSNaoya Horiguchi 	if (flags & FOLL_GET)
4123e66f17ffSNaoya Horiguchi 		return NULL;
41249e5fc74cSSteve Capper 
4125e66f17ffSNaoya Horiguchi 	return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
41269e5fc74cSSteve Capper }
41279e5fc74cSSteve Capper 
4128d5bd9106SAndi Kleen #ifdef CONFIG_MEMORY_FAILURE
4129d5bd9106SAndi Kleen 
413093f70f90SNaoya Horiguchi /*
413193f70f90SNaoya Horiguchi  * This function is called from the memory failure code.
413293f70f90SNaoya Horiguchi  * Assume the caller holds the page lock of the head page.
413393f70f90SNaoya Horiguchi  */
41346de2b1aaSNaoya Horiguchi int dequeue_hwpoisoned_huge_page(struct page *hpage)
413593f70f90SNaoya Horiguchi {
413693f70f90SNaoya Horiguchi 	struct hstate *h = page_hstate(hpage);
413793f70f90SNaoya Horiguchi 	int nid = page_to_nid(hpage);
41386de2b1aaSNaoya Horiguchi 	int ret = -EBUSY;
413993f70f90SNaoya Horiguchi 
414093f70f90SNaoya Horiguchi 	spin_lock(&hugetlb_lock);
41417e1f049eSNaoya Horiguchi 	/*
41427e1f049eSNaoya Horiguchi 	 * Just checking !page_huge_active is not enough, because that could be
41437e1f049eSNaoya Horiguchi 	 * an isolated/hwpoisoned hugepage (which has a >0 refcount).
41447e1f049eSNaoya Horiguchi 	 */
41457e1f049eSNaoya Horiguchi 	if (!page_huge_active(hpage) && !page_count(hpage)) {
414656f2fb14SNaoya Horiguchi 		/*
414756f2fb14SNaoya Horiguchi 		 * A hwpoisoned hugepage isn't linked to the activelist or freelist,
414856f2fb14SNaoya Horiguchi 		 * but a dangling hpage->lru can trigger list-debug warnings
414956f2fb14SNaoya Horiguchi 		 * (this happens when we call unpoison_memory() on it),
415056f2fb14SNaoya Horiguchi 		 * so let it point to itself with list_del_init().
415156f2fb14SNaoya Horiguchi 		 */
415256f2fb14SNaoya Horiguchi 		list_del_init(&hpage->lru);
41538c6c2ecbSNaoya Horiguchi 		set_page_refcounted(hpage);
415493f70f90SNaoya Horiguchi 		h->free_huge_pages--;
415593f70f90SNaoya Horiguchi 		h->free_huge_pages_node[nid]--;
41566de2b1aaSNaoya Horiguchi 		ret = 0;
415793f70f90SNaoya Horiguchi 	}
41586de2b1aaSNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
41596de2b1aaSNaoya Horiguchi 	return ret;
41606de2b1aaSNaoya Horiguchi }
41616de2b1aaSNaoya Horiguchi #endif
416231caf665SNaoya Horiguchi 
416331caf665SNaoya Horiguchi bool isolate_huge_page(struct page *page, struct list_head *list)
416431caf665SNaoya Horiguchi {
4165bcc54222SNaoya Horiguchi 	bool ret = true;
4166bcc54222SNaoya Horiguchi 
4167309381feSSasha Levin 	VM_BUG_ON_PAGE(!PageHead(page), page);
416831caf665SNaoya Horiguchi 	spin_lock(&hugetlb_lock);
4169bcc54222SNaoya Horiguchi 	if (!page_huge_active(page) || !get_page_unless_zero(page)) {
4170bcc54222SNaoya Horiguchi 		ret = false;
4171bcc54222SNaoya Horiguchi 		goto unlock;
4172bcc54222SNaoya Horiguchi 	}
4173bcc54222SNaoya Horiguchi 	clear_page_huge_active(page);
417431caf665SNaoya Horiguchi 	list_move_tail(&page->lru, list);
4175bcc54222SNaoya Horiguchi unlock:
417631caf665SNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
4177bcc54222SNaoya Horiguchi 	return ret;
417831caf665SNaoya Horiguchi }
417931caf665SNaoya Horiguchi 
418031caf665SNaoya Horiguchi void putback_active_hugepage(struct page *page)
418131caf665SNaoya Horiguchi {
4182309381feSSasha Levin 	VM_BUG_ON_PAGE(!PageHead(page), page);
418331caf665SNaoya Horiguchi 	spin_lock(&hugetlb_lock);
4184bcc54222SNaoya Horiguchi 	set_page_huge_active(page);
418531caf665SNaoya Horiguchi 	list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
418631caf665SNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
418731caf665SNaoya Horiguchi 	put_page(page);
418831caf665SNaoya Horiguchi }
4189