xref: /openbmc/linux/mm/hugetlb.c (revision 649920c6)
11da177e4SLinus Torvalds /*
21da177e4SLinus Torvalds  * Generic hugetlb support.
36d49e352SNadia Yvette Chambers  * (C) Nadia Yvette Chambers, April 2004
41da177e4SLinus Torvalds  */
51da177e4SLinus Torvalds #include <linux/list.h>
61da177e4SLinus Torvalds #include <linux/init.h>
71da177e4SLinus Torvalds #include <linux/mm.h>
8e1759c21SAlexey Dobriyan #include <linux/seq_file.h>
91da177e4SLinus Torvalds #include <linux/sysctl.h>
101da177e4SLinus Torvalds #include <linux/highmem.h>
11cddb8a5cSAndrea Arcangeli #include <linux/mmu_notifier.h>
121da177e4SLinus Torvalds #include <linux/nodemask.h>
1363551ae0SDavid Gibson #include <linux/pagemap.h>
145da7ca86SChristoph Lameter #include <linux/mempolicy.h>
153b32123dSGideon Israel Dsouza #include <linux/compiler.h>
16aea47ff3SChristoph Lameter #include <linux/cpuset.h>
173935baa9SDavid Gibson #include <linux/mutex.h>
18aa888a74SAndi Kleen #include <linux/bootmem.h>
19a3437870SNishanth Aravamudan #include <linux/sysfs.h>
205a0e3ad6STejun Heo #include <linux/slab.h>
210fe6e20bSNaoya Horiguchi #include <linux/rmap.h>
22fd6a03edSNaoya Horiguchi #include <linux/swap.h>
23fd6a03edSNaoya Horiguchi #include <linux/swapops.h>
24c8721bbbSNaoya Horiguchi #include <linux/page-isolation.h>
258382d914SDavidlohr Bueso #include <linux/jhash.h>
26d6606683SLinus Torvalds 
2763551ae0SDavid Gibson #include <asm/page.h>
2863551ae0SDavid Gibson #include <asm/pgtable.h>
2924669e58SAneesh Kumar K.V #include <asm/tlb.h>
3063551ae0SDavid Gibson 
3124669e58SAneesh Kumar K.V #include <linux/io.h>
3263551ae0SDavid Gibson #include <linux/hugetlb.h>
339dd540e2SAneesh Kumar K.V #include <linux/hugetlb_cgroup.h>
349a305230SLee Schermerhorn #include <linux/node.h>
357835e98bSNick Piggin #include "internal.h"
361da177e4SLinus Torvalds 
37753162cdSAndrey Ryabinin int hugepages_treat_as_movable;
38a5516438SAndi Kleen 
39c3f38a38SAneesh Kumar K.V int hugetlb_max_hstate __read_mostly;
40e5ff2159SAndi Kleen unsigned int default_hstate_idx;
41e5ff2159SAndi Kleen struct hstate hstates[HUGE_MAX_HSTATE];
42641844f5SNaoya Horiguchi /*
43641844f5SNaoya Horiguchi  * Minimum page order among possible hugepage sizes, set to a proper value
44641844f5SNaoya Horiguchi  * at boot time.
45641844f5SNaoya Horiguchi  */
46641844f5SNaoya Horiguchi static unsigned int minimum_order __read_mostly = UINT_MAX;
47e5ff2159SAndi Kleen 
4853ba51d2SJon Tollefson __initdata LIST_HEAD(huge_boot_pages);
4953ba51d2SJon Tollefson 
50e5ff2159SAndi Kleen /* for command line parsing */
51e5ff2159SAndi Kleen static struct hstate * __initdata parsed_hstate;
52e5ff2159SAndi Kleen static unsigned long __initdata default_hstate_max_huge_pages;
53e11bfbfcSNick Piggin static unsigned long __initdata default_hstate_size;
549fee021dSVaishali Thakkar static bool __initdata parsed_valid_hugepagesz = true;
55e5ff2159SAndi Kleen 
563935baa9SDavid Gibson /*
5731caf665SNaoya Horiguchi  * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
5831caf665SNaoya Horiguchi  * free_huge_pages, and surplus_huge_pages.
593935baa9SDavid Gibson  */
60c3f38a38SAneesh Kumar K.V DEFINE_SPINLOCK(hugetlb_lock);
610bd0f9fbSEric Paris 
628382d914SDavidlohr Bueso /*
638382d914SDavidlohr Bueso  * Serializes faults on the same logical page.  This is used to
648382d914SDavidlohr Bueso  * prevent spurious OOMs when the hugepage pool is fully utilized.
658382d914SDavidlohr Bueso  */
668382d914SDavidlohr Bueso static int num_fault_mutexes;
67c672c7f2SMike Kravetz struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
688382d914SDavidlohr Bueso 
697ca02d0aSMike Kravetz /* Forward declaration */
707ca02d0aSMike Kravetz static int hugetlb_acct_memory(struct hstate *h, long delta);
717ca02d0aSMike Kravetz 
7290481622SDavid Gibson static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
7390481622SDavid Gibson {
7490481622SDavid Gibson 	bool free = (spool->count == 0) && (spool->used_hpages == 0);
7590481622SDavid Gibson 
7690481622SDavid Gibson 	spin_unlock(&spool->lock);
7790481622SDavid Gibson 
7890481622SDavid Gibson 	/* If no pages are used, and no other handles to the subpool
797ca02d0aSMike Kravetz 	 * remain, give up any reservations based on minimum size and
807ca02d0aSMike Kravetz 	 * free the subpool */
817ca02d0aSMike Kravetz 	if (free) {
827ca02d0aSMike Kravetz 		if (spool->min_hpages != -1)
837ca02d0aSMike Kravetz 			hugetlb_acct_memory(spool->hstate,
847ca02d0aSMike Kravetz 						-spool->min_hpages);
8590481622SDavid Gibson 		kfree(spool);
8690481622SDavid Gibson 	}
877ca02d0aSMike Kravetz }
8890481622SDavid Gibson 
897ca02d0aSMike Kravetz struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
907ca02d0aSMike Kravetz 						long min_hpages)
9190481622SDavid Gibson {
9290481622SDavid Gibson 	struct hugepage_subpool *spool;
9390481622SDavid Gibson 
94c6a91820SMike Kravetz 	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
9590481622SDavid Gibson 	if (!spool)
9690481622SDavid Gibson 		return NULL;
9790481622SDavid Gibson 
9890481622SDavid Gibson 	spin_lock_init(&spool->lock);
9990481622SDavid Gibson 	spool->count = 1;
1007ca02d0aSMike Kravetz 	spool->max_hpages = max_hpages;
1017ca02d0aSMike Kravetz 	spool->hstate = h;
1027ca02d0aSMike Kravetz 	spool->min_hpages = min_hpages;
1037ca02d0aSMike Kravetz 
1047ca02d0aSMike Kravetz 	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
1057ca02d0aSMike Kravetz 		kfree(spool);
1067ca02d0aSMike Kravetz 		return NULL;
1077ca02d0aSMike Kravetz 	}
1087ca02d0aSMike Kravetz 	spool->rsv_hpages = min_hpages;
10990481622SDavid Gibson 
11090481622SDavid Gibson 	return spool;
11190481622SDavid Gibson }
11290481622SDavid Gibson 
11390481622SDavid Gibson void hugepage_put_subpool(struct hugepage_subpool *spool)
11490481622SDavid Gibson {
11590481622SDavid Gibson 	spin_lock(&spool->lock);
11690481622SDavid Gibson 	BUG_ON(!spool->count);
11790481622SDavid Gibson 	spool->count--;
11890481622SDavid Gibson 	unlock_or_release_subpool(spool);
11990481622SDavid Gibson }
12090481622SDavid Gibson 
1211c5ecae3SMike Kravetz /*
1221c5ecae3SMike Kravetz  * Subpool accounting for allocating and reserving pages.
1231c5ecae3SMike Kravetz  * Return -ENOMEM if there are not enough resources to satisfy
1241c5ecae3SMike Kravetz  * the request.  Otherwise, return the number of pages by which the
1251c5ecae3SMike Kravetz  * global pools must be adjusted (upward).  The returned value may
1261c5ecae3SMike Kravetz  * only be different than the passed value (delta) in the case where
1271c5ecae3SMike Kravetz  * a subpool minimum size must be maintained.
1281c5ecae3SMike Kravetz  */
1291c5ecae3SMike Kravetz static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
13090481622SDavid Gibson 				      long delta)
13190481622SDavid Gibson {
1321c5ecae3SMike Kravetz 	long ret = delta;
13390481622SDavid Gibson 
13490481622SDavid Gibson 	if (!spool)
1351c5ecae3SMike Kravetz 		return ret;
13690481622SDavid Gibson 
13790481622SDavid Gibson 	spin_lock(&spool->lock);
13890481622SDavid Gibson 
1391c5ecae3SMike Kravetz 	if (spool->max_hpages != -1) {		/* maximum size accounting */
1401c5ecae3SMike Kravetz 		if ((spool->used_hpages + delta) <= spool->max_hpages)
1411c5ecae3SMike Kravetz 			spool->used_hpages += delta;
1421c5ecae3SMike Kravetz 		else {
1431c5ecae3SMike Kravetz 			ret = -ENOMEM;
1441c5ecae3SMike Kravetz 			goto unlock_ret;
1451c5ecae3SMike Kravetz 		}
1461c5ecae3SMike Kravetz 	}
1471c5ecae3SMike Kravetz 
14809a95e29SMike Kravetz 	/* minimum size accounting */
14909a95e29SMike Kravetz 	if (spool->min_hpages != -1 && spool->rsv_hpages) {
1501c5ecae3SMike Kravetz 		if (delta > spool->rsv_hpages) {
1511c5ecae3SMike Kravetz 			/*
1521c5ecae3SMike Kravetz 			 * Asking for more reserves than those already taken on
1531c5ecae3SMike Kravetz 			 * behalf of subpool.  Return difference.
1541c5ecae3SMike Kravetz 			 */
1551c5ecae3SMike Kravetz 			ret = delta - spool->rsv_hpages;
1561c5ecae3SMike Kravetz 			spool->rsv_hpages = 0;
1571c5ecae3SMike Kravetz 		} else {
1581c5ecae3SMike Kravetz 			ret = 0;	/* reserves already accounted for */
1591c5ecae3SMike Kravetz 			spool->rsv_hpages -= delta;
1601c5ecae3SMike Kravetz 		}
1611c5ecae3SMike Kravetz 	}
1621c5ecae3SMike Kravetz 
1631c5ecae3SMike Kravetz unlock_ret:
1641c5ecae3SMike Kravetz 	spin_unlock(&spool->lock);
16590481622SDavid Gibson 	return ret;
16690481622SDavid Gibson }
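/*
 * Worked example for hugepage_subpool_get_pages() (illustrative values,
 * not taken from a real workload): consider a subpool created with
 * min_hpages == 10, currently rsv_hpages == 10, and no maximum limit.
 *
 *   - get_pages(spool, 3):  3 <= rsv_hpages, so rsv_hpages drops to 7 and
 *     0 is returned; the request is fully covered by the subpool's
 *     reserved minimum and the global pool is not touched.
 *   - get_pages(spool, 12): only 7 reserved pages remain, so rsv_hpages
 *     drops to 0 and 12 - 7 = 5 is returned; the caller must charge 5
 *     pages against the global pool.
 */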
16790481622SDavid Gibson 
1681c5ecae3SMike Kravetz /*
1691c5ecae3SMike Kravetz  * Subpool accounting for freeing and unreserving pages.
1701c5ecae3SMike Kravetz  * Return the number of global page reservations that must be dropped.
1711c5ecae3SMike Kravetz  * The return value may only be different than the passed value (delta)
1721c5ecae3SMike Kravetz  * in the case where a subpool minimum size must be maintained.
1731c5ecae3SMike Kravetz  */
1741c5ecae3SMike Kravetz static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
17590481622SDavid Gibson 				       long delta)
17690481622SDavid Gibson {
1771c5ecae3SMike Kravetz 	long ret = delta;
1781c5ecae3SMike Kravetz 
17990481622SDavid Gibson 	if (!spool)
1801c5ecae3SMike Kravetz 		return delta;
18190481622SDavid Gibson 
18290481622SDavid Gibson 	spin_lock(&spool->lock);
1831c5ecae3SMike Kravetz 
1841c5ecae3SMike Kravetz 	if (spool->max_hpages != -1)		/* maximum size accounting */
18590481622SDavid Gibson 		spool->used_hpages -= delta;
1861c5ecae3SMike Kravetz 
18709a95e29SMike Kravetz 	 /* minimum size accounting */
18809a95e29SMike Kravetz 	if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
1891c5ecae3SMike Kravetz 		if (spool->rsv_hpages + delta <= spool->min_hpages)
1901c5ecae3SMike Kravetz 			ret = 0;
1911c5ecae3SMike Kravetz 		else
1921c5ecae3SMike Kravetz 			ret = spool->rsv_hpages + delta - spool->min_hpages;
1931c5ecae3SMike Kravetz 
1941c5ecae3SMike Kravetz 		spool->rsv_hpages += delta;
1951c5ecae3SMike Kravetz 		if (spool->rsv_hpages > spool->min_hpages)
1961c5ecae3SMike Kravetz 			spool->rsv_hpages = spool->min_hpages;
1971c5ecae3SMike Kravetz 	}
1981c5ecae3SMike Kravetz 
1991c5ecae3SMike Kravetz 	/*
2001c5ecae3SMike Kravetz 	 * If hugetlbfs_put_super couldn't free spool due to an outstanding
2011c5ecae3SMike Kravetz 	 * quota reference, free it now.
2021c5ecae3SMike Kravetz 	 */
20390481622SDavid Gibson 	unlock_or_release_subpool(spool);
2041c5ecae3SMike Kravetz 
2051c5ecae3SMike Kravetz 	return ret;
20690481622SDavid Gibson }
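/*
 * Worked example for hugepage_subpool_put_pages() (illustrative values):
 * consider a subpool with both limits set, min_hpages == 10,
 * used_hpages == 11 and rsv_hpages == 4 before the call.
 *
 *   - put_pages(spool, 3): used_hpages drops to 8, which is below the
 *     minimum, so the freed pages first refill the subpool reserve:
 *     rsv_hpages becomes 7 and 0 is returned (no global reservations
 *     are released).
 *   - With rsv_hpages == 9 instead, put_pages(spool, 3) refills the
 *     reserve only up to min_hpages (rsv_hpages is clamped to 10) and
 *     returns 9 + 3 - 10 = 2 global reservations to be dropped.
 */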
20790481622SDavid Gibson 
20890481622SDavid Gibson static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
20990481622SDavid Gibson {
21090481622SDavid Gibson 	return HUGETLBFS_SB(inode->i_sb)->spool;
21190481622SDavid Gibson }
21290481622SDavid Gibson 
21390481622SDavid Gibson static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
21490481622SDavid Gibson {
215496ad9aaSAl Viro 	return subpool_inode(file_inode(vma->vm_file));
21690481622SDavid Gibson }
21790481622SDavid Gibson 
218e7c4b0bfSAndy Whitcroft /*
21996822904SAndy Whitcroft  * Region tracking -- allows tracking of reservations and instantiated pages
22096822904SAndy Whitcroft  *                    across the pages in a mapping.
22184afd99bSAndy Whitcroft  *
2221dd308a7SMike Kravetz  * The region data structures are embedded into a resv_map and protected
2231dd308a7SMike Kravetz  * by a resv_map's lock.  The set of regions within the resv_map represent
2241dd308a7SMike Kravetz  * reservations for huge pages, or huge pages that have already been
2251dd308a7SMike Kravetz  * instantiated within the map.  The from and to elements are huge page
2261dd308a7SMike Kravetz  * indices into the associated mapping.  from indicates the starting index
2271dd308a7SMike Kravetz  * of the region.  to represents the first index past the end of the region.
2281dd308a7SMike Kravetz  *
2291dd308a7SMike Kravetz  * For example, a file region structure with from == 0 and to == 4 represents
2301dd308a7SMike Kravetz  * four huge pages in a mapping.  It is important to note that the to element
2311dd308a7SMike Kravetz  * represents the first element past the end of the region. This is used in
2321dd308a7SMike Kravetz  * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
2331dd308a7SMike Kravetz  *
2341dd308a7SMike Kravetz  * Interval notation of the form [from, to) will be used to indicate that
2351dd308a7SMike Kravetz  * the endpoint from is inclusive and to is exclusive.
23696822904SAndy Whitcroft  */
23796822904SAndy Whitcroft struct file_region {
23896822904SAndy Whitcroft 	struct list_head link;
23996822904SAndy Whitcroft 	long from;
24096822904SAndy Whitcroft 	long to;
24196822904SAndy Whitcroft };
24296822904SAndy Whitcroft 
2431dd308a7SMike Kravetz /*
2441dd308a7SMike Kravetz  * Add the huge page range represented by [f, t) to the reserve
2455e911373SMike Kravetz  * map.  In the normal case, existing regions will be expanded
2465e911373SMike Kravetz  * to accommodate the specified range.  Sufficient regions should
2475e911373SMike Kravetz  * exist for expansion due to the previous call to region_chg
2485e911373SMike Kravetz  * with the same range.  However, it is possible that region_del
2495e911373SMike Kravetz  * could have been called after region_chg and modified the map
2505e911373SMike Kravetz  * in such a way that no region exists to be expanded.  In this
2515e911373SMike Kravetz  * case, pull a region descriptor from the cache associated with
2525e911373SMike Kravetz  * the map and use that for the new range.
253cf3ad20bSMike Kravetz  *
254cf3ad20bSMike Kravetz  * Return the number of new huge pages added to the map.  This
255cf3ad20bSMike Kravetz  * number is greater than or equal to zero.
2561dd308a7SMike Kravetz  */
2571406ec9bSJoonsoo Kim static long region_add(struct resv_map *resv, long f, long t)
25896822904SAndy Whitcroft {
2591406ec9bSJoonsoo Kim 	struct list_head *head = &resv->regions;
26096822904SAndy Whitcroft 	struct file_region *rg, *nrg, *trg;
261cf3ad20bSMike Kravetz 	long add = 0;
26296822904SAndy Whitcroft 
2637b24d861SDavidlohr Bueso 	spin_lock(&resv->lock);
26496822904SAndy Whitcroft 	/* Locate the region we are either in or before. */
26596822904SAndy Whitcroft 	list_for_each_entry(rg, head, link)
26696822904SAndy Whitcroft 		if (f <= rg->to)
26796822904SAndy Whitcroft 			break;
26896822904SAndy Whitcroft 
2695e911373SMike Kravetz 	/*
2705e911373SMike Kravetz 	 * If no region exists which can be expanded to include the
2715e911373SMike Kravetz 	 * specified range, the list must have been modified by an
2725e911373SMike Kravetz 	 * interleaving call to region_del().  Pull a region descriptor
2735e911373SMike Kravetz 	 * from the cache and use it for this range.
2745e911373SMike Kravetz 	 */
2755e911373SMike Kravetz 	if (&rg->link == head || t < rg->from) {
2765e911373SMike Kravetz 		VM_BUG_ON(resv->region_cache_count <= 0);
2775e911373SMike Kravetz 
2785e911373SMike Kravetz 		resv->region_cache_count--;
2795e911373SMike Kravetz 		nrg = list_first_entry(&resv->region_cache, struct file_region,
2805e911373SMike Kravetz 					link);
2815e911373SMike Kravetz 		list_del(&nrg->link);
2825e911373SMike Kravetz 
2835e911373SMike Kravetz 		nrg->from = f;
2845e911373SMike Kravetz 		nrg->to = t;
2855e911373SMike Kravetz 		list_add(&nrg->link, rg->link.prev);
2865e911373SMike Kravetz 
2875e911373SMike Kravetz 		add += t - f;
2885e911373SMike Kravetz 		goto out_locked;
2895e911373SMike Kravetz 	}
2905e911373SMike Kravetz 
29196822904SAndy Whitcroft 	/* Round our left edge to the current segment if it encloses us. */
29296822904SAndy Whitcroft 	if (f > rg->from)
29396822904SAndy Whitcroft 		f = rg->from;
29496822904SAndy Whitcroft 
29596822904SAndy Whitcroft 	/* Check for and consume any regions we now overlap with. */
29696822904SAndy Whitcroft 	nrg = rg;
29796822904SAndy Whitcroft 	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
29896822904SAndy Whitcroft 		if (&rg->link == head)
29996822904SAndy Whitcroft 			break;
30096822904SAndy Whitcroft 		if (rg->from > t)
30196822904SAndy Whitcroft 			break;
30296822904SAndy Whitcroft 
30396822904SAndy Whitcroft 		/* If this area reaches higher than our range, extend our
30496822904SAndy Whitcroft 		 * range to include it completely.  If this is not the first
30596822904SAndy Whitcroft 		 * area which we intend to reuse, free it. */
30696822904SAndy Whitcroft 		if (rg->to > t)
30796822904SAndy Whitcroft 			t = rg->to;
30896822904SAndy Whitcroft 		if (rg != nrg) {
309cf3ad20bSMike Kravetz 			/* Decrement return value by the deleted range.
310cf3ad20bSMike Kravetz 			 * Another range will span this area, so by the
311cf3ad20bSMike Kravetz 			 * end of the routine add will be >= zero
312cf3ad20bSMike Kravetz 			 */
313cf3ad20bSMike Kravetz 			add -= (rg->to - rg->from);
31496822904SAndy Whitcroft 			list_del(&rg->link);
31596822904SAndy Whitcroft 			kfree(rg);
31696822904SAndy Whitcroft 		}
31796822904SAndy Whitcroft 	}
318cf3ad20bSMike Kravetz 
319cf3ad20bSMike Kravetz 	add += (nrg->from - f);		/* Added to beginning of region */
32096822904SAndy Whitcroft 	nrg->from = f;
321cf3ad20bSMike Kravetz 	add += t - nrg->to;		/* Added to end of region */
32296822904SAndy Whitcroft 	nrg->to = t;
323cf3ad20bSMike Kravetz 
3245e911373SMike Kravetz out_locked:
3255e911373SMike Kravetz 	resv->adds_in_progress--;
3267b24d861SDavidlohr Bueso 	spin_unlock(&resv->lock);
327cf3ad20bSMike Kravetz 	VM_BUG_ON(add < 0);
328cf3ad20bSMike Kravetz 	return add;
32996822904SAndy Whitcroft }
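/*
 * Worked example for region_add() (illustrative): suppose the map already
 * contains the regions [0, 2) and [5, 7) and region_add(resv, 1, 6) is
 * called (after a matching region_chg()).  The first region is expanded
 * to cover the requested range, the overlapping [5, 7) region is merged
 * into it and freed, and the map ends up containing the single region
 * [0, 7).  Coverage grew from 4 huge pages to 7, so 3 is returned.
 */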
33096822904SAndy Whitcroft 
3311dd308a7SMike Kravetz /*
3321dd308a7SMike Kravetz  * Examine the existing reserve map and determine how many
3331dd308a7SMike Kravetz  * huge pages in the specified range [f, t) are NOT currently
3341dd308a7SMike Kravetz  * represented.  This routine is called before a subsequent
3351dd308a7SMike Kravetz  * call to region_add that will actually modify the reserve
3361dd308a7SMike Kravetz  * map to add the specified range [f, t).  region_chg does
3371dd308a7SMike Kravetz  * not change the number of huge pages represented by the
3381dd308a7SMike Kravetz  * map.  However, if the existing regions in the map can not
3391dd308a7SMike Kravetz  * be expanded to represent the new range, a new file_region
3401dd308a7SMike Kravetz  * structure is added to the map as a placeholder.  This is
3411dd308a7SMike Kravetz  * so that the subsequent region_add call will have all the
3421dd308a7SMike Kravetz  * regions it needs and will not fail.
3431dd308a7SMike Kravetz  *
3445e911373SMike Kravetz  * Upon entry, region_chg will also examine the cache of region descriptors
3455e911373SMike Kravetz  * associated with the map.  If there are not enough descriptors cached, one
3465e911373SMike Kravetz  * will be allocated for the in progress add operation.
3475e911373SMike Kravetz  *
3485e911373SMike Kravetz  * Returns the number of huge pages that need to be added to the existing
3495e911373SMike Kravetz  * reservation map for the range [f, t).  This number is greater than or equal to
3505e911373SMike Kravetz  * zero.  -ENOMEM is returned if a new file_region structure or cache entry
3515e911373SMike Kravetz  * is needed and can not be allocated.
3521dd308a7SMike Kravetz  */
3531406ec9bSJoonsoo Kim static long region_chg(struct resv_map *resv, long f, long t)
35496822904SAndy Whitcroft {
3551406ec9bSJoonsoo Kim 	struct list_head *head = &resv->regions;
3567b24d861SDavidlohr Bueso 	struct file_region *rg, *nrg = NULL;
35796822904SAndy Whitcroft 	long chg = 0;
35896822904SAndy Whitcroft 
3597b24d861SDavidlohr Bueso retry:
3607b24d861SDavidlohr Bueso 	spin_lock(&resv->lock);
3615e911373SMike Kravetz retry_locked:
3625e911373SMike Kravetz 	resv->adds_in_progress++;
3635e911373SMike Kravetz 
3645e911373SMike Kravetz 	/*
3655e911373SMike Kravetz 	 * Check for sufficient descriptors in the cache to accommodate
3665e911373SMike Kravetz 	 * the number of in progress add operations.
3675e911373SMike Kravetz 	 */
3685e911373SMike Kravetz 	if (resv->adds_in_progress > resv->region_cache_count) {
3695e911373SMike Kravetz 		struct file_region *trg;
3705e911373SMike Kravetz 
3715e911373SMike Kravetz 		VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
3725e911373SMike Kravetz 		/* Must drop lock to allocate a new descriptor. */
3735e911373SMike Kravetz 		resv->adds_in_progress--;
3745e911373SMike Kravetz 		spin_unlock(&resv->lock);
3755e911373SMike Kravetz 
3765e911373SMike Kravetz 		trg = kmalloc(sizeof(*trg), GFP_KERNEL);
377dbe409e4SMike Kravetz 		if (!trg) {
378dbe409e4SMike Kravetz 			kfree(nrg);
3795e911373SMike Kravetz 			return -ENOMEM;
380dbe409e4SMike Kravetz 		}
3815e911373SMike Kravetz 
3825e911373SMike Kravetz 		spin_lock(&resv->lock);
3835e911373SMike Kravetz 		list_add(&trg->link, &resv->region_cache);
3845e911373SMike Kravetz 		resv->region_cache_count++;
3855e911373SMike Kravetz 		goto retry_locked;
3865e911373SMike Kravetz 	}
3875e911373SMike Kravetz 
38896822904SAndy Whitcroft 	/* Locate the region we are before or in. */
38996822904SAndy Whitcroft 	list_for_each_entry(rg, head, link)
39096822904SAndy Whitcroft 		if (f <= rg->to)
39196822904SAndy Whitcroft 			break;
39296822904SAndy Whitcroft 
39396822904SAndy Whitcroft 	/* If we are below the current region then a new region is required.
39496822904SAndy Whitcroft 	 * Subtle: allocate a new region at the position, but make it zero
39596822904SAndy Whitcroft 	 * size so that we are guaranteed to be able to record the reservation. */
39696822904SAndy Whitcroft 	if (&rg->link == head || t < rg->from) {
3977b24d861SDavidlohr Bueso 		if (!nrg) {
3985e911373SMike Kravetz 			resv->adds_in_progress--;
3997b24d861SDavidlohr Bueso 			spin_unlock(&resv->lock);
40096822904SAndy Whitcroft 			nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
40196822904SAndy Whitcroft 			if (!nrg)
40296822904SAndy Whitcroft 				return -ENOMEM;
4037b24d861SDavidlohr Bueso 
40496822904SAndy Whitcroft 			nrg->from = f;
40596822904SAndy Whitcroft 			nrg->to   = f;
40696822904SAndy Whitcroft 			INIT_LIST_HEAD(&nrg->link);
4077b24d861SDavidlohr Bueso 			goto retry;
4087b24d861SDavidlohr Bueso 		}
40996822904SAndy Whitcroft 
4107b24d861SDavidlohr Bueso 		list_add(&nrg->link, rg->link.prev);
4117b24d861SDavidlohr Bueso 		chg = t - f;
4127b24d861SDavidlohr Bueso 		goto out_nrg;
41396822904SAndy Whitcroft 	}
41496822904SAndy Whitcroft 
41596822904SAndy Whitcroft 	/* Round our left edge to the current segment if it encloses us. */
41696822904SAndy Whitcroft 	if (f > rg->from)
41796822904SAndy Whitcroft 		f = rg->from;
41896822904SAndy Whitcroft 	chg = t - f;
41996822904SAndy Whitcroft 
42096822904SAndy Whitcroft 	/* Check for and consume any regions we now overlap with. */
42196822904SAndy Whitcroft 	list_for_each_entry(rg, rg->link.prev, link) {
42296822904SAndy Whitcroft 		if (&rg->link == head)
42396822904SAndy Whitcroft 			break;
42496822904SAndy Whitcroft 		if (rg->from > t)
4257b24d861SDavidlohr Bueso 			goto out;
42696822904SAndy Whitcroft 
42725985edcSLucas De Marchi 		/* We overlap with this area; if it extends further than
42896822904SAndy Whitcroft 		 * we do, we must extend ourselves.  Account for its
42996822904SAndy Whitcroft 		 * existing reservation. */
43096822904SAndy Whitcroft 		if (rg->to > t) {
43196822904SAndy Whitcroft 			chg += rg->to - t;
43296822904SAndy Whitcroft 			t = rg->to;
43396822904SAndy Whitcroft 		}
43496822904SAndy Whitcroft 		chg -= rg->to - rg->from;
43596822904SAndy Whitcroft 	}
4367b24d861SDavidlohr Bueso 
4377b24d861SDavidlohr Bueso out:
4387b24d861SDavidlohr Bueso 	spin_unlock(&resv->lock);
4397b24d861SDavidlohr Bueso 	/*  We already know we raced and no longer need the new region */
4407b24d861SDavidlohr Bueso 	kfree(nrg);
4417b24d861SDavidlohr Bueso 	return chg;
4427b24d861SDavidlohr Bueso out_nrg:
4437b24d861SDavidlohr Bueso 	spin_unlock(&resv->lock);
44496822904SAndy Whitcroft 	return chg;
44596822904SAndy Whitcroft }
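/*
 * Worked example for region_chg() (illustrative): with the map containing
 * [0, 2) and [5, 7), region_chg(resv, 1, 6) returns 3, because pages 2, 3
 * and 4 in the requested range are not yet represented.  The map coverage
 * itself is not changed; at most a zero-sized placeholder region is added
 * so that the later region_add() cannot fail.
 */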
44696822904SAndy Whitcroft 
4471dd308a7SMike Kravetz /*
4485e911373SMike Kravetz  * Abort the in progress add operation.  The adds_in_progress field
4495e911373SMike Kravetz  * of the resv_map keeps track of the operations in progress between
4505e911373SMike Kravetz  * calls to region_chg and region_add.  Operations are sometimes
4515e911373SMike Kravetz  * aborted after the call to region_chg.  In such cases, region_abort
4525e911373SMike Kravetz  * is called to decrement the adds_in_progress counter.
4535e911373SMike Kravetz  *
4545e911373SMike Kravetz  * NOTE: The range arguments [f, t) are not needed or used in this
4555e911373SMike Kravetz  * routine.  They are kept to make reading the calling code easier as
4565e911373SMike Kravetz  * arguments will match the associated region_chg call.
4575e911373SMike Kravetz  */
4585e911373SMike Kravetz static void region_abort(struct resv_map *resv, long f, long t)
4595e911373SMike Kravetz {
4605e911373SMike Kravetz 	spin_lock(&resv->lock);
4615e911373SMike Kravetz 	VM_BUG_ON(!resv->region_cache_count);
4625e911373SMike Kravetz 	resv->adds_in_progress--;
4635e911373SMike Kravetz 	spin_unlock(&resv->lock);
4645e911373SMike Kravetz }
4655e911373SMike Kravetz 
4665e911373SMike Kravetz /*
467feba16e2SMike Kravetz  * Delete the specified range [f, t) from the reserve map.  If the
468feba16e2SMike Kravetz  * t parameter is LONG_MAX, this indicates that ALL regions after f
469feba16e2SMike Kravetz  * should be deleted.  Locate the regions which intersect [f, t)
470feba16e2SMike Kravetz  * and either trim, delete or split the existing regions.
471feba16e2SMike Kravetz  *
472feba16e2SMike Kravetz  * Returns the number of huge pages deleted from the reserve map.
473feba16e2SMike Kravetz  * In the normal case, the return value is zero or more.  In the
474feba16e2SMike Kravetz  * case where a region must be split, a new region descriptor must
475feba16e2SMike Kravetz  * be allocated.  If the allocation fails, -ENOMEM will be returned.
476feba16e2SMike Kravetz  * NOTE: If the parameter t == LONG_MAX, then we will never split
477feba16e2SMike Kravetz  * a region and possibly return -ENOMEM.  Callers specifying
478feba16e2SMike Kravetz  * t == LONG_MAX do not need to check for -ENOMEM error.
4791dd308a7SMike Kravetz  */
480feba16e2SMike Kravetz static long region_del(struct resv_map *resv, long f, long t)
48196822904SAndy Whitcroft {
4821406ec9bSJoonsoo Kim 	struct list_head *head = &resv->regions;
48396822904SAndy Whitcroft 	struct file_region *rg, *trg;
484feba16e2SMike Kravetz 	struct file_region *nrg = NULL;
485feba16e2SMike Kravetz 	long del = 0;
48696822904SAndy Whitcroft 
487feba16e2SMike Kravetz retry:
4887b24d861SDavidlohr Bueso 	spin_lock(&resv->lock);
489feba16e2SMike Kravetz 	list_for_each_entry_safe(rg, trg, head, link) {
490dbe409e4SMike Kravetz 		/*
491dbe409e4SMike Kravetz 		 * Skip regions before the range to be deleted.  file_region
492dbe409e4SMike Kravetz 		 * ranges are normally of the form [from, to).  However, there
493dbe409e4SMike Kravetz 		 * may be a "placeholder" entry in the map which is of the form
494dbe409e4SMike Kravetz 		 * (from, to) with from == to.  Check for placeholder entries
495dbe409e4SMike Kravetz 		 * at the beginning of the range to be deleted.
496dbe409e4SMike Kravetz 		 */
497dbe409e4SMike Kravetz 		if (rg->to <= f && (rg->to != rg->from || rg->to != f))
498feba16e2SMike Kravetz 			continue;
499dbe409e4SMike Kravetz 
500feba16e2SMike Kravetz 		if (rg->from >= t)
50196822904SAndy Whitcroft 			break;
50296822904SAndy Whitcroft 
503feba16e2SMike Kravetz 		if (f > rg->from && t < rg->to) { /* Must split region */
504feba16e2SMike Kravetz 			/*
505feba16e2SMike Kravetz 			 * Check for an entry in the cache before dropping
506feba16e2SMike Kravetz 			 * lock and attempting allocation.
507feba16e2SMike Kravetz 			 */
508feba16e2SMike Kravetz 			if (!nrg &&
509feba16e2SMike Kravetz 			    resv->region_cache_count > resv->adds_in_progress) {
510feba16e2SMike Kravetz 				nrg = list_first_entry(&resv->region_cache,
511feba16e2SMike Kravetz 							struct file_region,
512feba16e2SMike Kravetz 							link);
513feba16e2SMike Kravetz 				list_del(&nrg->link);
514feba16e2SMike Kravetz 				resv->region_cache_count--;
51596822904SAndy Whitcroft 			}
51696822904SAndy Whitcroft 
517feba16e2SMike Kravetz 			if (!nrg) {
518feba16e2SMike Kravetz 				spin_unlock(&resv->lock);
519feba16e2SMike Kravetz 				nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
520feba16e2SMike Kravetz 				if (!nrg)
521feba16e2SMike Kravetz 					return -ENOMEM;
522feba16e2SMike Kravetz 				goto retry;
523feba16e2SMike Kravetz 			}
524feba16e2SMike Kravetz 
525feba16e2SMike Kravetz 			del += t - f;
526feba16e2SMike Kravetz 
527feba16e2SMike Kravetz 			/* New entry for end of split region */
528feba16e2SMike Kravetz 			nrg->from = t;
529feba16e2SMike Kravetz 			nrg->to = rg->to;
530feba16e2SMike Kravetz 			INIT_LIST_HEAD(&nrg->link);
531feba16e2SMike Kravetz 
532feba16e2SMike Kravetz 			/* Original entry is trimmed */
533feba16e2SMike Kravetz 			rg->to = f;
534feba16e2SMike Kravetz 
535feba16e2SMike Kravetz 			list_add(&nrg->link, &rg->link);
536feba16e2SMike Kravetz 			nrg = NULL;
53796822904SAndy Whitcroft 			break;
538feba16e2SMike Kravetz 		}
539feba16e2SMike Kravetz 
540feba16e2SMike Kravetz 		if (f <= rg->from && t >= rg->to) { /* Remove entire region */
541feba16e2SMike Kravetz 			del += rg->to - rg->from;
54296822904SAndy Whitcroft 			list_del(&rg->link);
54396822904SAndy Whitcroft 			kfree(rg);
544feba16e2SMike Kravetz 			continue;
54596822904SAndy Whitcroft 		}
5467b24d861SDavidlohr Bueso 
547feba16e2SMike Kravetz 		if (f <= rg->from) {	/* Trim beginning of region */
548feba16e2SMike Kravetz 			del += t - rg->from;
549feba16e2SMike Kravetz 			rg->from = t;
550feba16e2SMike Kravetz 		} else {		/* Trim end of region */
551feba16e2SMike Kravetz 			del += rg->to - f;
552feba16e2SMike Kravetz 			rg->to = f;
553feba16e2SMike Kravetz 		}
554feba16e2SMike Kravetz 	}
555feba16e2SMike Kravetz 
5567b24d861SDavidlohr Bueso 	spin_unlock(&resv->lock);
557feba16e2SMike Kravetz 	kfree(nrg);
558feba16e2SMike Kravetz 	return del;
55996822904SAndy Whitcroft }
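/*
 * Worked example for region_del() (illustrative): with the map containing
 * the single region [0, 8), region_del(resv, 2, 5) must split it: the
 * original entry is trimmed to [0, 2), a new entry [5, 8) is inserted
 * (taken from the region cache or freshly allocated), and 3 is returned.
 * A call with t == LONG_MAX, e.g. region_del(resv, 0, LONG_MAX), only
 * trims or removes whole entries and therefore never needs an allocation.
 */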
56096822904SAndy Whitcroft 
5611dd308a7SMike Kravetz /*
562b5cec28dSMike Kravetz  * A rare out of memory error was encountered which prevented removal of
563b5cec28dSMike Kravetz  * the reserve map region for a page.  The huge page itself was freed
564b5cec28dSMike Kravetz  * and removed from the page cache.  This routine will adjust the subpool
565b5cec28dSMike Kravetz  * usage count, and the global reserve count if needed.  By incrementing
566b5cec28dSMike Kravetz  * these counts, the reserve map entry which could not be deleted will
567b5cec28dSMike Kravetz  * appear as a "reserved" entry instead of simply dangling with incorrect
568b5cec28dSMike Kravetz  * counts.
569b5cec28dSMike Kravetz  */
570b5cec28dSMike Kravetz void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
571b5cec28dSMike Kravetz {
572b5cec28dSMike Kravetz 	struct hugepage_subpool *spool = subpool_inode(inode);
573b5cec28dSMike Kravetz 	long rsv_adjust;
574b5cec28dSMike Kravetz 
575b5cec28dSMike Kravetz 	rsv_adjust = hugepage_subpool_get_pages(spool, 1);
576b5cec28dSMike Kravetz 	if (restore_reserve && rsv_adjust) {
577b5cec28dSMike Kravetz 		struct hstate *h = hstate_inode(inode);
578b5cec28dSMike Kravetz 
579b5cec28dSMike Kravetz 		hugetlb_acct_memory(h, 1);
580b5cec28dSMike Kravetz 	}
581b5cec28dSMike Kravetz }
582b5cec28dSMike Kravetz 
583b5cec28dSMike Kravetz /*
5841dd308a7SMike Kravetz  * Count and return the number of huge pages in the reserve map
5851dd308a7SMike Kravetz  * that intersect with the range [f, t).
5861dd308a7SMike Kravetz  */
5871406ec9bSJoonsoo Kim static long region_count(struct resv_map *resv, long f, long t)
58884afd99bSAndy Whitcroft {
5891406ec9bSJoonsoo Kim 	struct list_head *head = &resv->regions;
59084afd99bSAndy Whitcroft 	struct file_region *rg;
59184afd99bSAndy Whitcroft 	long chg = 0;
59284afd99bSAndy Whitcroft 
5937b24d861SDavidlohr Bueso 	spin_lock(&resv->lock);
59484afd99bSAndy Whitcroft 	/* Locate each segment we overlap with, and count that overlap. */
59584afd99bSAndy Whitcroft 	list_for_each_entry(rg, head, link) {
596f2135a4aSWang Sheng-Hui 		long seg_from;
597f2135a4aSWang Sheng-Hui 		long seg_to;
59884afd99bSAndy Whitcroft 
59984afd99bSAndy Whitcroft 		if (rg->to <= f)
60084afd99bSAndy Whitcroft 			continue;
60184afd99bSAndy Whitcroft 		if (rg->from >= t)
60284afd99bSAndy Whitcroft 			break;
60384afd99bSAndy Whitcroft 
60484afd99bSAndy Whitcroft 		seg_from = max(rg->from, f);
60584afd99bSAndy Whitcroft 		seg_to = min(rg->to, t);
60684afd99bSAndy Whitcroft 
60784afd99bSAndy Whitcroft 		chg += seg_to - seg_from;
60884afd99bSAndy Whitcroft 	}
6097b24d861SDavidlohr Bueso 	spin_unlock(&resv->lock);
61084afd99bSAndy Whitcroft 
61184afd99bSAndy Whitcroft 	return chg;
61284afd99bSAndy Whitcroft }
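/*
 * Worked example for region_count() (illustrative): with the map
 * containing [0, 2) and [5, 8), region_count(resv, 1, 6) returns
 * (2 - 1) + (6 - 5) = 2, the number of pages in [1, 6) that the
 * existing regions cover.
 */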
61384afd99bSAndy Whitcroft 
61496822904SAndy Whitcroft /*
615e7c4b0bfSAndy Whitcroft  * Convert the address within this vma to the page offset within
616e7c4b0bfSAndy Whitcroft  * the mapping, in pagecache page units; huge pages here.
617e7c4b0bfSAndy Whitcroft  */
618a5516438SAndi Kleen static pgoff_t vma_hugecache_offset(struct hstate *h,
619a5516438SAndi Kleen 			struct vm_area_struct *vma, unsigned long address)
620e7c4b0bfSAndy Whitcroft {
621a5516438SAndi Kleen 	return ((address - vma->vm_start) >> huge_page_shift(h)) +
622a5516438SAndi Kleen 			(vma->vm_pgoff >> huge_page_order(h));
623e7c4b0bfSAndy Whitcroft }
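/*
 * Worked example (illustrative, assuming 2 MB huge pages and 4 KB base
 * pages, i.e. huge_page_shift() == 21 and huge_page_order() == 9): for a
 * VMA with vm_start == 0x40000000 and vm_pgoff == 0x200 (the mapping
 * starts at huge page index 1 of the file), an address 4 MB into the VMA
 * yields (0x400000 >> 21) + (0x200 >> 9) == 2 + 1 == 3, i.e. the fourth
 * huge page of the file.
 */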
624e7c4b0bfSAndy Whitcroft 
6250fe6e20bSNaoya Horiguchi pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
6260fe6e20bSNaoya Horiguchi 				     unsigned long address)
6270fe6e20bSNaoya Horiguchi {
6280fe6e20bSNaoya Horiguchi 	return vma_hugecache_offset(hstate_vma(vma), vma, address);
6290fe6e20bSNaoya Horiguchi }
630dee41079SDan Williams EXPORT_SYMBOL_GPL(linear_hugepage_index);
6310fe6e20bSNaoya Horiguchi 
63284afd99bSAndy Whitcroft /*
63308fba699SMel Gorman  * Return the size of the pages allocated when backing a VMA. In the majority
63408fba699SMel Gorman  * of cases this will be the same size as that used by the page table entries.
63508fba699SMel Gorman  */
63608fba699SMel Gorman unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
63708fba699SMel Gorman {
63808fba699SMel Gorman 	struct hstate *hstate;
63908fba699SMel Gorman 
64008fba699SMel Gorman 	if (!is_vm_hugetlb_page(vma))
64108fba699SMel Gorman 		return PAGE_SIZE;
64208fba699SMel Gorman 
64308fba699SMel Gorman 	hstate = hstate_vma(vma);
64408fba699SMel Gorman 
6452415cf12SWanpeng Li 	return 1UL << huge_page_shift(hstate);
64608fba699SMel Gorman }
647f340ca0fSJoerg Roedel EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
64808fba699SMel Gorman 
64908fba699SMel Gorman /*
6503340289dSMel Gorman  * Return the page size being used by the MMU to back a VMA. In the majority
6513340289dSMel Gorman  * of cases, the page size used by the kernel matches the MMU size. On
6523340289dSMel Gorman  * architectures where it differs, an architecture-specific version of this
6533340289dSMel Gorman  * function is required.
6543340289dSMel Gorman  */
6553340289dSMel Gorman #ifndef vma_mmu_pagesize
6563340289dSMel Gorman unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
6573340289dSMel Gorman {
6583340289dSMel Gorman 	return vma_kernel_pagesize(vma);
6593340289dSMel Gorman }
6603340289dSMel Gorman #endif
6613340289dSMel Gorman 
6623340289dSMel Gorman /*
66384afd99bSAndy Whitcroft  * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
66484afd99bSAndy Whitcroft  * bits of the reservation map pointer, which are always clear due to
66584afd99bSAndy Whitcroft  * alignment.
66684afd99bSAndy Whitcroft  */
66784afd99bSAndy Whitcroft #define HPAGE_RESV_OWNER    (1UL << 0)
66884afd99bSAndy Whitcroft #define HPAGE_RESV_UNMAPPED (1UL << 1)
66904f2cbe3SMel Gorman #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
67084afd99bSAndy Whitcroft 
671a1e78772SMel Gorman /*
672a1e78772SMel Gorman  * These helpers are used to track how many pages are reserved for
673a1e78772SMel Gorman  * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
674a1e78772SMel Gorman  * is guaranteed to have its future faults succeed.
675a1e78772SMel Gorman  *
676a1e78772SMel Gorman  * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
677a1e78772SMel Gorman  * the reserve counters are updated with the hugetlb_lock held. It is safe
678a1e78772SMel Gorman  * to reset the VMA at fork() time as it is not in use yet and there is no
679a1e78772SMel Gorman  * chance of the global counters getting corrupted as a result of the values.
68084afd99bSAndy Whitcroft  *
68184afd99bSAndy Whitcroft  * The private mapping reservation is represented in a subtly different
68284afd99bSAndy Whitcroft  * manner to a shared mapping.  A shared mapping has a region map associated
68384afd99bSAndy Whitcroft  * with the underlying file; this region map represents the backing file
68484afd99bSAndy Whitcroft  * pages which have ever had a reservation assigned, and this persists even
68584afd99bSAndy Whitcroft  * after the page is instantiated.  A private mapping has a region map
68684afd99bSAndy Whitcroft  * associated with the original mmap which is attached to all VMAs which
68784afd99bSAndy Whitcroft  * reference it; this region map represents those offsets which have consumed
68884afd99bSAndy Whitcroft  * a reservation, i.e. where pages have been instantiated.
689a1e78772SMel Gorman  */
690e7c4b0bfSAndy Whitcroft static unsigned long get_vma_private_data(struct vm_area_struct *vma)
691e7c4b0bfSAndy Whitcroft {
692e7c4b0bfSAndy Whitcroft 	return (unsigned long)vma->vm_private_data;
693e7c4b0bfSAndy Whitcroft }
694e7c4b0bfSAndy Whitcroft 
695e7c4b0bfSAndy Whitcroft static void set_vma_private_data(struct vm_area_struct *vma,
696e7c4b0bfSAndy Whitcroft 							unsigned long value)
697e7c4b0bfSAndy Whitcroft {
698e7c4b0bfSAndy Whitcroft 	vma->vm_private_data = (void *)value;
699e7c4b0bfSAndy Whitcroft }
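/*
 * Illustrative note: because a struct resv_map is at least pointer
 * aligned, the two low bits of the value stored above are normally zero,
 * which is what allows HPAGE_RESV_OWNER and HPAGE_RESV_UNMAPPED to be
 * packed into the same word.  For example, storing
 * (unsigned long)map | HPAGE_RESV_OWNER records both the reserve map and
 * the "owner" flag; masking with ~HPAGE_RESV_MASK recovers the pointer
 * and masking with HPAGE_RESV_OWNER tests the flag, as the helpers
 * below do.
 */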
700e7c4b0bfSAndy Whitcroft 
7019119a41eSJoonsoo Kim struct resv_map *resv_map_alloc(void)
70284afd99bSAndy Whitcroft {
70384afd99bSAndy Whitcroft 	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
7045e911373SMike Kravetz 	struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
7055e911373SMike Kravetz 
7065e911373SMike Kravetz 	if (!resv_map || !rg) {
7075e911373SMike Kravetz 		kfree(resv_map);
7085e911373SMike Kravetz 		kfree(rg);
70984afd99bSAndy Whitcroft 		return NULL;
7105e911373SMike Kravetz 	}
71184afd99bSAndy Whitcroft 
71284afd99bSAndy Whitcroft 	kref_init(&resv_map->refs);
7137b24d861SDavidlohr Bueso 	spin_lock_init(&resv_map->lock);
71484afd99bSAndy Whitcroft 	INIT_LIST_HEAD(&resv_map->regions);
71584afd99bSAndy Whitcroft 
7165e911373SMike Kravetz 	resv_map->adds_in_progress = 0;
7175e911373SMike Kravetz 
7185e911373SMike Kravetz 	INIT_LIST_HEAD(&resv_map->region_cache);
7195e911373SMike Kravetz 	list_add(&rg->link, &resv_map->region_cache);
7205e911373SMike Kravetz 	resv_map->region_cache_count = 1;
7215e911373SMike Kravetz 
72284afd99bSAndy Whitcroft 	return resv_map;
72384afd99bSAndy Whitcroft }
72484afd99bSAndy Whitcroft 
7259119a41eSJoonsoo Kim void resv_map_release(struct kref *ref)
72684afd99bSAndy Whitcroft {
72784afd99bSAndy Whitcroft 	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
7285e911373SMike Kravetz 	struct list_head *head = &resv_map->region_cache;
7295e911373SMike Kravetz 	struct file_region *rg, *trg;
73084afd99bSAndy Whitcroft 
73184afd99bSAndy Whitcroft 	/* Clear out any active regions before we release the map. */
732feba16e2SMike Kravetz 	region_del(resv_map, 0, LONG_MAX);
7335e911373SMike Kravetz 
7345e911373SMike Kravetz 	/* ... and any entries left in the cache */
7355e911373SMike Kravetz 	list_for_each_entry_safe(rg, trg, head, link) {
7365e911373SMike Kravetz 		list_del(&rg->link);
7375e911373SMike Kravetz 		kfree(rg);
7385e911373SMike Kravetz 	}
7395e911373SMike Kravetz 
7405e911373SMike Kravetz 	VM_BUG_ON(resv_map->adds_in_progress);
7415e911373SMike Kravetz 
74284afd99bSAndy Whitcroft 	kfree(resv_map);
74384afd99bSAndy Whitcroft }
74484afd99bSAndy Whitcroft 
7454e35f483SJoonsoo Kim static inline struct resv_map *inode_resv_map(struct inode *inode)
7464e35f483SJoonsoo Kim {
7474e35f483SJoonsoo Kim 	return inode->i_mapping->private_data;
7484e35f483SJoonsoo Kim }
7494e35f483SJoonsoo Kim 
75084afd99bSAndy Whitcroft static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
751a1e78772SMel Gorman {
75281d1b09cSSasha Levin 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
7534e35f483SJoonsoo Kim 	if (vma->vm_flags & VM_MAYSHARE) {
7544e35f483SJoonsoo Kim 		struct address_space *mapping = vma->vm_file->f_mapping;
7554e35f483SJoonsoo Kim 		struct inode *inode = mapping->host;
7564e35f483SJoonsoo Kim 
7574e35f483SJoonsoo Kim 		return inode_resv_map(inode);
7584e35f483SJoonsoo Kim 
7594e35f483SJoonsoo Kim 	} else {
76084afd99bSAndy Whitcroft 		return (struct resv_map *)(get_vma_private_data(vma) &
76184afd99bSAndy Whitcroft 							~HPAGE_RESV_MASK);
7624e35f483SJoonsoo Kim 	}
763a1e78772SMel Gorman }
764a1e78772SMel Gorman 
76584afd99bSAndy Whitcroft static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
766a1e78772SMel Gorman {
76781d1b09cSSasha Levin 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
76881d1b09cSSasha Levin 	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
769a1e78772SMel Gorman 
77084afd99bSAndy Whitcroft 	set_vma_private_data(vma, (get_vma_private_data(vma) &
77184afd99bSAndy Whitcroft 				HPAGE_RESV_MASK) | (unsigned long)map);
77204f2cbe3SMel Gorman }
77304f2cbe3SMel Gorman 
77404f2cbe3SMel Gorman static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
77504f2cbe3SMel Gorman {
77681d1b09cSSasha Levin 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
77781d1b09cSSasha Levin 	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
778e7c4b0bfSAndy Whitcroft 
779e7c4b0bfSAndy Whitcroft 	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
78004f2cbe3SMel Gorman }
78104f2cbe3SMel Gorman 
78204f2cbe3SMel Gorman static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
78304f2cbe3SMel Gorman {
78481d1b09cSSasha Levin 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
785e7c4b0bfSAndy Whitcroft 
786e7c4b0bfSAndy Whitcroft 	return (get_vma_private_data(vma) & flag) != 0;
787a1e78772SMel Gorman }
788a1e78772SMel Gorman 
78904f2cbe3SMel Gorman /* Reset counters to 0 and clear all HPAGE_RESV_* flags */
790a1e78772SMel Gorman void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
791a1e78772SMel Gorman {
79281d1b09cSSasha Levin 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
793f83a275dSMel Gorman 	if (!(vma->vm_flags & VM_MAYSHARE))
794a1e78772SMel Gorman 		vma->vm_private_data = (void *)0;
795a1e78772SMel Gorman }
796a1e78772SMel Gorman 
797a1e78772SMel Gorman /* Returns true if the VMA has associated reserve pages */
798559ec2f8SNicholas Krause static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
799a1e78772SMel Gorman {
800af0ed73eSJoonsoo Kim 	if (vma->vm_flags & VM_NORESERVE) {
801af0ed73eSJoonsoo Kim 		/*
802af0ed73eSJoonsoo Kim 		 * This address is already reserved by another process (chg == 0),
803af0ed73eSJoonsoo Kim 		 * so we should decrement the reserved count. Without decrementing,
804af0ed73eSJoonsoo Kim 		 * reserve count remains after releasing inode, because this
805af0ed73eSJoonsoo Kim 		 * allocated page will go into page cache and is regarded as
806af0ed73eSJoonsoo Kim 		 * coming from reserved pool in releasing step.  Currently, we
807af0ed73eSJoonsoo Kim 		 * don't have any other solution to deal with this situation
808af0ed73eSJoonsoo Kim 		 * properly, so add work-around here.
809af0ed73eSJoonsoo Kim 		 */
810af0ed73eSJoonsoo Kim 		if (vma->vm_flags & VM_MAYSHARE && chg == 0)
811559ec2f8SNicholas Krause 			return true;
812af0ed73eSJoonsoo Kim 		else
813559ec2f8SNicholas Krause 			return false;
814af0ed73eSJoonsoo Kim 	}
815a63884e9SJoonsoo Kim 
816a63884e9SJoonsoo Kim 	/* Shared mappings always use reserves */
8171fb1b0e9SMike Kravetz 	if (vma->vm_flags & VM_MAYSHARE) {
8181fb1b0e9SMike Kravetz 		/*
8191fb1b0e9SMike Kravetz 		 * We know VM_NORESERVE is not set.  Therefore, there SHOULD
8201fb1b0e9SMike Kravetz 		 * be a region map for all pages.  The only situation where
8211fb1b0e9SMike Kravetz 		 * there is no region map is if a hole was punched via
8221fb1b0e9SMike Kravetz 		 * fallocate.  In this case, there really are no reserves to
8231fb1b0e9SMike Kravetz 		 * use.  This situation is indicated if chg != 0.
8241fb1b0e9SMike Kravetz 		 */
8251fb1b0e9SMike Kravetz 		if (chg)
8261fb1b0e9SMike Kravetz 			return false;
8271fb1b0e9SMike Kravetz 		else
828559ec2f8SNicholas Krause 			return true;
8291fb1b0e9SMike Kravetz 	}
830a63884e9SJoonsoo Kim 
831a63884e9SJoonsoo Kim 	/*
832a63884e9SJoonsoo Kim 	 * Only the process that called mmap() has reserves for
833a63884e9SJoonsoo Kim 	 * private mappings.
834a63884e9SJoonsoo Kim 	 */
83567961f9dSMike Kravetz 	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
83667961f9dSMike Kravetz 		/*
83767961f9dSMike Kravetz 		 * Like the shared case above, a hole punch or truncate
83867961f9dSMike Kravetz 		 * could have been performed on the private mapping.
83967961f9dSMike Kravetz 		 * Examine the value of chg to determine if reserves
84067961f9dSMike Kravetz 		 * actually exist or were previously consumed.
84167961f9dSMike Kravetz 		 * Very Subtle - The value of chg comes from a previous
84267961f9dSMike Kravetz 		 * call to vma_needs_reserves().  The reserve map for
84367961f9dSMike Kravetz 		 * private mappings has different (opposite) semantics
84467961f9dSMike Kravetz 		 * than that of shared mappings.  vma_needs_reserves()
84567961f9dSMike Kravetz 		 * has already taken this difference in semantics into
84667961f9dSMike Kravetz 		 * account.  Therefore, the meaning of chg is the same
84767961f9dSMike Kravetz 		 * as in the shared case above.  Code could easily be
84867961f9dSMike Kravetz 		 * combined, but keeping it separate draws attention to
84967961f9dSMike Kravetz 		 * subtle differences.
85067961f9dSMike Kravetz 		 */
85167961f9dSMike Kravetz 		if (chg)
85267961f9dSMike Kravetz 			return false;
85367961f9dSMike Kravetz 		else
854559ec2f8SNicholas Krause 			return true;
85567961f9dSMike Kravetz 	}
856a63884e9SJoonsoo Kim 
857559ec2f8SNicholas Krause 	return false;
858a1e78772SMel Gorman }
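/*
 * Summary of vma_has_reserves() (illustrative, restating the cases above):
 *
 *	VM_NORESERVE:		true only for a shared mapping with chg == 0
 *	VM_MAYSHARE:		true if chg == 0, false if a hole was
 *				punched (chg != 0)
 *	HPAGE_RESV_OWNER:	same rule for the private owner, true only
 *				if chg == 0
 *	otherwise:		false
 */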
859a1e78772SMel Gorman 
860a5516438SAndi Kleen static void enqueue_huge_page(struct hstate *h, struct page *page)
8611da177e4SLinus Torvalds {
8621da177e4SLinus Torvalds 	int nid = page_to_nid(page);
8630edaecfaSAneesh Kumar K.V 	list_move(&page->lru, &h->hugepage_freelists[nid]);
864a5516438SAndi Kleen 	h->free_huge_pages++;
865a5516438SAndi Kleen 	h->free_huge_pages_node[nid]++;
8661da177e4SLinus Torvalds }
8671da177e4SLinus Torvalds 
868bf50bab2SNaoya Horiguchi static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
869bf50bab2SNaoya Horiguchi {
870bf50bab2SNaoya Horiguchi 	struct page *page;
871bf50bab2SNaoya Horiguchi 
872c8721bbbSNaoya Horiguchi 	list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
873c8721bbbSNaoya Horiguchi 		if (!is_migrate_isolate_page(page))
874c8721bbbSNaoya Horiguchi 			break;
875c8721bbbSNaoya Horiguchi 	/*
876c8721bbbSNaoya Horiguchi 	 * if no non-isolated free hugepage is found on the list,
877c8721bbbSNaoya Horiguchi 	 * the allocation fails.
878c8721bbbSNaoya Horiguchi 	 */
879c8721bbbSNaoya Horiguchi 	if (&h->hugepage_freelists[nid] == &page->lru)
880bf50bab2SNaoya Horiguchi 		return NULL;
8810edaecfaSAneesh Kumar K.V 	list_move(&page->lru, &h->hugepage_activelist);
882a9869b83SNaoya Horiguchi 	set_page_refcounted(page);
883bf50bab2SNaoya Horiguchi 	h->free_huge_pages--;
884bf50bab2SNaoya Horiguchi 	h->free_huge_pages_node[nid]--;
885bf50bab2SNaoya Horiguchi 	return page;
886bf50bab2SNaoya Horiguchi }
887bf50bab2SNaoya Horiguchi 
88886cdb465SNaoya Horiguchi /* Movability of hugepages depends on migration support. */
88986cdb465SNaoya Horiguchi static inline gfp_t htlb_alloc_mask(struct hstate *h)
89086cdb465SNaoya Horiguchi {
891100873d7SNaoya Horiguchi 	if (hugepages_treat_as_movable || hugepage_migration_supported(h))
89286cdb465SNaoya Horiguchi 		return GFP_HIGHUSER_MOVABLE;
89386cdb465SNaoya Horiguchi 	else
89486cdb465SNaoya Horiguchi 		return GFP_HIGHUSER;
89586cdb465SNaoya Horiguchi }
89686cdb465SNaoya Horiguchi 
897a5516438SAndi Kleen static struct page *dequeue_huge_page_vma(struct hstate *h,
898a5516438SAndi Kleen 				struct vm_area_struct *vma,
899af0ed73eSJoonsoo Kim 				unsigned long address, int avoid_reserve,
900af0ed73eSJoonsoo Kim 				long chg)
9011da177e4SLinus Torvalds {
902b1c12cbcSKonstantin Khlebnikov 	struct page *page = NULL;
903480eccf9SLee Schermerhorn 	struct mempolicy *mpol;
90419770b32SMel Gorman 	nodemask_t *nodemask;
905c0ff7453SMiao Xie 	struct zonelist *zonelist;
906dd1a239fSMel Gorman 	struct zone *zone;
907dd1a239fSMel Gorman 	struct zoneref *z;
908cc9a6c87SMel Gorman 	unsigned int cpuset_mems_cookie;
9091da177e4SLinus Torvalds 
910a1e78772SMel Gorman 	/*
911a1e78772SMel Gorman 	 * A child process with MAP_PRIVATE mappings created by its parent
912a1e78772SMel Gorman 	 * has no page reserves. This check ensures that reservations are
913a1e78772SMel Gorman 	 * not "stolen". The child may still get SIGKILLed
914a1e78772SMel Gorman 	 */
915af0ed73eSJoonsoo Kim 	if (!vma_has_reserves(vma, chg) &&
916a5516438SAndi Kleen 			h->free_huge_pages - h->resv_huge_pages == 0)
917c0ff7453SMiao Xie 		goto err;
918a1e78772SMel Gorman 
91904f2cbe3SMel Gorman 	/* If reserves cannot be used, ensure enough pages are in the pool */
920a5516438SAndi Kleen 	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
9216eab04a8SJustin P. Mattock 		goto err;
92204f2cbe3SMel Gorman 
9239966c4bbSJoonsoo Kim retry_cpuset:
924d26914d1SMel Gorman 	cpuset_mems_cookie = read_mems_allowed_begin();
9259966c4bbSJoonsoo Kim 	zonelist = huge_zonelist(vma, address,
92686cdb465SNaoya Horiguchi 					htlb_alloc_mask(h), &mpol, &nodemask);
9279966c4bbSJoonsoo Kim 
92819770b32SMel Gorman 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
92919770b32SMel Gorman 						MAX_NR_ZONES - 1, nodemask) {
930344736f2SVladimir Davydov 		if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) {
931bf50bab2SNaoya Horiguchi 			page = dequeue_huge_page_node(h, zone_to_nid(zone));
932bf50bab2SNaoya Horiguchi 			if (page) {
933af0ed73eSJoonsoo Kim 				if (avoid_reserve)
934af0ed73eSJoonsoo Kim 					break;
935af0ed73eSJoonsoo Kim 				if (!vma_has_reserves(vma, chg))
936af0ed73eSJoonsoo Kim 					break;
937af0ed73eSJoonsoo Kim 
93807443a85SJoonsoo Kim 				SetPagePrivate(page);
939a63884e9SJoonsoo Kim 				h->resv_huge_pages--;
9405ab3ee7bSKen Chen 				break;
9411da177e4SLinus Torvalds 			}
9423abf7afdSAndrew Morton 		}
943bf50bab2SNaoya Horiguchi 	}
944cc9a6c87SMel Gorman 
945cc9a6c87SMel Gorman 	mpol_cond_put(mpol);
946d26914d1SMel Gorman 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
947cc9a6c87SMel Gorman 		goto retry_cpuset;
948cc9a6c87SMel Gorman 	return page;
949cc9a6c87SMel Gorman 
950c0ff7453SMiao Xie err:
951cc9a6c87SMel Gorman 	return NULL;
9521da177e4SLinus Torvalds }
9531da177e4SLinus Torvalds 
9541cac6f2cSLuiz Capitulino /*
9551cac6f2cSLuiz Capitulino  * common helper functions for hstate_next_node_to_{alloc|free}.
9561cac6f2cSLuiz Capitulino  * We may have allocated or freed a huge page based on a different
9571cac6f2cSLuiz Capitulino  * nodes_allowed previously, so h->next_nid_to_{alloc|free} might
9581cac6f2cSLuiz Capitulino  * be outside of *nodes_allowed.  Ensure that we use an allowed
9591cac6f2cSLuiz Capitulino  * node for alloc or free.
9601cac6f2cSLuiz Capitulino  */
9611cac6f2cSLuiz Capitulino static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
9621cac6f2cSLuiz Capitulino {
9630edaf86cSAndrew Morton 	nid = next_node_in(nid, *nodes_allowed);
9641cac6f2cSLuiz Capitulino 	VM_BUG_ON(nid >= MAX_NUMNODES);
9651cac6f2cSLuiz Capitulino 
9661cac6f2cSLuiz Capitulino 	return nid;
9671cac6f2cSLuiz Capitulino }
9681cac6f2cSLuiz Capitulino 
9691cac6f2cSLuiz Capitulino static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
9701cac6f2cSLuiz Capitulino {
9711cac6f2cSLuiz Capitulino 	if (!node_isset(nid, *nodes_allowed))
9721cac6f2cSLuiz Capitulino 		nid = next_node_allowed(nid, nodes_allowed);
9731cac6f2cSLuiz Capitulino 	return nid;
9741cac6f2cSLuiz Capitulino }
9751cac6f2cSLuiz Capitulino 
9761cac6f2cSLuiz Capitulino /*
9771cac6f2cSLuiz Capitulino  * returns the previously saved node ["this node"] from which to
9781cac6f2cSLuiz Capitulino  * allocate a persistent huge page for the pool and advance the
9791cac6f2cSLuiz Capitulino  * next node from which to allocate, handling wrap at end of node
9801cac6f2cSLuiz Capitulino  * mask.
9811cac6f2cSLuiz Capitulino  */
9821cac6f2cSLuiz Capitulino static int hstate_next_node_to_alloc(struct hstate *h,
9831cac6f2cSLuiz Capitulino 					nodemask_t *nodes_allowed)
9841cac6f2cSLuiz Capitulino {
9851cac6f2cSLuiz Capitulino 	int nid;
9861cac6f2cSLuiz Capitulino 
9871cac6f2cSLuiz Capitulino 	VM_BUG_ON(!nodes_allowed);
9881cac6f2cSLuiz Capitulino 
9891cac6f2cSLuiz Capitulino 	nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
9901cac6f2cSLuiz Capitulino 	h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
9911cac6f2cSLuiz Capitulino 
9921cac6f2cSLuiz Capitulino 	return nid;
9931cac6f2cSLuiz Capitulino }
9941cac6f2cSLuiz Capitulino 
9951cac6f2cSLuiz Capitulino /*
9961cac6f2cSLuiz Capitulino  * helper for free_pool_huge_page() - return the previously saved
9971cac6f2cSLuiz Capitulino  * node ["this node"] from which to free a huge page.  Advance the
9981cac6f2cSLuiz Capitulino  * next node id whether or not we find a free huge page to free so
9991cac6f2cSLuiz Capitulino  * that the next attempt to free addresses the next node.
10001cac6f2cSLuiz Capitulino  */
10011cac6f2cSLuiz Capitulino static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
10021cac6f2cSLuiz Capitulino {
10031cac6f2cSLuiz Capitulino 	int nid;
10041cac6f2cSLuiz Capitulino 
10051cac6f2cSLuiz Capitulino 	VM_BUG_ON(!nodes_allowed);
10061cac6f2cSLuiz Capitulino 
10071cac6f2cSLuiz Capitulino 	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
10081cac6f2cSLuiz Capitulino 	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
10091cac6f2cSLuiz Capitulino 
10101cac6f2cSLuiz Capitulino 	return nid;
10111cac6f2cSLuiz Capitulino }
10121cac6f2cSLuiz Capitulino 
10131cac6f2cSLuiz Capitulino #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)		\
10141cac6f2cSLuiz Capitulino 	for (nr_nodes = nodes_weight(*mask);				\
10151cac6f2cSLuiz Capitulino 		nr_nodes > 0 &&						\
10161cac6f2cSLuiz Capitulino 		((node = hstate_next_node_to_alloc(hs, mask)) || 1);	\
10171cac6f2cSLuiz Capitulino 		nr_nodes--)
10181cac6f2cSLuiz Capitulino 
10191cac6f2cSLuiz Capitulino #define for_each_node_mask_to_free(hs, nr_nodes, node, mask)		\
10201cac6f2cSLuiz Capitulino 	for (nr_nodes = nodes_weight(*mask);				\
10211cac6f2cSLuiz Capitulino 		nr_nodes > 0 &&						\
10221cac6f2cSLuiz Capitulino 		((node = hstate_next_node_to_free(hs, mask)) || 1);	\
10231cac6f2cSLuiz Capitulino 		nr_nodes--)
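/*
 * Illustrative usage of the iterators above (a sketch; try_alloc_on_node()
 * is a stand-in for a per-node allocation attempt, not a function defined
 * in this file): walk the allowed nodes round-robin, starting at the
 * hstate's saved position, and stop at the first node that succeeds:
 *
 *	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
 *		page = try_alloc_on_node(h, node);
 *		if (page)
 *			break;
 *	}
 */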
10241cac6f2cSLuiz Capitulino 
1025d08de8e2SGerald Schaefer #if (defined(CONFIG_X86_64) || defined(CONFIG_S390)) && \
1026d08de8e2SGerald Schaefer 	((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \
1027d08de8e2SGerald Schaefer 	defined(CONFIG_CMA))
1028944d9fecSLuiz Capitulino static void destroy_compound_gigantic_page(struct page *page,
1029d00181b9SKirill A. Shutemov 					unsigned int order)
1030944d9fecSLuiz Capitulino {
1031944d9fecSLuiz Capitulino 	int i;
1032944d9fecSLuiz Capitulino 	int nr_pages = 1 << order;
1033944d9fecSLuiz Capitulino 	struct page *p = page + 1;
1034944d9fecSLuiz Capitulino 
1035c8cc708aSGerald Schaefer 	atomic_set(compound_mapcount_ptr(page), 0);
1036944d9fecSLuiz Capitulino 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
10371d798ca3SKirill A. Shutemov 		clear_compound_head(p);
1038944d9fecSLuiz Capitulino 		set_page_refcounted(p);
1039944d9fecSLuiz Capitulino 	}
1040944d9fecSLuiz Capitulino 
1041944d9fecSLuiz Capitulino 	set_compound_order(page, 0);
1042944d9fecSLuiz Capitulino 	__ClearPageHead(page);
1043944d9fecSLuiz Capitulino }
1044944d9fecSLuiz Capitulino 
1045d00181b9SKirill A. Shutemov static void free_gigantic_page(struct page *page, unsigned int order)
1046944d9fecSLuiz Capitulino {
1047944d9fecSLuiz Capitulino 	free_contig_range(page_to_pfn(page), 1 << order);
1048944d9fecSLuiz Capitulino }
1049944d9fecSLuiz Capitulino 
1050944d9fecSLuiz Capitulino static int __alloc_gigantic_page(unsigned long start_pfn,
1051944d9fecSLuiz Capitulino 				unsigned long nr_pages)
1052944d9fecSLuiz Capitulino {
1053944d9fecSLuiz Capitulino 	unsigned long end_pfn = start_pfn + nr_pages;
1054944d9fecSLuiz Capitulino 	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1055944d9fecSLuiz Capitulino }
1056944d9fecSLuiz Capitulino 
1057f44b2ddaSJoonsoo Kim static bool pfn_range_valid_gigantic(struct zone *z,
1058f44b2ddaSJoonsoo Kim 			unsigned long start_pfn, unsigned long nr_pages)
1059944d9fecSLuiz Capitulino {
1060944d9fecSLuiz Capitulino 	unsigned long i, end_pfn = start_pfn + nr_pages;
1061944d9fecSLuiz Capitulino 	struct page *page;
1062944d9fecSLuiz Capitulino 
1063944d9fecSLuiz Capitulino 	for (i = start_pfn; i < end_pfn; i++) {
1064944d9fecSLuiz Capitulino 		if (!pfn_valid(i))
1065944d9fecSLuiz Capitulino 			return false;
1066944d9fecSLuiz Capitulino 
1067944d9fecSLuiz Capitulino 		page = pfn_to_page(i);
1068944d9fecSLuiz Capitulino 
1069f44b2ddaSJoonsoo Kim 		if (page_zone(page) != z)
1070f44b2ddaSJoonsoo Kim 			return false;
1071f44b2ddaSJoonsoo Kim 
1072944d9fecSLuiz Capitulino 		if (PageReserved(page))
1073944d9fecSLuiz Capitulino 			return false;
1074944d9fecSLuiz Capitulino 
1075944d9fecSLuiz Capitulino 		if (page_count(page) > 0)
1076944d9fecSLuiz Capitulino 			return false;
1077944d9fecSLuiz Capitulino 
1078944d9fecSLuiz Capitulino 		if (PageHuge(page))
1079944d9fecSLuiz Capitulino 			return false;
1080944d9fecSLuiz Capitulino 	}
1081944d9fecSLuiz Capitulino 
1082944d9fecSLuiz Capitulino 	return true;
1083944d9fecSLuiz Capitulino }
1084944d9fecSLuiz Capitulino 
1085944d9fecSLuiz Capitulino static bool zone_spans_last_pfn(const struct zone *zone,
1086944d9fecSLuiz Capitulino 			unsigned long start_pfn, unsigned long nr_pages)
1087944d9fecSLuiz Capitulino {
1088944d9fecSLuiz Capitulino 	unsigned long last_pfn = start_pfn + nr_pages - 1;
1089944d9fecSLuiz Capitulino 	return zone_spans_pfn(zone, last_pfn);
1090944d9fecSLuiz Capitulino }
1091944d9fecSLuiz Capitulino 
1092d00181b9SKirill A. Shutemov static struct page *alloc_gigantic_page(int nid, unsigned int order)
1093944d9fecSLuiz Capitulino {
1094944d9fecSLuiz Capitulino 	unsigned long nr_pages = 1 << order;
1095944d9fecSLuiz Capitulino 	unsigned long ret, pfn, flags;
1096944d9fecSLuiz Capitulino 	struct zone *z;
1097944d9fecSLuiz Capitulino 
1098944d9fecSLuiz Capitulino 	z = NODE_DATA(nid)->node_zones;
1099944d9fecSLuiz Capitulino 	for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
1100944d9fecSLuiz Capitulino 		spin_lock_irqsave(&z->lock, flags);
1101944d9fecSLuiz Capitulino 
1102944d9fecSLuiz Capitulino 		pfn = ALIGN(z->zone_start_pfn, nr_pages);
1103944d9fecSLuiz Capitulino 		while (zone_spans_last_pfn(z, pfn, nr_pages)) {
1104f44b2ddaSJoonsoo Kim 			if (pfn_range_valid_gigantic(z, pfn, nr_pages)) {
1105944d9fecSLuiz Capitulino 				/*
1106944d9fecSLuiz Capitulino 				 * We release the zone lock here because
1107944d9fecSLuiz Capitulino 				 * alloc_contig_range() will also lock the zone
1108944d9fecSLuiz Capitulino 				 * at some point. If there's an allocation
1109944d9fecSLuiz Capitulino 				 * spinning on this lock, it may win the race
1110944d9fecSLuiz Capitulino 				 * and cause alloc_contig_range() to fail...
1111944d9fecSLuiz Capitulino 				 */
1112944d9fecSLuiz Capitulino 				spin_unlock_irqrestore(&z->lock, flags);
1113944d9fecSLuiz Capitulino 				ret = __alloc_gigantic_page(pfn, nr_pages);
1114944d9fecSLuiz Capitulino 				if (!ret)
1115944d9fecSLuiz Capitulino 					return pfn_to_page(pfn);
1116944d9fecSLuiz Capitulino 				spin_lock_irqsave(&z->lock, flags);
1117944d9fecSLuiz Capitulino 			}
1118944d9fecSLuiz Capitulino 			pfn += nr_pages;
1119944d9fecSLuiz Capitulino 		}
1120944d9fecSLuiz Capitulino 
1121944d9fecSLuiz Capitulino 		spin_unlock_irqrestore(&z->lock, flags);
1122944d9fecSLuiz Capitulino 	}
1123944d9fecSLuiz Capitulino 
1124944d9fecSLuiz Capitulino 	return NULL;
1125944d9fecSLuiz Capitulino }
1126944d9fecSLuiz Capitulino 
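/*
 * Worked example (illustrative): for a 1 GB gigantic page with 4 KB base
 * pages, the order is 18, so nr_pages = 1 << 18 = 262144.  The scan above
 * therefore starts at the first pfn in the zone that is a multiple of
 * 262144 (a 1 GB-aligned physical address) and advances in 1 GB steps
 * until the candidate range would cross the end of the zone.
 */
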
1127944d9fecSLuiz Capitulino static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
1128d00181b9SKirill A. Shutemov static void prep_compound_gigantic_page(struct page *page, unsigned int order);
1129944d9fecSLuiz Capitulino 
1130944d9fecSLuiz Capitulino static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
1131944d9fecSLuiz Capitulino {
1132944d9fecSLuiz Capitulino 	struct page *page;
1133944d9fecSLuiz Capitulino 
1134944d9fecSLuiz Capitulino 	page = alloc_gigantic_page(nid, huge_page_order(h));
1135944d9fecSLuiz Capitulino 	if (page) {
1136944d9fecSLuiz Capitulino 		prep_compound_gigantic_page(page, huge_page_order(h));
1137944d9fecSLuiz Capitulino 		prep_new_huge_page(h, page, nid);
1138944d9fecSLuiz Capitulino 	}
1139944d9fecSLuiz Capitulino 
1140944d9fecSLuiz Capitulino 	return page;
1141944d9fecSLuiz Capitulino }
1142944d9fecSLuiz Capitulino 
1143944d9fecSLuiz Capitulino static int alloc_fresh_gigantic_page(struct hstate *h,
1144944d9fecSLuiz Capitulino 				nodemask_t *nodes_allowed)
1145944d9fecSLuiz Capitulino {
1146944d9fecSLuiz Capitulino 	struct page *page = NULL;
1147944d9fecSLuiz Capitulino 	int nr_nodes, node;
1148944d9fecSLuiz Capitulino 
1149944d9fecSLuiz Capitulino 	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1150944d9fecSLuiz Capitulino 		page = alloc_fresh_gigantic_page_node(h, node);
1151944d9fecSLuiz Capitulino 		if (page)
1152944d9fecSLuiz Capitulino 			return 1;
1153944d9fecSLuiz Capitulino 	}
1154944d9fecSLuiz Capitulino 
1155944d9fecSLuiz Capitulino 	return 0;
1156944d9fecSLuiz Capitulino }
1157944d9fecSLuiz Capitulino 
1158944d9fecSLuiz Capitulino static inline bool gigantic_page_supported(void) { return true; }
1159944d9fecSLuiz Capitulino #else
1160944d9fecSLuiz Capitulino static inline bool gigantic_page_supported(void) { return false; }
1161d00181b9SKirill A. Shutemov static inline void free_gigantic_page(struct page *page, unsigned int order) { }
1162944d9fecSLuiz Capitulino static inline void destroy_compound_gigantic_page(struct page *page,
1163d00181b9SKirill A. Shutemov 						unsigned int order) { }
1164944d9fecSLuiz Capitulino static inline int alloc_fresh_gigantic_page(struct hstate *h,
1165944d9fecSLuiz Capitulino 					nodemask_t *nodes_allowed) { return 0; }
1166944d9fecSLuiz Capitulino #endif
1167944d9fecSLuiz Capitulino 
1168a5516438SAndi Kleen static void update_and_free_page(struct hstate *h, struct page *page)
11696af2acb6SAdam Litke {
11706af2acb6SAdam Litke 	int i;
1171a5516438SAndi Kleen 
1172944d9fecSLuiz Capitulino 	if (hstate_is_gigantic(h) && !gigantic_page_supported())
1173944d9fecSLuiz Capitulino 		return;
117418229df5SAndy Whitcroft 
1175a5516438SAndi Kleen 	h->nr_huge_pages--;
1176a5516438SAndi Kleen 	h->nr_huge_pages_node[page_to_nid(page)]--;
1177a5516438SAndi Kleen 	for (i = 0; i < pages_per_huge_page(h); i++) {
117832f84528SChris Forbes 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
117932f84528SChris Forbes 				1 << PG_referenced | 1 << PG_dirty |
1180a7407a27SLuiz Capitulino 				1 << PG_active | 1 << PG_private |
1181a7407a27SLuiz Capitulino 				1 << PG_writeback);
11826af2acb6SAdam Litke 	}
1183309381feSSasha Levin 	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
1184f1e61557SKirill A. Shutemov 	set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
11856af2acb6SAdam Litke 	set_page_refcounted(page);
1186944d9fecSLuiz Capitulino 	if (hstate_is_gigantic(h)) {
1187944d9fecSLuiz Capitulino 		destroy_compound_gigantic_page(page, huge_page_order(h));
1188944d9fecSLuiz Capitulino 		free_gigantic_page(page, huge_page_order(h));
1189944d9fecSLuiz Capitulino 	} else {
1190a5516438SAndi Kleen 		__free_pages(page, huge_page_order(h));
11916af2acb6SAdam Litke 	}
1192944d9fecSLuiz Capitulino }
11936af2acb6SAdam Litke 
1194e5ff2159SAndi Kleen struct hstate *size_to_hstate(unsigned long size)
1195e5ff2159SAndi Kleen {
1196e5ff2159SAndi Kleen 	struct hstate *h;
1197e5ff2159SAndi Kleen 
1198e5ff2159SAndi Kleen 	for_each_hstate(h) {
1199e5ff2159SAndi Kleen 		if (huge_page_size(h) == size)
1200e5ff2159SAndi Kleen 			return h;
1201e5ff2159SAndi Kleen 	}
1202e5ff2159SAndi Kleen 	return NULL;
1203e5ff2159SAndi Kleen }
1204e5ff2159SAndi Kleen 
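/*
 * Example (sketch, assuming the usual x86_64 page sizes are configured):
 *
 *	size_to_hstate(2UL << 20)	returns the 2 MB hstate
 *	size_to_hstate(1UL << 30)	returns the 1 GB hstate, if registered
 *	size_to_hstate(64UL << 10)	returns NULL when no such hstate exists
 */
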
1205bcc54222SNaoya Horiguchi /*
1206bcc54222SNaoya Horiguchi  * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
1207bcc54222SNaoya Horiguchi  * to hstate->hugepage_activelist.)
1208bcc54222SNaoya Horiguchi  *
1209bcc54222SNaoya Horiguchi  * This function can be called for tail pages, but never returns true for them.
1210bcc54222SNaoya Horiguchi  */
1211bcc54222SNaoya Horiguchi bool page_huge_active(struct page *page)
1212bcc54222SNaoya Horiguchi {
1213bcc54222SNaoya Horiguchi 	VM_BUG_ON_PAGE(!PageHuge(page), page);
1214bcc54222SNaoya Horiguchi 	return PageHead(page) && PagePrivate(&page[1]);
1215bcc54222SNaoya Horiguchi }
1216bcc54222SNaoya Horiguchi 
1217bcc54222SNaoya Horiguchi /* never called for tail page */
1218bcc54222SNaoya Horiguchi static void set_page_huge_active(struct page *page)
1219bcc54222SNaoya Horiguchi {
1220bcc54222SNaoya Horiguchi 	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
1221bcc54222SNaoya Horiguchi 	SetPagePrivate(&page[1]);
1222bcc54222SNaoya Horiguchi }
1223bcc54222SNaoya Horiguchi 
1224bcc54222SNaoya Horiguchi static void clear_page_huge_active(struct page *page)
1225bcc54222SNaoya Horiguchi {
1226bcc54222SNaoya Horiguchi 	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
1227bcc54222SNaoya Horiguchi 	ClearPagePrivate(&page[1]);
1228bcc54222SNaoya Horiguchi }
1229bcc54222SNaoya Horiguchi 
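/*
 * Sketch of where the flag lives (illustrative only): the "active" state
 * is kept in the Private bit of the first tail page, so it never collides
 * with PagePrivate on the head page, which free_huge_page() below uses to
 * track restore_reserve:
 *
 *	set_page_huge_active(page);	sets PG_private on page[1]
 *	SetPagePrivate(page);		head-page bit, tracked independently
 *	page_huge_active(page)		still returns true
 */
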
12308f1d26d0SAtsushi Kumagai void free_huge_page(struct page *page)
123127a85ef1SDavid Gibson {
1232a5516438SAndi Kleen 	/*
1233a5516438SAndi Kleen 	 * Can't pass hstate in here because it is called from the
1234a5516438SAndi Kleen 	 * compound page destructor.
1235a5516438SAndi Kleen 	 */
1236e5ff2159SAndi Kleen 	struct hstate *h = page_hstate(page);
12377893d1d5SAdam Litke 	int nid = page_to_nid(page);
123890481622SDavid Gibson 	struct hugepage_subpool *spool =
123990481622SDavid Gibson 		(struct hugepage_subpool *)page_private(page);
124007443a85SJoonsoo Kim 	bool restore_reserve;
124127a85ef1SDavid Gibson 
1242e5df70abSAndy Whitcroft 	set_page_private(page, 0);
124323be7468SMel Gorman 	page->mapping = NULL;
1244b4330afbSMike Kravetz 	VM_BUG_ON_PAGE(page_count(page), page);
1245b4330afbSMike Kravetz 	VM_BUG_ON_PAGE(page_mapcount(page), page);
124607443a85SJoonsoo Kim 	restore_reserve = PagePrivate(page);
124716c794b4SJoonsoo Kim 	ClearPagePrivate(page);
124827a85ef1SDavid Gibson 
12491c5ecae3SMike Kravetz 	/*
12501c5ecae3SMike Kravetz 	 * A return code of zero implies that the subpool will be under its
12511c5ecae3SMike Kravetz 	 * minimum size if the reservation is not restored after the page is freed.
12521c5ecae3SMike Kravetz 	 * Therefore, force the restore_reserve operation.
12531c5ecae3SMike Kravetz 	 */
12541c5ecae3SMike Kravetz 	if (hugepage_subpool_put_pages(spool, 1) == 0)
12551c5ecae3SMike Kravetz 		restore_reserve = true;
12561c5ecae3SMike Kravetz 
125727a85ef1SDavid Gibson 	spin_lock(&hugetlb_lock);
1258bcc54222SNaoya Horiguchi 	clear_page_huge_active(page);
12596d76dcf4SAneesh Kumar K.V 	hugetlb_cgroup_uncharge_page(hstate_index(h),
12606d76dcf4SAneesh Kumar K.V 				     pages_per_huge_page(h), page);
126107443a85SJoonsoo Kim 	if (restore_reserve)
126207443a85SJoonsoo Kim 		h->resv_huge_pages++;
126307443a85SJoonsoo Kim 
1264944d9fecSLuiz Capitulino 	if (h->surplus_huge_pages_node[nid]) {
12650edaecfaSAneesh Kumar K.V 		/* remove the page from active list */
12660edaecfaSAneesh Kumar K.V 		list_del(&page->lru);
1267a5516438SAndi Kleen 		update_and_free_page(h, page);
1268a5516438SAndi Kleen 		h->surplus_huge_pages--;
1269a5516438SAndi Kleen 		h->surplus_huge_pages_node[nid]--;
12707893d1d5SAdam Litke 	} else {
12715d3a551cSWill Deacon 		arch_clear_hugepage_flags(page);
1272a5516438SAndi Kleen 		enqueue_huge_page(h, page);
12737893d1d5SAdam Litke 	}
127427a85ef1SDavid Gibson 	spin_unlock(&hugetlb_lock);
127527a85ef1SDavid Gibson }
127627a85ef1SDavid Gibson 
1277a5516438SAndi Kleen static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
1278b7ba30c6SAndi Kleen {
12790edaecfaSAneesh Kumar K.V 	INIT_LIST_HEAD(&page->lru);
1280f1e61557SKirill A. Shutemov 	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
1281b7ba30c6SAndi Kleen 	spin_lock(&hugetlb_lock);
12829dd540e2SAneesh Kumar K.V 	set_hugetlb_cgroup(page, NULL);
1283a5516438SAndi Kleen 	h->nr_huge_pages++;
1284a5516438SAndi Kleen 	h->nr_huge_pages_node[nid]++;
1285b7ba30c6SAndi Kleen 	spin_unlock(&hugetlb_lock);
1286b7ba30c6SAndi Kleen 	put_page(page); /* free it into the hugepage allocator */
1287b7ba30c6SAndi Kleen }
1288b7ba30c6SAndi Kleen 
1289d00181b9SKirill A. Shutemov static void prep_compound_gigantic_page(struct page *page, unsigned int order)
129020a0307cSWu Fengguang {
129120a0307cSWu Fengguang 	int i;
129220a0307cSWu Fengguang 	int nr_pages = 1 << order;
129320a0307cSWu Fengguang 	struct page *p = page + 1;
129420a0307cSWu Fengguang 
129520a0307cSWu Fengguang 	/* we rely on prep_new_huge_page to set the destructor */
129620a0307cSWu Fengguang 	set_compound_order(page, order);
1297ef5a22beSAndrea Arcangeli 	__ClearPageReserved(page);
1298de09d31dSKirill A. Shutemov 	__SetPageHead(page);
129920a0307cSWu Fengguang 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1300ef5a22beSAndrea Arcangeli 		/*
1301ef5a22beSAndrea Arcangeli 		 * For gigantic hugepages allocated through bootmem at
1302ef5a22beSAndrea Arcangeli 		 * boot, it's safer to be consistent with the not-gigantic
1303ef5a22beSAndrea Arcangeli 		 * hugepages and clear the PG_reserved bit from all tail pages
1304ef5a22beSAndrea Arcangeli 		 * too.  Otherwise drivers using get_user_pages() to access tail
1305ef5a22beSAndrea Arcangeli 		 * pages may get the reference counting wrong if they see
1306ef5a22beSAndrea Arcangeli 		 * PG_reserved set on a tail page (despite the head page not
1307ef5a22beSAndrea Arcangeli 		 * having PG_reserved set).  Enforcing this consistency between
1308ef5a22beSAndrea Arcangeli 		 * head and tail pages allows drivers to optimize away a check
1309ef5a22beSAndrea Arcangeli 		 * on the head page when they need to know if put_page() is needed
1310ef5a22beSAndrea Arcangeli 		 * after get_user_pages().
1311ef5a22beSAndrea Arcangeli 		 */
1312ef5a22beSAndrea Arcangeli 		__ClearPageReserved(p);
131358a84aa9SYouquan Song 		set_page_count(p, 0);
13141d798ca3SKirill A. Shutemov 		set_compound_head(p, page);
131520a0307cSWu Fengguang 	}
1316b4330afbSMike Kravetz 	atomic_set(compound_mapcount_ptr(page), -1);
131720a0307cSWu Fengguang }
131820a0307cSWu Fengguang 
13197795912cSAndrew Morton /*
13207795912cSAndrew Morton  * PageHuge() only returns true for hugetlbfs pages, but not for normal or
13217795912cSAndrew Morton  * transparent huge pages.  See the PageTransHuge() documentation for more
13227795912cSAndrew Morton  * details.
13237795912cSAndrew Morton  */
132420a0307cSWu Fengguang int PageHuge(struct page *page)
132520a0307cSWu Fengguang {
132620a0307cSWu Fengguang 	if (!PageCompound(page))
132720a0307cSWu Fengguang 		return 0;
132820a0307cSWu Fengguang 
132920a0307cSWu Fengguang 	page = compound_head(page);
1330f1e61557SKirill A. Shutemov 	return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
133120a0307cSWu Fengguang }
133243131e14SNaoya Horiguchi EXPORT_SYMBOL_GPL(PageHuge);
133343131e14SNaoya Horiguchi 
133427c73ae7SAndrea Arcangeli /*
133527c73ae7SAndrea Arcangeli  * PageHeadHuge() only returns true for hugetlbfs head page, but not for
133627c73ae7SAndrea Arcangeli  * normal or transparent huge pages.
133727c73ae7SAndrea Arcangeli  */
133827c73ae7SAndrea Arcangeli int PageHeadHuge(struct page *page_head)
133927c73ae7SAndrea Arcangeli {
134027c73ae7SAndrea Arcangeli 	if (!PageHead(page_head))
134127c73ae7SAndrea Arcangeli 		return 0;
134227c73ae7SAndrea Arcangeli 
1343758f66a2SAndrew Morton 	return get_compound_page_dtor(page_head) == free_huge_page;
134427c73ae7SAndrea Arcangeli }
134527c73ae7SAndrea Arcangeli 
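/*
 * Illustrative contrast between the two tests above (sketch only; 'tail'
 * is assumed to be a tail page of a hugetlbfs compound page):
 *
 *	PageHuge(tail)		true - it looks at compound_head(tail)
 *	PageHeadHuge(tail)	false - it requires the page itself to be the head
 */
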
134613d60f4bSZhang Yi pgoff_t __basepage_index(struct page *page)
134713d60f4bSZhang Yi {
134813d60f4bSZhang Yi 	struct page *page_head = compound_head(page);
134913d60f4bSZhang Yi 	pgoff_t index = page_index(page_head);
135013d60f4bSZhang Yi 	unsigned long compound_idx;
135113d60f4bSZhang Yi 
135213d60f4bSZhang Yi 	if (!PageHuge(page_head))
135313d60f4bSZhang Yi 		return page_index(page);
135413d60f4bSZhang Yi 
135513d60f4bSZhang Yi 	if (compound_order(page_head) >= MAX_ORDER)
135613d60f4bSZhang Yi 		compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
135713d60f4bSZhang Yi 	else
135813d60f4bSZhang Yi 		compound_idx = page - page_head;
135913d60f4bSZhang Yi 
136013d60f4bSZhang Yi 	return (index << compound_order(page_head)) + compound_idx;
136113d60f4bSZhang Yi }
136213d60f4bSZhang Yi 
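/*
 * Worked example (illustrative, x86_64 with 2 MB hugepages): for a tail
 * page p = page_head + 37 where page_index(page_head) == 3 and
 * compound_order(page_head) == 9, the result is (3 << 9) + 37 = 1573,
 * i.e. the index 'p' would have had if the file were mapped with base
 * pages.
 */
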
1363a5516438SAndi Kleen static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
13641da177e4SLinus Torvalds {
13651da177e4SLinus Torvalds 	struct page *page;
1366f96efd58SJoe Jin 
136796db800fSVlastimil Babka 	page = __alloc_pages_node(nid,
136886cdb465SNaoya Horiguchi 		htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
1369551883aeSNishanth Aravamudan 						__GFP_REPEAT|__GFP_NOWARN,
1370a5516438SAndi Kleen 		huge_page_order(h));
13711da177e4SLinus Torvalds 	if (page) {
1372a5516438SAndi Kleen 		prep_new_huge_page(h, page, nid);
13731da177e4SLinus Torvalds 	}
137463b4613cSNishanth Aravamudan 
137563b4613cSNishanth Aravamudan 	return page;
137663b4613cSNishanth Aravamudan }
137763b4613cSNishanth Aravamudan 
1378b2261026SJoonsoo Kim static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
1379b2261026SJoonsoo Kim {
1380b2261026SJoonsoo Kim 	struct page *page;
1381b2261026SJoonsoo Kim 	int nr_nodes, node;
1382b2261026SJoonsoo Kim 	int ret = 0;
1383b2261026SJoonsoo Kim 
1384b2261026SJoonsoo Kim 	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1385b2261026SJoonsoo Kim 		page = alloc_fresh_huge_page_node(h, node);
1386b2261026SJoonsoo Kim 		if (page) {
1387b2261026SJoonsoo Kim 			ret = 1;
1388b2261026SJoonsoo Kim 			break;
1389b2261026SJoonsoo Kim 		}
1390b2261026SJoonsoo Kim 	}
1391b2261026SJoonsoo Kim 
1392b2261026SJoonsoo Kim 	if (ret)
1393b2261026SJoonsoo Kim 		count_vm_event(HTLB_BUDDY_PGALLOC);
1394b2261026SJoonsoo Kim 	else
1395b2261026SJoonsoo Kim 		count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
1396b2261026SJoonsoo Kim 
1397b2261026SJoonsoo Kim 	return ret;
1398b2261026SJoonsoo Kim }
1399b2261026SJoonsoo Kim 
1400e8c5c824SLee Schermerhorn /*
1401e8c5c824SLee Schermerhorn  * Free a huge page from the pool, taken from the next node to free.
1402e8c5c824SLee Schermerhorn  * Attempt to keep persistent huge pages more or less
1403e8c5c824SLee Schermerhorn  * balanced over allowed nodes.
1404e8c5c824SLee Schermerhorn  * Called with hugetlb_lock locked.
1405e8c5c824SLee Schermerhorn  */
14066ae11b27SLee Schermerhorn static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
14076ae11b27SLee Schermerhorn 							 bool acct_surplus)
1408e8c5c824SLee Schermerhorn {
1409b2261026SJoonsoo Kim 	int nr_nodes, node;
1410e8c5c824SLee Schermerhorn 	int ret = 0;
1411e8c5c824SLee Schermerhorn 
1412b2261026SJoonsoo Kim 	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
1413685f3457SLee Schermerhorn 		/*
1414685f3457SLee Schermerhorn 		 * If we're returning unused surplus pages, only examine
1415685f3457SLee Schermerhorn 		 * nodes with surplus pages.
1416685f3457SLee Schermerhorn 		 */
1417b2261026SJoonsoo Kim 		if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
1418b2261026SJoonsoo Kim 		    !list_empty(&h->hugepage_freelists[node])) {
1419e8c5c824SLee Schermerhorn 			struct page *page =
1420b2261026SJoonsoo Kim 				list_entry(h->hugepage_freelists[node].next,
1421e8c5c824SLee Schermerhorn 					  struct page, lru);
1422e8c5c824SLee Schermerhorn 			list_del(&page->lru);
1423e8c5c824SLee Schermerhorn 			h->free_huge_pages--;
1424b2261026SJoonsoo Kim 			h->free_huge_pages_node[node]--;
1425685f3457SLee Schermerhorn 			if (acct_surplus) {
1426685f3457SLee Schermerhorn 				h->surplus_huge_pages--;
1427b2261026SJoonsoo Kim 				h->surplus_huge_pages_node[node]--;
1428685f3457SLee Schermerhorn 			}
1429e8c5c824SLee Schermerhorn 			update_and_free_page(h, page);
1430e8c5c824SLee Schermerhorn 			ret = 1;
14319a76db09SLee Schermerhorn 			break;
1432e8c5c824SLee Schermerhorn 		}
1433b2261026SJoonsoo Kim 	}
1434e8c5c824SLee Schermerhorn 
1435e8c5c824SLee Schermerhorn 	return ret;
1436e8c5c824SLee Schermerhorn }
1437e8c5c824SLee Schermerhorn 
1438c8721bbbSNaoya Horiguchi /*
1439c8721bbbSNaoya Horiguchi  * Dissolve a given free hugepage into free buddy pages. This function does
1440c8721bbbSNaoya Horiguchi  * nothing for in-use (including surplus) hugepages.
1441c8721bbbSNaoya Horiguchi  */
1442c8721bbbSNaoya Horiguchi static void dissolve_free_huge_page(struct page *page)
1443c8721bbbSNaoya Horiguchi {
1444c8721bbbSNaoya Horiguchi 	spin_lock(&hugetlb_lock);
1445c8721bbbSNaoya Horiguchi 	if (PageHuge(page) && !page_count(page)) {
1446c8721bbbSNaoya Horiguchi 		struct hstate *h = page_hstate(page);
1447c8721bbbSNaoya Horiguchi 		int nid = page_to_nid(page);
1448c8721bbbSNaoya Horiguchi 		list_del(&page->lru);
1449c8721bbbSNaoya Horiguchi 		h->free_huge_pages--;
1450c8721bbbSNaoya Horiguchi 		h->free_huge_pages_node[nid]--;
1451c8721bbbSNaoya Horiguchi 		update_and_free_page(h, page);
1452c8721bbbSNaoya Horiguchi 	}
1453c8721bbbSNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
1454c8721bbbSNaoya Horiguchi }
1455c8721bbbSNaoya Horiguchi 
1456c8721bbbSNaoya Horiguchi /*
1457c8721bbbSNaoya Horiguchi  * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
1458c8721bbbSNaoya Horiguchi  * make specified memory blocks removable from the system.
1459c8721bbbSNaoya Horiguchi  * Note that start_pfn should be aligned with the (minimum) hugepage size.
1460c8721bbbSNaoya Horiguchi  */
1461c8721bbbSNaoya Horiguchi void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
1462c8721bbbSNaoya Horiguchi {
1463c8721bbbSNaoya Horiguchi 	unsigned long pfn;
1464c8721bbbSNaoya Horiguchi 
1465d0177639SLi Zhong 	if (!hugepages_supported())
1466d0177639SLi Zhong 		return;
1467d0177639SLi Zhong 
1468641844f5SNaoya Horiguchi 	VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order));
1469641844f5SNaoya Horiguchi 	for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order)
1470c8721bbbSNaoya Horiguchi 		dissolve_free_huge_page(pfn_to_page(pfn));
1471c8721bbbSNaoya Horiguchi }
1472c8721bbbSNaoya Horiguchi 
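/*
 * Usage sketch (illustrative; the caller is assumed to be the memory
 * hotplug offline path): with only 2 MB hugepages configured,
 * minimum_order is 9, so start_pfn must sit on a 512-pfn boundary and the
 * loop above probes one candidate head page every 2 MB:
 *
 *	dissolve_free_huge_pages(start_pfn, start_pfn + nr_pages);
 */
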
1473099730d6SDave Hansen /*
1474099730d6SDave Hansen  * There are 3 ways this can get called:
1475099730d6SDave Hansen  * 1. With vma+addr: we use the VMA's memory policy
1476099730d6SDave Hansen  * 2. With !vma, but nid=NUMA_NO_NODE:  We try to allocate a huge
1477099730d6SDave Hansen  *    page from any node, and let the buddy allocator itself figure
1478099730d6SDave Hansen  *    it out.
1479099730d6SDave Hansen  * 3. With !vma, but nid!=NUMA_NO_NODE.  We allocate a huge page
1480099730d6SDave Hansen  *    strictly from 'nid'
1481099730d6SDave Hansen  */
1482099730d6SDave Hansen static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
1483099730d6SDave Hansen 		struct vm_area_struct *vma, unsigned long addr, int nid)
1484099730d6SDave Hansen {
1485099730d6SDave Hansen 	int order = huge_page_order(h);
1486099730d6SDave Hansen 	gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
1487099730d6SDave Hansen 	unsigned int cpuset_mems_cookie;
1488099730d6SDave Hansen 
1489099730d6SDave Hansen 	/*
1490099730d6SDave Hansen 	 * We need a VMA to get a memory policy.  If we do not
1491e0ec90eeSDave Hansen 	 * have one, we use the 'nid' argument.
1492e0ec90eeSDave Hansen 	 *
1493e0ec90eeSDave Hansen 	 * The mempolicy stuff below has some non-inlined bits
1494e0ec90eeSDave Hansen 	 * and calls ->vm_ops.  That makes it hard to optimize at
1495e0ec90eeSDave Hansen 	 * compile-time, even when NUMA is off and it does
1496e0ec90eeSDave Hansen 	 * nothing.  This helps the compiler optimize it out.
1497099730d6SDave Hansen 	 */
1498e0ec90eeSDave Hansen 	if (!IS_ENABLED(CONFIG_NUMA) || !vma) {
1499099730d6SDave Hansen 		/*
1500099730d6SDave Hansen 		 * If a specific node is requested, make sure to
1501099730d6SDave Hansen 		 * get memory from there, but only when a node
1502099730d6SDave Hansen 		 * is explicitly specified.
1503099730d6SDave Hansen 		 */
1504099730d6SDave Hansen 		if (nid != NUMA_NO_NODE)
1505099730d6SDave Hansen 			gfp |= __GFP_THISNODE;
1506099730d6SDave Hansen 		/*
1507099730d6SDave Hansen 		 * Make sure to call something that can handle
1508099730d6SDave Hansen 		 * nid=NUMA_NO_NODE
1509099730d6SDave Hansen 		 */
1510099730d6SDave Hansen 		return alloc_pages_node(nid, gfp, order);
1511099730d6SDave Hansen 	}
1512099730d6SDave Hansen 
1513099730d6SDave Hansen 	/*
1514099730d6SDave Hansen 	 * OK, so we have a VMA.  Fetch the mempolicy and try to
1515e0ec90eeSDave Hansen 	 * allocate a huge page with it.  We will only reach this
1516e0ec90eeSDave Hansen 	 * when CONFIG_NUMA=y.
1517099730d6SDave Hansen 	 */
1518099730d6SDave Hansen 	do {
1519099730d6SDave Hansen 		struct page *page;
1520099730d6SDave Hansen 		struct mempolicy *mpol;
1521099730d6SDave Hansen 		struct zonelist *zl;
1522099730d6SDave Hansen 		nodemask_t *nodemask;
1523099730d6SDave Hansen 
1524099730d6SDave Hansen 		cpuset_mems_cookie = read_mems_allowed_begin();
1525099730d6SDave Hansen 		zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
1526099730d6SDave Hansen 		mpol_cond_put(mpol);
1527099730d6SDave Hansen 		page = __alloc_pages_nodemask(gfp, order, zl, nodemask);
1528099730d6SDave Hansen 		if (page)
1529099730d6SDave Hansen 			return page;
1530099730d6SDave Hansen 	} while (read_mems_allowed_retry(cpuset_mems_cookie));
1531099730d6SDave Hansen 
1532099730d6SDave Hansen 	return NULL;
1533099730d6SDave Hansen }
1534099730d6SDave Hansen 
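/*
 * Sketch of the three calling modes described above, as the callers in
 * this file use them (illustrative only; 'h', 'vma' and 'addr' are
 * assumed to be valid where used):
 *
 *	__hugetlb_alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE)
 *		1. follow the VMA's memory policy
 *	__hugetlb_alloc_buddy_huge_page(h, NULL, -1, NUMA_NO_NODE)
 *		2. any node, the buddy allocator decides
 *	__hugetlb_alloc_buddy_huge_page(h, NULL, -1, nid)
 *		3. strictly from 'nid' (__GFP_THISNODE is added)
 */
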
1535099730d6SDave Hansen /*
1536099730d6SDave Hansen  * There are two ways to allocate a huge page:
1537099730d6SDave Hansen  * 1. When you have a VMA and an address (like a fault)
1538099730d6SDave Hansen  * 2. When you have no VMA (like when setting /proc/.../nr_hugepages)
1539099730d6SDave Hansen  *
1540099730d6SDave Hansen  * 'vma' and 'addr' are only for (1).  'nid' is always NUMA_NO_NODE in
1541099730d6SDave Hansen  * this case which signifies that the allocation should be done with
1542099730d6SDave Hansen  * respect for the VMA's memory policy.
1543099730d6SDave Hansen  *
1544099730d6SDave Hansen  * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This
1545099730d6SDave Hansen  * implies that memory policies will not be taken into account.
1546099730d6SDave Hansen  */
1547099730d6SDave Hansen static struct page *__alloc_buddy_huge_page(struct hstate *h,
1548099730d6SDave Hansen 		struct vm_area_struct *vma, unsigned long addr, int nid)
15497893d1d5SAdam Litke {
15507893d1d5SAdam Litke 	struct page *page;
1551bf50bab2SNaoya Horiguchi 	unsigned int r_nid;
15527893d1d5SAdam Litke 
1553bae7f4aeSLuiz Capitulino 	if (hstate_is_gigantic(h))
1554aa888a74SAndi Kleen 		return NULL;
1555aa888a74SAndi Kleen 
1556d1c3fb1fSNishanth Aravamudan 	/*
1557099730d6SDave Hansen 	 * Make sure that anyone specifying 'nid' is not also specifying a VMA.
1558099730d6SDave Hansen 	 * This makes sure the caller is picking _one_ of the modes with which
1559099730d6SDave Hansen 	 * we can call this function, not both.
1560099730d6SDave Hansen 	 */
1561099730d6SDave Hansen 	if (vma || (addr != -1)) {
1562e0ec90eeSDave Hansen 		VM_WARN_ON_ONCE(addr == -1);
1563e0ec90eeSDave Hansen 		VM_WARN_ON_ONCE(nid != NUMA_NO_NODE);
1564099730d6SDave Hansen 	}
1565099730d6SDave Hansen 	/*
1566d1c3fb1fSNishanth Aravamudan 	 * Assume we will successfully allocate the surplus page to
1567d1c3fb1fSNishanth Aravamudan 	 * prevent racing processes from causing the surplus to exceed
1568d1c3fb1fSNishanth Aravamudan 	 * overcommit
1569d1c3fb1fSNishanth Aravamudan 	 *
1570d1c3fb1fSNishanth Aravamudan 	 * This however introduces a different race, where a process B
1571d1c3fb1fSNishanth Aravamudan 	 * tries to grow the static hugepage pool while alloc_pages() is
1572d1c3fb1fSNishanth Aravamudan 	 * called by process A. B will only examine the per-node
1573d1c3fb1fSNishanth Aravamudan 	 * counters in determining if surplus huge pages can be
1574d1c3fb1fSNishanth Aravamudan 	 * converted to normal huge pages in adjust_pool_surplus(). A
1575d1c3fb1fSNishanth Aravamudan 	 * won't be able to increment the per-node counter, until the
1576d1c3fb1fSNishanth Aravamudan 	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
1577d1c3fb1fSNishanth Aravamudan 	 * no more huge pages can be converted from surplus to normal
1578d1c3fb1fSNishanth Aravamudan 	 * state (and doesn't try to convert again). Thus, we have a
1579d1c3fb1fSNishanth Aravamudan 	 * case where a surplus huge page exists, the pool is grown, and
1580d1c3fb1fSNishanth Aravamudan 	 * the surplus huge page still exists after, even though it
1581d1c3fb1fSNishanth Aravamudan 	 * should just have been converted to a normal huge page. This
1582d1c3fb1fSNishanth Aravamudan 	 * does not leak memory, though, as the hugepage will be freed
1583d1c3fb1fSNishanth Aravamudan 	 * once it is out of use. It also does not allow the counters to
1584d1c3fb1fSNishanth Aravamudan 	 * go out of whack in adjust_pool_surplus() as we don't modify
1585d1c3fb1fSNishanth Aravamudan 	 * the node values until we've gotten the hugepage and only the
1586d1c3fb1fSNishanth Aravamudan 	 * per-node value is checked there.
1587d1c3fb1fSNishanth Aravamudan 	 */
1588d1c3fb1fSNishanth Aravamudan 	spin_lock(&hugetlb_lock);
1589a5516438SAndi Kleen 	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
1590d1c3fb1fSNishanth Aravamudan 		spin_unlock(&hugetlb_lock);
1591d1c3fb1fSNishanth Aravamudan 		return NULL;
1592d1c3fb1fSNishanth Aravamudan 	} else {
1593a5516438SAndi Kleen 		h->nr_huge_pages++;
1594a5516438SAndi Kleen 		h->surplus_huge_pages++;
1595d1c3fb1fSNishanth Aravamudan 	}
1596d1c3fb1fSNishanth Aravamudan 	spin_unlock(&hugetlb_lock);
1597d1c3fb1fSNishanth Aravamudan 
1598099730d6SDave Hansen 	page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
1599d1c3fb1fSNishanth Aravamudan 
16007893d1d5SAdam Litke 	spin_lock(&hugetlb_lock);
1601d1c3fb1fSNishanth Aravamudan 	if (page) {
16020edaecfaSAneesh Kumar K.V 		INIT_LIST_HEAD(&page->lru);
1603bf50bab2SNaoya Horiguchi 		r_nid = page_to_nid(page);
1604f1e61557SKirill A. Shutemov 		set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
16059dd540e2SAneesh Kumar K.V 		set_hugetlb_cgroup(page, NULL);
1606d1c3fb1fSNishanth Aravamudan 		/*
1607d1c3fb1fSNishanth Aravamudan 		 * We incremented the global counters already
1608d1c3fb1fSNishanth Aravamudan 		 */
1609bf50bab2SNaoya Horiguchi 		h->nr_huge_pages_node[r_nid]++;
1610bf50bab2SNaoya Horiguchi 		h->surplus_huge_pages_node[r_nid]++;
16113b116300SAdam Litke 		__count_vm_event(HTLB_BUDDY_PGALLOC);
1612d1c3fb1fSNishanth Aravamudan 	} else {
1613a5516438SAndi Kleen 		h->nr_huge_pages--;
1614a5516438SAndi Kleen 		h->surplus_huge_pages--;
16153b116300SAdam Litke 		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
16167893d1d5SAdam Litke 	}
1617d1c3fb1fSNishanth Aravamudan 	spin_unlock(&hugetlb_lock);
16187893d1d5SAdam Litke 
16197893d1d5SAdam Litke 	return page;
16207893d1d5SAdam Litke }
16217893d1d5SAdam Litke 
1622e4e574b7SAdam Litke /*
1623099730d6SDave Hansen  * Allocate a huge page from 'nid'.  Note, 'nid' may be
1624099730d6SDave Hansen  * NUMA_NO_NODE, which means that it may be allocated
1625099730d6SDave Hansen  * anywhere.
1626099730d6SDave Hansen  */
1627e0ec90eeSDave Hansen static
1628099730d6SDave Hansen struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid)
1629099730d6SDave Hansen {
1630099730d6SDave Hansen 	unsigned long addr = -1;
1631099730d6SDave Hansen 
1632099730d6SDave Hansen 	return __alloc_buddy_huge_page(h, NULL, addr, nid);
1633099730d6SDave Hansen }
1634099730d6SDave Hansen 
1635099730d6SDave Hansen /*
1636099730d6SDave Hansen  * Use the VMA's mpolicy to allocate a huge page from the buddy.
1637099730d6SDave Hansen  */
1638e0ec90eeSDave Hansen static
1639099730d6SDave Hansen struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
1640099730d6SDave Hansen 		struct vm_area_struct *vma, unsigned long addr)
1641099730d6SDave Hansen {
1642099730d6SDave Hansen 	return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE);
1643099730d6SDave Hansen }
1644099730d6SDave Hansen 
1645099730d6SDave Hansen /*
1646bf50bab2SNaoya Horiguchi  * This allocation function is useful in the context where vma is irrelevant.
1647bf50bab2SNaoya Horiguchi  * E.g. soft-offlining uses this function because it only cares about the
1648bf50bab2SNaoya Horiguchi  * physical address of the error page.
1649bf50bab2SNaoya Horiguchi  */
1650bf50bab2SNaoya Horiguchi struct page *alloc_huge_page_node(struct hstate *h, int nid)
1651bf50bab2SNaoya Horiguchi {
16524ef91848SJoonsoo Kim 	struct page *page = NULL;
1653bf50bab2SNaoya Horiguchi 
1654bf50bab2SNaoya Horiguchi 	spin_lock(&hugetlb_lock);
16554ef91848SJoonsoo Kim 	if (h->free_huge_pages - h->resv_huge_pages > 0)
1656bf50bab2SNaoya Horiguchi 		page = dequeue_huge_page_node(h, nid);
1657bf50bab2SNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
1658bf50bab2SNaoya Horiguchi 
165994ae8ba7SAneesh Kumar K.V 	if (!page)
1660099730d6SDave Hansen 		page = __alloc_buddy_huge_page_no_mpol(h, nid);
1661bf50bab2SNaoya Horiguchi 
1662bf50bab2SNaoya Horiguchi 	return page;
1663bf50bab2SNaoya Horiguchi }
1664bf50bab2SNaoya Horiguchi 
1665bf50bab2SNaoya Horiguchi /*
166625985edcSLucas De Marchi  * Increase the hugetlb pool such that it can accommodate a reservation
1667e4e574b7SAdam Litke  * of size 'delta'.
1668e4e574b7SAdam Litke  */
1669a5516438SAndi Kleen static int gather_surplus_pages(struct hstate *h, int delta)
1670e4e574b7SAdam Litke {
1671e4e574b7SAdam Litke 	struct list_head surplus_list;
1672e4e574b7SAdam Litke 	struct page *page, *tmp;
1673e4e574b7SAdam Litke 	int ret, i;
1674e4e574b7SAdam Litke 	int needed, allocated;
167528073b02SHillf Danton 	bool alloc_ok = true;
1676e4e574b7SAdam Litke 
1677a5516438SAndi Kleen 	needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
1678ac09b3a1SAdam Litke 	if (needed <= 0) {
1679a5516438SAndi Kleen 		h->resv_huge_pages += delta;
1680e4e574b7SAdam Litke 		return 0;
1681ac09b3a1SAdam Litke 	}
1682e4e574b7SAdam Litke 
1683e4e574b7SAdam Litke 	allocated = 0;
1684e4e574b7SAdam Litke 	INIT_LIST_HEAD(&surplus_list);
1685e4e574b7SAdam Litke 
1686e4e574b7SAdam Litke 	ret = -ENOMEM;
1687e4e574b7SAdam Litke retry:
1688e4e574b7SAdam Litke 	spin_unlock(&hugetlb_lock);
1689e4e574b7SAdam Litke 	for (i = 0; i < needed; i++) {
1690099730d6SDave Hansen 		page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE);
169128073b02SHillf Danton 		if (!page) {
169228073b02SHillf Danton 			alloc_ok = false;
169328073b02SHillf Danton 			break;
169428073b02SHillf Danton 		}
1695e4e574b7SAdam Litke 		list_add(&page->lru, &surplus_list);
1696e4e574b7SAdam Litke 	}
169728073b02SHillf Danton 	allocated += i;
1698e4e574b7SAdam Litke 
1699e4e574b7SAdam Litke 	/*
1700e4e574b7SAdam Litke 	 * After retaking hugetlb_lock, we need to recalculate 'needed'
1701e4e574b7SAdam Litke 	 * because either resv_huge_pages or free_huge_pages may have changed.
1702e4e574b7SAdam Litke 	 */
1703e4e574b7SAdam Litke 	spin_lock(&hugetlb_lock);
1704a5516438SAndi Kleen 	needed = (h->resv_huge_pages + delta) -
1705a5516438SAndi Kleen 			(h->free_huge_pages + allocated);
170628073b02SHillf Danton 	if (needed > 0) {
170728073b02SHillf Danton 		if (alloc_ok)
1708e4e574b7SAdam Litke 			goto retry;
170928073b02SHillf Danton 		/*
171028073b02SHillf Danton 		 * We were not able to allocate enough pages to
171128073b02SHillf Danton 		 * satisfy the entire reservation so we free what
171228073b02SHillf Danton 		 * we've allocated so far.
171328073b02SHillf Danton 		 */
171428073b02SHillf Danton 		goto free;
171528073b02SHillf Danton 	}
1716e4e574b7SAdam Litke 	/*
1717e4e574b7SAdam Litke 	 * The surplus_list now contains _at_least_ the number of extra pages
171825985edcSLucas De Marchi 	 * needed to accommodate the reservation.  Add the appropriate number
1719e4e574b7SAdam Litke 	 * of pages to the hugetlb pool and free the extras back to the buddy
1720ac09b3a1SAdam Litke 	 * allocator.  Commit the entire reservation here to prevent another
1721ac09b3a1SAdam Litke 	 * process from stealing the pages as they are added to the pool but
1722ac09b3a1SAdam Litke 	 * before they are reserved.
1723e4e574b7SAdam Litke 	 */
1724e4e574b7SAdam Litke 	needed += allocated;
1725a5516438SAndi Kleen 	h->resv_huge_pages += delta;
1726e4e574b7SAdam Litke 	ret = 0;
1727a9869b83SNaoya Horiguchi 
172819fc3f0aSAdam Litke 	/* Free the needed pages to the hugetlb pool */
172919fc3f0aSAdam Litke 	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
173019fc3f0aSAdam Litke 		if ((--needed) < 0)
173119fc3f0aSAdam Litke 			break;
1732a9869b83SNaoya Horiguchi 		/*
1733a9869b83SNaoya Horiguchi 		 * This page is now managed by the hugetlb allocator and has
1734a9869b83SNaoya Horiguchi 		 * no users -- drop the buddy allocator's reference.
1735a9869b83SNaoya Horiguchi 		 */
1736a9869b83SNaoya Horiguchi 		put_page_testzero(page);
1737309381feSSasha Levin 		VM_BUG_ON_PAGE(page_count(page), page);
1738a5516438SAndi Kleen 		enqueue_huge_page(h, page);
173919fc3f0aSAdam Litke 	}
174028073b02SHillf Danton free:
1741b0365c8dSHillf Danton 	spin_unlock(&hugetlb_lock);
174219fc3f0aSAdam Litke 
174319fc3f0aSAdam Litke 	/* Free unnecessary surplus pages to the buddy allocator */
1744c0d934baSJoonsoo Kim 	list_for_each_entry_safe(page, tmp, &surplus_list, lru)
1745a9869b83SNaoya Horiguchi 		put_page(page);
174619fc3f0aSAdam Litke 	spin_lock(&hugetlb_lock);
1747e4e574b7SAdam Litke 
1748e4e574b7SAdam Litke 	return ret;
1749e4e574b7SAdam Litke }
1750e4e574b7SAdam Litke 
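/*
 * Worked example of the accounting above (illustrative numbers): with
 * resv_huge_pages = 10, delta = 4 and free_huge_pages = 12, needed is
 * (10 + 4) - 12 = 2, so two surplus pages are requested from the buddy
 * allocator.  After hugetlb_lock is retaken the computation is repeated;
 * if free pages were consumed in the meantime the loop retries, otherwise
 * any extra pages beyond the final 'needed' are returned to the buddy.
 */
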
1751e4e574b7SAdam Litke /*
1752e4e574b7SAdam Litke  * When releasing a hugetlb pool reservation, any surplus pages that were
1753e4e574b7SAdam Litke  * allocated to satisfy the reservation must be explicitly freed if they were
1754e4e574b7SAdam Litke  * never used.
1755685f3457SLee Schermerhorn  * Called with hugetlb_lock held.
1756e4e574b7SAdam Litke  */
1757a5516438SAndi Kleen static void return_unused_surplus_pages(struct hstate *h,
1758a5516438SAndi Kleen 					unsigned long unused_resv_pages)
1759e4e574b7SAdam Litke {
1760e4e574b7SAdam Litke 	unsigned long nr_pages;
1761e4e574b7SAdam Litke 
1762ac09b3a1SAdam Litke 	/* Uncommit the reservation */
1763a5516438SAndi Kleen 	h->resv_huge_pages -= unused_resv_pages;
1764ac09b3a1SAdam Litke 
1765aa888a74SAndi Kleen 	/* Cannot return gigantic pages currently */
1766bae7f4aeSLuiz Capitulino 	if (hstate_is_gigantic(h))
1767aa888a74SAndi Kleen 		return;
1768aa888a74SAndi Kleen 
1769a5516438SAndi Kleen 	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
1770e4e574b7SAdam Litke 
1771685f3457SLee Schermerhorn 	/*
1772685f3457SLee Schermerhorn 	 * We want to release as many surplus pages as possible, spread
17739b5e5d0fSLee Schermerhorn 	 * evenly across all nodes with memory. Iterate across these nodes
17749b5e5d0fSLee Schermerhorn 	 * until we can no longer free unreserved surplus pages. This occurs
17759b5e5d0fSLee Schermerhorn 	 * when the nodes with surplus pages have no free pages.
17769b5e5d0fSLee Schermerhorn 	 * free_pool_huge_page() will balance the freed pages across the
17779b5e5d0fSLee Schermerhorn 	 * on-line nodes with memory and will handle the hstate accounting.
1778685f3457SLee Schermerhorn 	 */
1779685f3457SLee Schermerhorn 	while (nr_pages--) {
17808cebfcd0SLai Jiangshan 		if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
1781685f3457SLee Schermerhorn 			break;
17827848a4bfSMizuma, Masayoshi 		cond_resched_lock(&hugetlb_lock);
1783e4e574b7SAdam Litke 	}
1784e4e574b7SAdam Litke }
1785e4e574b7SAdam Litke 
17865e911373SMike Kravetz 
1787c37f9fb1SAndy Whitcroft /*
1788feba16e2SMike Kravetz  * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
17895e911373SMike Kravetz  * are used by the huge page allocation routines to manage reservations.
1790cf3ad20bSMike Kravetz  *
1791cf3ad20bSMike Kravetz  * vma_needs_reservation is called to determine if the huge page at addr
1792cf3ad20bSMike Kravetz  * within the vma has an associated reservation.  If a reservation is
1793cf3ad20bSMike Kravetz  * needed, the value 1 is returned.  The caller is then responsible for
1794cf3ad20bSMike Kravetz  * managing the global reservation and subpool usage counts.  After
1795cf3ad20bSMike Kravetz  * the huge page has been allocated, vma_commit_reservation is called
1796feba16e2SMike Kravetz  * to add the page to the reservation map.  If the page allocation fails,
1797feba16e2SMike Kravetz  * the reservation must be ended instead of committed.  vma_end_reservation
1798feba16e2SMike Kravetz  * is called in such cases.
1799cf3ad20bSMike Kravetz  *
1800cf3ad20bSMike Kravetz  * In the normal case, vma_commit_reservation returns the same value
1801cf3ad20bSMike Kravetz  * as the preceding vma_needs_reservation call.  The only time this
1802cf3ad20bSMike Kravetz  * is not the case is if a reserve map was changed between calls.  It
1803cf3ad20bSMike Kravetz  * is the responsibility of the caller to notice the difference and
1804cf3ad20bSMike Kravetz  * take appropriate action.
1805c37f9fb1SAndy Whitcroft  */
18065e911373SMike Kravetz enum vma_resv_mode {
18075e911373SMike Kravetz 	VMA_NEEDS_RESV,
18085e911373SMike Kravetz 	VMA_COMMIT_RESV,
1809feba16e2SMike Kravetz 	VMA_END_RESV,
18105e911373SMike Kravetz };
1811cf3ad20bSMike Kravetz static long __vma_reservation_common(struct hstate *h,
1812cf3ad20bSMike Kravetz 				struct vm_area_struct *vma, unsigned long addr,
18135e911373SMike Kravetz 				enum vma_resv_mode mode)
1814c37f9fb1SAndy Whitcroft {
18154e35f483SJoonsoo Kim 	struct resv_map *resv;
18164e35f483SJoonsoo Kim 	pgoff_t idx;
1817cf3ad20bSMike Kravetz 	long ret;
1818c37f9fb1SAndy Whitcroft 
18194e35f483SJoonsoo Kim 	resv = vma_resv_map(vma);
18204e35f483SJoonsoo Kim 	if (!resv)
1821c37f9fb1SAndy Whitcroft 		return 1;
1822c37f9fb1SAndy Whitcroft 
18234e35f483SJoonsoo Kim 	idx = vma_hugecache_offset(h, vma, addr);
18245e911373SMike Kravetz 	switch (mode) {
18255e911373SMike Kravetz 	case VMA_NEEDS_RESV:
1826cf3ad20bSMike Kravetz 		ret = region_chg(resv, idx, idx + 1);
18275e911373SMike Kravetz 		break;
18285e911373SMike Kravetz 	case VMA_COMMIT_RESV:
18295e911373SMike Kravetz 		ret = region_add(resv, idx, idx + 1);
18305e911373SMike Kravetz 		break;
1831feba16e2SMike Kravetz 	case VMA_END_RESV:
18325e911373SMike Kravetz 		region_abort(resv, idx, idx + 1);
18335e911373SMike Kravetz 		ret = 0;
18345e911373SMike Kravetz 		break;
18355e911373SMike Kravetz 	default:
18365e911373SMike Kravetz 		BUG();
18375e911373SMike Kravetz 	}
183884afd99bSAndy Whitcroft 
18394e35f483SJoonsoo Kim 	if (vma->vm_flags & VM_MAYSHARE)
1840cf3ad20bSMike Kravetz 		return ret;
184167961f9dSMike Kravetz 	else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
184267961f9dSMike Kravetz 		/*
184367961f9dSMike Kravetz 		 * In most cases, reserves always exist for private mappings.
184467961f9dSMike Kravetz 		 * However, a file associated with the mapping could have been
184567961f9dSMike Kravetz 		 * hole punched or truncated after reserves were consumed.
184667961f9dSMike Kravetz 		 * A subsequent fault on such a range will not use reserves.
184767961f9dSMike Kravetz 		 * Subtle - The reserve map for private mappings has the
184867961f9dSMike Kravetz 		 * opposite meaning than that of shared mappings.  If NO
184967961f9dSMike Kravetz 		 * entry is in the reserve map, it means a reservation exists.
185067961f9dSMike Kravetz 		 * If an entry exists in the reserve map, it means the
185167961f9dSMike Kravetz 		 * reservation has already been consumed.  As a result, the
185267961f9dSMike Kravetz 		 * return value of this routine is the opposite of the
185367961f9dSMike Kravetz 		 * value returned from reserve map manipulation routines above.
185467961f9dSMike Kravetz 		 */
185567961f9dSMike Kravetz 		if (ret)
185667961f9dSMike Kravetz 			return 0;
185767961f9dSMike Kravetz 		else
185867961f9dSMike Kravetz 			return 1;
185967961f9dSMike Kravetz 	}
18604e35f483SJoonsoo Kim 	else
1861cf3ad20bSMike Kravetz 		return ret < 0 ? ret : 0;
186284afd99bSAndy Whitcroft }
1863cf3ad20bSMike Kravetz 
1864cf3ad20bSMike Kravetz static long vma_needs_reservation(struct hstate *h,
1865a5516438SAndi Kleen 			struct vm_area_struct *vma, unsigned long addr)
1866c37f9fb1SAndy Whitcroft {
18675e911373SMike Kravetz 	return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
1868cf3ad20bSMike Kravetz }
1869c37f9fb1SAndy Whitcroft 
1870cf3ad20bSMike Kravetz static long vma_commit_reservation(struct hstate *h,
1871cf3ad20bSMike Kravetz 			struct vm_area_struct *vma, unsigned long addr)
1872cf3ad20bSMike Kravetz {
18735e911373SMike Kravetz 	return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
18745e911373SMike Kravetz }
18755e911373SMike Kravetz 
1876feba16e2SMike Kravetz static void vma_end_reservation(struct hstate *h,
18775e911373SMike Kravetz 			struct vm_area_struct *vma, unsigned long addr)
18785e911373SMike Kravetz {
1879feba16e2SMike Kravetz 	(void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
1880c37f9fb1SAndy Whitcroft }
1881c37f9fb1SAndy Whitcroft 
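/*
 * Condensed sketch of the expected calling protocol (illustrative; error
 * handling beyond the reservation calls is omitted):
 *
 *	chg = vma_needs_reservation(h, vma, addr);
 *	if (chg < 0)
 *		return ERR_PTR(-ENOMEM);
 *	page = dequeue or allocate a huge page;
 *	if (!page) {
 *		vma_end_reservation(h, vma, addr);	abort the region_chg
 *		return ERR_PTR(-ENOSPC);
 *	}
 *	vma_commit_reservation(h, vma, addr);		finalize via region_add
 */
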
188270c3547eSMike Kravetz struct page *alloc_huge_page(struct vm_area_struct *vma,
188304f2cbe3SMel Gorman 				    unsigned long addr, int avoid_reserve)
1884348ea204SAdam Litke {
188590481622SDavid Gibson 	struct hugepage_subpool *spool = subpool_vma(vma);
1886a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
1887348ea204SAdam Litke 	struct page *page;
1888d85f69b0SMike Kravetz 	long map_chg, map_commit;
1889d85f69b0SMike Kravetz 	long gbl_chg;
18906d76dcf4SAneesh Kumar K.V 	int ret, idx;
18916d76dcf4SAneesh Kumar K.V 	struct hugetlb_cgroup *h_cg;
18922fc39cecSAdam Litke 
18936d76dcf4SAneesh Kumar K.V 	idx = hstate_index(h);
1894a1e78772SMel Gorman 	/*
1895d85f69b0SMike Kravetz 	 * Examine the region/reserve map to determine if the process
1896d85f69b0SMike Kravetz 	 * has a reservation for the page to be allocated.  A return
1897d85f69b0SMike Kravetz 	 * code of zero indicates a reservation exists (no change).
1898a1e78772SMel Gorman 	 */
1899d85f69b0SMike Kravetz 	map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
1900d85f69b0SMike Kravetz 	if (map_chg < 0)
190176dcee75SAneesh Kumar K.V 		return ERR_PTR(-ENOMEM);
1902d85f69b0SMike Kravetz 
1903d85f69b0SMike Kravetz 	/*
1904d85f69b0SMike Kravetz 	 * Processes that did not create the mapping will have no
1905d85f69b0SMike Kravetz 	 * reserves as indicated by the region/reserve map. Check
1906d85f69b0SMike Kravetz 	 * that the allocation will not exceed the subpool limit.
1907d85f69b0SMike Kravetz 	 * Allocations for MAP_NORESERVE mappings also need to be
1908d85f69b0SMike Kravetz 	 * checked against any subpool limit.
1909d85f69b0SMike Kravetz 	 */
1910d85f69b0SMike Kravetz 	if (map_chg || avoid_reserve) {
1911d85f69b0SMike Kravetz 		gbl_chg = hugepage_subpool_get_pages(spool, 1);
1912d85f69b0SMike Kravetz 		if (gbl_chg < 0) {
1913feba16e2SMike Kravetz 			vma_end_reservation(h, vma, addr);
191476dcee75SAneesh Kumar K.V 			return ERR_PTR(-ENOSPC);
19155e911373SMike Kravetz 		}
191690d8b7e6SAdam Litke 
1917d85f69b0SMike Kravetz 		/*
1918d85f69b0SMike Kravetz 		 * Even though there was no reservation in the region/reserve
1919d85f69b0SMike Kravetz 		 * map, there could be reservations associated with the
1920d85f69b0SMike Kravetz 		 * subpool that can be used.  This would be indicated if the
1921d85f69b0SMike Kravetz 		 * return value of hugepage_subpool_get_pages() is zero.
1922d85f69b0SMike Kravetz 		 * However, if avoid_reserve is specified we still avoid even
1923d85f69b0SMike Kravetz 		 * the subpool reservations.
1924d85f69b0SMike Kravetz 		 */
1925d85f69b0SMike Kravetz 		if (avoid_reserve)
1926d85f69b0SMike Kravetz 			gbl_chg = 1;
1927d85f69b0SMike Kravetz 	}
1928d85f69b0SMike Kravetz 
19296d76dcf4SAneesh Kumar K.V 	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
19308f34af6fSJianyu Zhan 	if (ret)
19318f34af6fSJianyu Zhan 		goto out_subpool_put;
19328f34af6fSJianyu Zhan 
1933a1e78772SMel Gorman 	spin_lock(&hugetlb_lock);
1934d85f69b0SMike Kravetz 	/*
1935d85f69b0SMike Kravetz 	 * gbl_chg is passed to indicate whether or not a page must be taken
1936d85f69b0SMike Kravetz 	 * from the global free pool (global change).  gbl_chg == 0 indicates
1937d85f69b0SMike Kravetz 	 * a reservation exists for the allocation.
1938d85f69b0SMike Kravetz 	 */
1939d85f69b0SMike Kravetz 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
194081a6fcaeSJoonsoo Kim 	if (!page) {
194194ae8ba7SAneesh Kumar K.V 		spin_unlock(&hugetlb_lock);
1942099730d6SDave Hansen 		page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
19438f34af6fSJianyu Zhan 		if (!page)
19448f34af6fSJianyu Zhan 			goto out_uncharge_cgroup;
1945a88c7695SNaoya Horiguchi 		if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
1946a88c7695SNaoya Horiguchi 			SetPagePrivate(page);
1947a88c7695SNaoya Horiguchi 			h->resv_huge_pages--;
1948a88c7695SNaoya Horiguchi 		}
194979dbb236SAneesh Kumar K.V 		spin_lock(&hugetlb_lock);
195079dbb236SAneesh Kumar K.V 		list_move(&page->lru, &h->hugepage_activelist);
195181a6fcaeSJoonsoo Kim 		/* Fall through */
1952a1e78772SMel Gorman 	}
195381a6fcaeSJoonsoo Kim 	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
195481a6fcaeSJoonsoo Kim 	spin_unlock(&hugetlb_lock);
1955a1e78772SMel Gorman 
195690481622SDavid Gibson 	set_page_private(page, (unsigned long)spool);
1957a1e78772SMel Gorman 
1958d85f69b0SMike Kravetz 	map_commit = vma_commit_reservation(h, vma, addr);
1959d85f69b0SMike Kravetz 	if (unlikely(map_chg > map_commit)) {
196033039678SMike Kravetz 		/*
196133039678SMike Kravetz 		 * The page was added to the reservation map between
196233039678SMike Kravetz 		 * vma_needs_reservation and vma_commit_reservation.
196333039678SMike Kravetz 		 * This indicates a race with hugetlb_reserve_pages.
196433039678SMike Kravetz 		 * Adjust for the subpool count incremented above AND
196533039678SMike Kravetz 		 * in hugetlb_reserve_pages for the same page.  Also,
196633039678SMike Kravetz 		 * the reservation count added in hugetlb_reserve_pages
196733039678SMike Kravetz 		 * no longer applies.
196833039678SMike Kravetz 		 */
196933039678SMike Kravetz 		long rsv_adjust;
197033039678SMike Kravetz 
197133039678SMike Kravetz 		rsv_adjust = hugepage_subpool_put_pages(spool, 1);
197233039678SMike Kravetz 		hugetlb_acct_memory(h, -rsv_adjust);
197333039678SMike Kravetz 	}
19747893d1d5SAdam Litke 	return page;
19758f34af6fSJianyu Zhan 
19768f34af6fSJianyu Zhan out_uncharge_cgroup:
19778f34af6fSJianyu Zhan 	hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
19788f34af6fSJianyu Zhan out_subpool_put:
1979d85f69b0SMike Kravetz 	if (map_chg || avoid_reserve)
19808f34af6fSJianyu Zhan 		hugepage_subpool_put_pages(spool, 1);
1981feba16e2SMike Kravetz 	vma_end_reservation(h, vma, addr);
19828f34af6fSJianyu Zhan 	return ERR_PTR(-ENOSPC);
1983b45b5bd6SDavid Gibson }
1984b45b5bd6SDavid Gibson 
198574060e4dSNaoya Horiguchi /*
198674060e4dSNaoya Horiguchi  * alloc_huge_page()'s wrapper which simply returns the page if allocation
198774060e4dSNaoya Horiguchi  * succeeds, otherwise NULL. This function is called from new_vma_page(),
198874060e4dSNaoya Horiguchi  * where no error value (ERR_PTR) is expected to be returned.
198974060e4dSNaoya Horiguchi  */
199074060e4dSNaoya Horiguchi struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
199174060e4dSNaoya Horiguchi 				unsigned long addr, int avoid_reserve)
199274060e4dSNaoya Horiguchi {
199374060e4dSNaoya Horiguchi 	struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
199474060e4dSNaoya Horiguchi 	if (IS_ERR(page))
199574060e4dSNaoya Horiguchi 		page = NULL;
199674060e4dSNaoya Horiguchi 	return page;
199774060e4dSNaoya Horiguchi }
199874060e4dSNaoya Horiguchi 
199991f47662SCyrill Gorcunov int __weak alloc_bootmem_huge_page(struct hstate *h)
2000aa888a74SAndi Kleen {
2001aa888a74SAndi Kleen 	struct huge_bootmem_page *m;
2002b2261026SJoonsoo Kim 	int nr_nodes, node;
2003aa888a74SAndi Kleen 
2004b2261026SJoonsoo Kim 	for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
2005aa888a74SAndi Kleen 		void *addr;
2006aa888a74SAndi Kleen 
20078b89a116SGrygorii Strashko 		addr = memblock_virt_alloc_try_nid_nopanic(
20088b89a116SGrygorii Strashko 				huge_page_size(h), huge_page_size(h),
20098b89a116SGrygorii Strashko 				0, BOOTMEM_ALLOC_ACCESSIBLE, node);
2010aa888a74SAndi Kleen 		if (addr) {
2011aa888a74SAndi Kleen 			/*
2012aa888a74SAndi Kleen 			 * Use the beginning of the huge page to store the
2013aa888a74SAndi Kleen 			 * huge_bootmem_page struct (until gather_bootmem
2014aa888a74SAndi Kleen 			 * puts them into the mem_map).
2015aa888a74SAndi Kleen 			 */
2016aa888a74SAndi Kleen 			m = addr;
2017aa888a74SAndi Kleen 			goto found;
2018aa888a74SAndi Kleen 		}
2019aa888a74SAndi Kleen 	}
2020aa888a74SAndi Kleen 	return 0;
2021aa888a74SAndi Kleen 
2022aa888a74SAndi Kleen found:
2023df994eadSLuiz Capitulino 	BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
2024aa888a74SAndi Kleen 	/* Put them into a private list first because mem_map is not up yet */
2025aa888a74SAndi Kleen 	list_add(&m->list, &huge_boot_pages);
2026aa888a74SAndi Kleen 	m->hstate = h;
2027aa888a74SAndi Kleen 	return 1;
2028aa888a74SAndi Kleen }
2029aa888a74SAndi Kleen 
2030d00181b9SKirill A. Shutemov static void __init prep_compound_huge_page(struct page *page,
2031d00181b9SKirill A. Shutemov 		unsigned int order)
203218229df5SAndy Whitcroft {
203318229df5SAndy Whitcroft 	if (unlikely(order > (MAX_ORDER - 1)))
203418229df5SAndy Whitcroft 		prep_compound_gigantic_page(page, order);
203518229df5SAndy Whitcroft 	else
203618229df5SAndy Whitcroft 		prep_compound_page(page, order);
203718229df5SAndy Whitcroft }
203818229df5SAndy Whitcroft 
2039aa888a74SAndi Kleen /* Put bootmem huge pages into the standard lists after mem_map is up */
2040aa888a74SAndi Kleen static void __init gather_bootmem_prealloc(void)
2041aa888a74SAndi Kleen {
2042aa888a74SAndi Kleen 	struct huge_bootmem_page *m;
2043aa888a74SAndi Kleen 
2044aa888a74SAndi Kleen 	list_for_each_entry(m, &huge_boot_pages, list) {
2045aa888a74SAndi Kleen 		struct hstate *h = m->hstate;
2046ee8f248dSBecky Bruce 		struct page *page;
2047ee8f248dSBecky Bruce 
2048ee8f248dSBecky Bruce #ifdef CONFIG_HIGHMEM
2049ee8f248dSBecky Bruce 		page = pfn_to_page(m->phys >> PAGE_SHIFT);
20508b89a116SGrygorii Strashko 		memblock_free_late(__pa(m),
2051ee8f248dSBecky Bruce 				   sizeof(struct huge_bootmem_page));
2052ee8f248dSBecky Bruce #else
2053ee8f248dSBecky Bruce 		page = virt_to_page(m);
2054ee8f248dSBecky Bruce #endif
2055aa888a74SAndi Kleen 		WARN_ON(page_count(page) != 1);
205618229df5SAndy Whitcroft 		prep_compound_huge_page(page, h->order);
2057ef5a22beSAndrea Arcangeli 		WARN_ON(PageReserved(page));
2058aa888a74SAndi Kleen 		prep_new_huge_page(h, page, page_to_nid(page));
2059b0320c7bSRafael Aquini 		/*
2060b0320c7bSRafael Aquini 		 * If we had gigantic hugepages allocated at boot time, we need
2061b0320c7bSRafael Aquini 		 * to restore the 'stolen' pages to totalram_pages in order to
2062b0320c7bSRafael Aquini 		 * fix confusing memory reports from free(1) and other
2063b0320c7bSRafael Aquini 		 * side-effects, like CommitLimit going negative.
2064b0320c7bSRafael Aquini 		 */
2065bae7f4aeSLuiz Capitulino 		if (hstate_is_gigantic(h))
20663dcc0571SJiang Liu 			adjust_managed_page_count(page, 1 << h->order);
2067aa888a74SAndi Kleen 	}
2068aa888a74SAndi Kleen }
2069aa888a74SAndi Kleen 
20708faa8b07SAndi Kleen static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
20711da177e4SLinus Torvalds {
20721da177e4SLinus Torvalds 	unsigned long i;
20731da177e4SLinus Torvalds 
2074e5ff2159SAndi Kleen 	for (i = 0; i < h->max_huge_pages; ++i) {
2075bae7f4aeSLuiz Capitulino 		if (hstate_is_gigantic(h)) {
2076aa888a74SAndi Kleen 			if (!alloc_bootmem_huge_page(h))
2077aa888a74SAndi Kleen 				break;
20789b5e5d0fSLee Schermerhorn 		} else if (!alloc_fresh_huge_page(h,
20798cebfcd0SLai Jiangshan 					 &node_states[N_MEMORY]))
20801da177e4SLinus Torvalds 			break;
20811da177e4SLinus Torvalds 	}
20828faa8b07SAndi Kleen 	h->max_huge_pages = i;
2083e5ff2159SAndi Kleen }
2084e5ff2159SAndi Kleen 
2085e5ff2159SAndi Kleen static void __init hugetlb_init_hstates(void)
2086e5ff2159SAndi Kleen {
2087e5ff2159SAndi Kleen 	struct hstate *h;
2088e5ff2159SAndi Kleen 
2089e5ff2159SAndi Kleen 	for_each_hstate(h) {
2090641844f5SNaoya Horiguchi 		if (minimum_order > huge_page_order(h))
2091641844f5SNaoya Horiguchi 			minimum_order = huge_page_order(h);
2092641844f5SNaoya Horiguchi 
20938faa8b07SAndi Kleen 		/* oversize hugepages were init'ed in early boot */
2094bae7f4aeSLuiz Capitulino 		if (!hstate_is_gigantic(h))
20958faa8b07SAndi Kleen 			hugetlb_hstate_alloc_pages(h);
2096e5ff2159SAndi Kleen 	}
2097641844f5SNaoya Horiguchi 	VM_BUG_ON(minimum_order == UINT_MAX);
2098e5ff2159SAndi Kleen }
2099e5ff2159SAndi Kleen 
21004abd32dbSAndi Kleen static char * __init memfmt(char *buf, unsigned long n)
21014abd32dbSAndi Kleen {
21024abd32dbSAndi Kleen 	if (n >= (1UL << 30))
21034abd32dbSAndi Kleen 		sprintf(buf, "%lu GB", n >> 30);
21044abd32dbSAndi Kleen 	else if (n >= (1UL << 20))
21054abd32dbSAndi Kleen 		sprintf(buf, "%lu MB", n >> 20);
21064abd32dbSAndi Kleen 	else
21074abd32dbSAndi Kleen 		sprintf(buf, "%lu KB", n >> 10);
21084abd32dbSAndi Kleen 	return buf;
21094abd32dbSAndi Kleen }
21104abd32dbSAndi Kleen 
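To make memfmt()'s thresholds concrete, the following standalone userspace copy of the same logic (illustration only, not kernel code) shows how a few representative sizes are rendered:

#include <stdio.h>

/* Userspace copy of the same threshold logic, for illustration only. */
static char *memfmt_demo(char *buf, unsigned long n)
{
	if (n >= (1UL << 30))
		sprintf(buf, "%lu GB", n >> 30);
	else if (n >= (1UL << 20))
		sprintf(buf, "%lu MB", n >> 20);
	else
		sprintf(buf, "%lu KB", n >> 10);
	return buf;
}

int main(void)
{
	char buf[32];
	unsigned long sizes[] = { 64UL << 10, 2UL << 20, 1UL << 30 };

	for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("%lu bytes -> \"%s\"\n", sizes[i], memfmt_demo(buf, sizes[i]));
	return 0;	/* prints 64 KB, 2 MB and 1 GB respectively */
}
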
2111e5ff2159SAndi Kleen static void __init report_hugepages(void)
2112e5ff2159SAndi Kleen {
2113e5ff2159SAndi Kleen 	struct hstate *h;
2114e5ff2159SAndi Kleen 
2115e5ff2159SAndi Kleen 	for_each_hstate(h) {
21164abd32dbSAndi Kleen 		char buf[32];
2117ffb22af5SAndrew Morton 		pr_info("HugeTLB registered %s page size, pre-allocated %lu pages\n",
21184abd32dbSAndi Kleen 			memfmt(buf, huge_page_size(h)),
21194abd32dbSAndi Kleen 			h->free_huge_pages);
2120e5ff2159SAndi Kleen 	}
2121e5ff2159SAndi Kleen }
2122e5ff2159SAndi Kleen 
21231da177e4SLinus Torvalds #ifdef CONFIG_HIGHMEM
21246ae11b27SLee Schermerhorn static void try_to_free_low(struct hstate *h, unsigned long count,
21256ae11b27SLee Schermerhorn 						nodemask_t *nodes_allowed)
21261da177e4SLinus Torvalds {
21274415cc8dSChristoph Lameter 	int i;
21284415cc8dSChristoph Lameter 
2129bae7f4aeSLuiz Capitulino 	if (hstate_is_gigantic(h))
2130aa888a74SAndi Kleen 		return;
2131aa888a74SAndi Kleen 
21326ae11b27SLee Schermerhorn 	for_each_node_mask(i, *nodes_allowed) {
21331da177e4SLinus Torvalds 		struct page *page, *next;
2134a5516438SAndi Kleen 		struct list_head *freel = &h->hugepage_freelists[i];
2135a5516438SAndi Kleen 		list_for_each_entry_safe(page, next, freel, lru) {
2136a5516438SAndi Kleen 			if (count >= h->nr_huge_pages)
21376b0c880dSAdam Litke 				return;
21381da177e4SLinus Torvalds 			if (PageHighMem(page))
21391da177e4SLinus Torvalds 				continue;
21401da177e4SLinus Torvalds 			list_del(&page->lru);
2141e5ff2159SAndi Kleen 			update_and_free_page(h, page);
2142a5516438SAndi Kleen 			h->free_huge_pages--;
2143a5516438SAndi Kleen 			h->free_huge_pages_node[page_to_nid(page)]--;
21441da177e4SLinus Torvalds 		}
21451da177e4SLinus Torvalds 	}
21461da177e4SLinus Torvalds }
21471da177e4SLinus Torvalds #else
21486ae11b27SLee Schermerhorn static inline void try_to_free_low(struct hstate *h, unsigned long count,
21496ae11b27SLee Schermerhorn 						nodemask_t *nodes_allowed)
21501da177e4SLinus Torvalds {
21511da177e4SLinus Torvalds }
21521da177e4SLinus Torvalds #endif
21531da177e4SLinus Torvalds 
215420a0307cSWu Fengguang /*
215520a0307cSWu Fengguang  * Increment or decrement surplus_huge_pages.  Keep node-specific counters
215620a0307cSWu Fengguang  * balanced by operating on them in a round-robin fashion.
215720a0307cSWu Fengguang  * Returns 1 if an adjustment was made.
215820a0307cSWu Fengguang  */
21596ae11b27SLee Schermerhorn static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
21606ae11b27SLee Schermerhorn 				int delta)
216120a0307cSWu Fengguang {
2162b2261026SJoonsoo Kim 	int nr_nodes, node;
216320a0307cSWu Fengguang 
216420a0307cSWu Fengguang 	VM_BUG_ON(delta != -1 && delta != 1);
216520a0307cSWu Fengguang 
2166e8c5c824SLee Schermerhorn 	if (delta < 0) {
2167b2261026SJoonsoo Kim 		for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
2168b2261026SJoonsoo Kim 			if (h->surplus_huge_pages_node[node])
2169b2261026SJoonsoo Kim 				goto found;
2170b2261026SJoonsoo Kim 		}
2171b2261026SJoonsoo Kim 	} else {
2172b2261026SJoonsoo Kim 		for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
2173b2261026SJoonsoo Kim 			if (h->surplus_huge_pages_node[node] <
2174b2261026SJoonsoo Kim 					h->nr_huge_pages_node[node])
2175b2261026SJoonsoo Kim 				goto found;
2176e8c5c824SLee Schermerhorn 		}
21779a76db09SLee Schermerhorn 	}
2178b2261026SJoonsoo Kim 	return 0;
217920a0307cSWu Fengguang 
2180b2261026SJoonsoo Kim found:
218120a0307cSWu Fengguang 	h->surplus_huge_pages += delta;
2182b2261026SJoonsoo Kim 	h->surplus_huge_pages_node[node] += delta;
2183b2261026SJoonsoo Kim 	return 1;
218420a0307cSWu Fengguang }
218520a0307cSWu Fengguang 
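The effect of the +1/-1 delta handled above is easiest to see on the counters themselves. Below is a minimal userspace sketch of the bookkeeping with made-up counter values; locking and the per-node round-robin walk are deliberately omitted:

#include <stdio.h>

int main(void)
{
	/* Hypothetical hstate counters: 10 pages total, 3 of them surplus. */
	unsigned long nr_huge_pages = 10, surplus_huge_pages = 3;

	/* delta == -1: one surplus page becomes a persistent pool page.   */
	surplus_huge_pages -= 1;
	printf("after delta -1: persistent = %lu (surplus %lu)\n",
	       nr_huge_pages - surplus_huge_pages, surplus_huge_pages);

	/* delta == +1: one persistent page is handed back to the surplus
	 * pool, so it can be freed once its current user drops it.        */
	surplus_huge_pages += 1;
	printf("after delta +1: persistent = %lu (surplus %lu)\n",
	       nr_huge_pages - surplus_huge_pages, surplus_huge_pages);
	return 0;
}
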
2186a5516438SAndi Kleen #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
21876ae11b27SLee Schermerhorn static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
21886ae11b27SLee Schermerhorn 						nodemask_t *nodes_allowed)
21891da177e4SLinus Torvalds {
21907893d1d5SAdam Litke 	unsigned long min_count, ret;
21911da177e4SLinus Torvalds 
2192944d9fecSLuiz Capitulino 	if (hstate_is_gigantic(h) && !gigantic_page_supported())
2193aa888a74SAndi Kleen 		return h->max_huge_pages;
2194aa888a74SAndi Kleen 
21957893d1d5SAdam Litke 	/*
21967893d1d5SAdam Litke 	 * Increase the pool size
21977893d1d5SAdam Litke 	 * First take pages out of surplus state.  Then make up the
21987893d1d5SAdam Litke 	 * remaining difference by allocating fresh huge pages.
2199d1c3fb1fSNishanth Aravamudan 	 *
2200d15c7c09SNaoya Horiguchi 	 * We might race with __alloc_buddy_huge_page() here and be unable
2201d1c3fb1fSNishanth Aravamudan 	 * to convert a surplus huge page to a normal huge page. That is
2202d1c3fb1fSNishanth Aravamudan 	 * not critical, though; it just means the overall size of the
2203d1c3fb1fSNishanth Aravamudan 	 * pool might be one hugepage larger than it needs to be, but
2204d1c3fb1fSNishanth Aravamudan 	 * within all the constraints specified by the sysctls.
22057893d1d5SAdam Litke 	 */
22061da177e4SLinus Torvalds 	spin_lock(&hugetlb_lock);
2207a5516438SAndi Kleen 	while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
22086ae11b27SLee Schermerhorn 		if (!adjust_pool_surplus(h, nodes_allowed, -1))
22097893d1d5SAdam Litke 			break;
22107893d1d5SAdam Litke 	}
22117893d1d5SAdam Litke 
2212a5516438SAndi Kleen 	while (count > persistent_huge_pages(h)) {
22137893d1d5SAdam Litke 		/*
22147893d1d5SAdam Litke 		 * If this allocation races such that we no longer need the
22157893d1d5SAdam Litke 		 * page, free_huge_page will handle it by freeing the page
22167893d1d5SAdam Litke 		 * and reducing the surplus.
22177893d1d5SAdam Litke 		 */
22187893d1d5SAdam Litke 		spin_unlock(&hugetlb_lock);
2219649920c6SJia He 
2220649920c6SJia He 		/* yield cpu to avoid soft lockup */
2221649920c6SJia He 		cond_resched();
2222649920c6SJia He 
2223944d9fecSLuiz Capitulino 		if (hstate_is_gigantic(h))
2224944d9fecSLuiz Capitulino 			ret = alloc_fresh_gigantic_page(h, nodes_allowed);
2225944d9fecSLuiz Capitulino 		else
22266ae11b27SLee Schermerhorn 			ret = alloc_fresh_huge_page(h, nodes_allowed);
22277893d1d5SAdam Litke 		spin_lock(&hugetlb_lock);
22287893d1d5SAdam Litke 		if (!ret)
22297893d1d5SAdam Litke 			goto out;
22307893d1d5SAdam Litke 
2231536240f2SMel Gorman 		/* Bail for signals. Probably ctrl-c from user */
2232536240f2SMel Gorman 		if (signal_pending(current))
2233536240f2SMel Gorman 			goto out;
22347893d1d5SAdam Litke 	}
22357893d1d5SAdam Litke 
22367893d1d5SAdam Litke 	/*
22377893d1d5SAdam Litke 	 * Decrease the pool size
22387893d1d5SAdam Litke 	 * First return free pages to the buddy allocator (being careful
22397893d1d5SAdam Litke 	 * to keep enough around to satisfy reservations).  Then place
22407893d1d5SAdam Litke 	 * pages into surplus state as needed so the pool will shrink
22417893d1d5SAdam Litke 	 * to the desired size as pages become free.
2242d1c3fb1fSNishanth Aravamudan 	 *
2243d1c3fb1fSNishanth Aravamudan 	 * By placing pages into the surplus state independent of the
2244d1c3fb1fSNishanth Aravamudan 	 * overcommit value, we are allowing the surplus pool size to
2245d1c3fb1fSNishanth Aravamudan 	 * exceed overcommit. There are few sane options here. Since
2246d15c7c09SNaoya Horiguchi 	 * __alloc_buddy_huge_page() is checking the global counter,
2247d1c3fb1fSNishanth Aravamudan 	 * though, we'll note that we're not allowed to exceed surplus
2248d1c3fb1fSNishanth Aravamudan 	 * and won't grow the pool anywhere else. Not until one of the
2249d1c3fb1fSNishanth Aravamudan 	 * sysctls are changed, or the surplus pages go out of use.
22507893d1d5SAdam Litke 	 */
2251a5516438SAndi Kleen 	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
22526b0c880dSAdam Litke 	min_count = max(count, min_count);
22536ae11b27SLee Schermerhorn 	try_to_free_low(h, min_count, nodes_allowed);
2254a5516438SAndi Kleen 	while (min_count < persistent_huge_pages(h)) {
22556ae11b27SLee Schermerhorn 		if (!free_pool_huge_page(h, nodes_allowed, 0))
22561da177e4SLinus Torvalds 			break;
225755f67141SMizuma, Masayoshi 		cond_resched_lock(&hugetlb_lock);
22581da177e4SLinus Torvalds 	}
2259a5516438SAndi Kleen 	while (count < persistent_huge_pages(h)) {
22606ae11b27SLee Schermerhorn 		if (!adjust_pool_surplus(h, nodes_allowed, 1))
22617893d1d5SAdam Litke 			break;
22627893d1d5SAdam Litke 	}
22637893d1d5SAdam Litke out:
2264a5516438SAndi Kleen 	ret = persistent_huge_pages(h);
22651da177e4SLinus Torvalds 	spin_unlock(&hugetlb_lock);
22667893d1d5SAdam Litke 	return ret;
22671da177e4SLinus Torvalds }
22681da177e4SLinus Torvalds 
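When shrinking, set_max_huge_pages() never frees pages that back in-use mappings or outstanding reservations. The floor it computes can be checked with a small standalone sketch; the counter values below are hypothetical:

#include <stdio.h>

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

int main(void)
{
	/* Hypothetical hstate counters. */
	unsigned long nr_huge_pages = 100, free_huge_pages = 40;
	unsigned long resv_huge_pages = 25, surplus_huge_pages = 0;
	unsigned long count = 10;	/* new value written to nr_hugepages */

	/* Same formula as set_max_huge_pages(): pages that are in use
	 * (nr - free) plus reserved pages can never be released.        */
	unsigned long min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;

	min_count = max_ul(count, min_count);
	printf("persistent pages   : %lu\n", nr_huge_pages - surplus_huge_pages);
	printf("requested pool size: %lu\n", count);
	printf("actual floor       : %lu\n", min_count);	/* 25 + 100 - 40 = 85 */
	return 0;
}
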
2269a3437870SNishanth Aravamudan #define HSTATE_ATTR_RO(_name) \
2270a3437870SNishanth Aravamudan 	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
2271a3437870SNishanth Aravamudan 
2272a3437870SNishanth Aravamudan #define HSTATE_ATTR(_name) \
2273a3437870SNishanth Aravamudan 	static struct kobj_attribute _name##_attr = \
2274a3437870SNishanth Aravamudan 		__ATTR(_name, 0644, _name##_show, _name##_store)
2275a3437870SNishanth Aravamudan 
2276a3437870SNishanth Aravamudan static struct kobject *hugepages_kobj;
2277a3437870SNishanth Aravamudan static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
2278a3437870SNishanth Aravamudan 
22799a305230SLee Schermerhorn static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
22809a305230SLee Schermerhorn 
22819a305230SLee Schermerhorn static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
2282a3437870SNishanth Aravamudan {
2283a3437870SNishanth Aravamudan 	int i;
22849a305230SLee Schermerhorn 
2285a3437870SNishanth Aravamudan 	for (i = 0; i < HUGE_MAX_HSTATE; i++)
22869a305230SLee Schermerhorn 		if (hstate_kobjs[i] == kobj) {
22879a305230SLee Schermerhorn 			if (nidp)
22889a305230SLee Schermerhorn 				*nidp = NUMA_NO_NODE;
2289a3437870SNishanth Aravamudan 			return &hstates[i];
22909a305230SLee Schermerhorn 		}
22919a305230SLee Schermerhorn 
22929a305230SLee Schermerhorn 	return kobj_to_node_hstate(kobj, nidp);
2293a3437870SNishanth Aravamudan }
2294a3437870SNishanth Aravamudan 
229506808b08SLee Schermerhorn static ssize_t nr_hugepages_show_common(struct kobject *kobj,
2296a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
2297a3437870SNishanth Aravamudan {
22989a305230SLee Schermerhorn 	struct hstate *h;
22999a305230SLee Schermerhorn 	unsigned long nr_huge_pages;
23009a305230SLee Schermerhorn 	int nid;
23019a305230SLee Schermerhorn 
23029a305230SLee Schermerhorn 	h = kobj_to_hstate(kobj, &nid);
23039a305230SLee Schermerhorn 	if (nid == NUMA_NO_NODE)
23049a305230SLee Schermerhorn 		nr_huge_pages = h->nr_huge_pages;
23059a305230SLee Schermerhorn 	else
23069a305230SLee Schermerhorn 		nr_huge_pages = h->nr_huge_pages_node[nid];
23079a305230SLee Schermerhorn 
23089a305230SLee Schermerhorn 	return sprintf(buf, "%lu\n", nr_huge_pages);
2309a3437870SNishanth Aravamudan }
2310adbe8726SEric B Munson 
2311238d3c13SDavid Rientjes static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
2312238d3c13SDavid Rientjes 					   struct hstate *h, int nid,
2313238d3c13SDavid Rientjes 					   unsigned long count, size_t len)
2314a3437870SNishanth Aravamudan {
2315a3437870SNishanth Aravamudan 	int err;
2316bad44b5bSDavid Rientjes 	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
2317a3437870SNishanth Aravamudan 
2318944d9fecSLuiz Capitulino 	if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
2319adbe8726SEric B Munson 		err = -EINVAL;
2320adbe8726SEric B Munson 		goto out;
2321adbe8726SEric B Munson 	}
2322adbe8726SEric B Munson 
23239a305230SLee Schermerhorn 	if (nid == NUMA_NO_NODE) {
23249a305230SLee Schermerhorn 		/*
23259a305230SLee Schermerhorn 		 * global hstate attribute
23269a305230SLee Schermerhorn 		 */
23279a305230SLee Schermerhorn 		if (!(obey_mempolicy &&
23289a305230SLee Schermerhorn 				init_nodemask_of_mempolicy(nodes_allowed))) {
232906808b08SLee Schermerhorn 			NODEMASK_FREE(nodes_allowed);
23308cebfcd0SLai Jiangshan 			nodes_allowed = &node_states[N_MEMORY];
233106808b08SLee Schermerhorn 		}
23329a305230SLee Schermerhorn 	} else if (nodes_allowed) {
23339a305230SLee Schermerhorn 		/*
23349a305230SLee Schermerhorn 		 * per node hstate attribute: adjust count to global,
23359a305230SLee Schermerhorn 		 * but restrict alloc/free to the specified node.
23369a305230SLee Schermerhorn 		 */
23379a305230SLee Schermerhorn 		count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
23389a305230SLee Schermerhorn 		init_nodemask_of_node(nodes_allowed, nid);
23399a305230SLee Schermerhorn 	} else
23408cebfcd0SLai Jiangshan 		nodes_allowed = &node_states[N_MEMORY];
23419a305230SLee Schermerhorn 
234206808b08SLee Schermerhorn 	h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
2343a3437870SNishanth Aravamudan 
23448cebfcd0SLai Jiangshan 	if (nodes_allowed != &node_states[N_MEMORY])
234506808b08SLee Schermerhorn 		NODEMASK_FREE(nodes_allowed);
234606808b08SLee Schermerhorn 
234706808b08SLee Schermerhorn 	return len;
2348adbe8726SEric B Munson out:
2349adbe8726SEric B Munson 	NODEMASK_FREE(nodes_allowed);
2350adbe8726SEric B Munson 	return err;
235106808b08SLee Schermerhorn }
235206808b08SLee Schermerhorn 
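For the per-node attribute case above, the value written by the user is first translated into a global target before set_max_huge_pages() runs. A small sketch of that arithmetic, with hypothetical numbers:

#include <stdio.h>

int main(void)
{
	/* Hypothetical state: 100 huge pages globally, 30 of them on node 1. */
	unsigned long nr_huge_pages = 100;
	unsigned long nr_huge_pages_node1 = 30;
	unsigned long count = 50;	/* echo 50 > .../node1/.../nr_hugepages */

	/* Same adjustment as __nr_hugepages_store_common(): keep the other
	 * nodes' pages and ask for 50 on node 1, i.e. a global target of
	 * 100 - 30 + 50 = 120, with alloc/free restricted to node 1.       */
	count += nr_huge_pages - nr_huge_pages_node1;
	printf("global target passed to set_max_huge_pages(): %lu\n", count);
	return 0;
}
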
2353238d3c13SDavid Rientjes static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
2354238d3c13SDavid Rientjes 					 struct kobject *kobj, const char *buf,
2355238d3c13SDavid Rientjes 					 size_t len)
2356238d3c13SDavid Rientjes {
2357238d3c13SDavid Rientjes 	struct hstate *h;
2358238d3c13SDavid Rientjes 	unsigned long count;
2359238d3c13SDavid Rientjes 	int nid;
2360238d3c13SDavid Rientjes 	int err;
2361238d3c13SDavid Rientjes 
2362238d3c13SDavid Rientjes 	err = kstrtoul(buf, 10, &count);
2363238d3c13SDavid Rientjes 	if (err)
2364238d3c13SDavid Rientjes 		return err;
2365238d3c13SDavid Rientjes 
2366238d3c13SDavid Rientjes 	h = kobj_to_hstate(kobj, &nid);
2367238d3c13SDavid Rientjes 	return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
2368238d3c13SDavid Rientjes }
2369238d3c13SDavid Rientjes 
237006808b08SLee Schermerhorn static ssize_t nr_hugepages_show(struct kobject *kobj,
237106808b08SLee Schermerhorn 				       struct kobj_attribute *attr, char *buf)
237206808b08SLee Schermerhorn {
237306808b08SLee Schermerhorn 	return nr_hugepages_show_common(kobj, attr, buf);
237406808b08SLee Schermerhorn }
237506808b08SLee Schermerhorn 
237606808b08SLee Schermerhorn static ssize_t nr_hugepages_store(struct kobject *kobj,
237706808b08SLee Schermerhorn 	       struct kobj_attribute *attr, const char *buf, size_t len)
237806808b08SLee Schermerhorn {
2379238d3c13SDavid Rientjes 	return nr_hugepages_store_common(false, kobj, buf, len);
2380a3437870SNishanth Aravamudan }
2381a3437870SNishanth Aravamudan HSTATE_ATTR(nr_hugepages);
2382a3437870SNishanth Aravamudan 
238306808b08SLee Schermerhorn #ifdef CONFIG_NUMA
238406808b08SLee Schermerhorn 
238506808b08SLee Schermerhorn /*
238606808b08SLee Schermerhorn  * hstate attribute for an optional mempolicy-based constraint on persistent
238706808b08SLee Schermerhorn  * huge page alloc/free.
238806808b08SLee Schermerhorn  */
238906808b08SLee Schermerhorn static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
239006808b08SLee Schermerhorn 				       struct kobj_attribute *attr, char *buf)
239106808b08SLee Schermerhorn {
239206808b08SLee Schermerhorn 	return nr_hugepages_show_common(kobj, attr, buf);
239306808b08SLee Schermerhorn }
239406808b08SLee Schermerhorn 
239506808b08SLee Schermerhorn static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
239606808b08SLee Schermerhorn 	       struct kobj_attribute *attr, const char *buf, size_t len)
239706808b08SLee Schermerhorn {
2398238d3c13SDavid Rientjes 	return nr_hugepages_store_common(true, kobj, buf, len);
239906808b08SLee Schermerhorn }
240006808b08SLee Schermerhorn HSTATE_ATTR(nr_hugepages_mempolicy);
240106808b08SLee Schermerhorn #endif
240206808b08SLee Schermerhorn 
240306808b08SLee Schermerhorn 
2404a3437870SNishanth Aravamudan static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
2405a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
2406a3437870SNishanth Aravamudan {
24079a305230SLee Schermerhorn 	struct hstate *h = kobj_to_hstate(kobj, NULL);
2408a3437870SNishanth Aravamudan 	return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
2409a3437870SNishanth Aravamudan }
2410adbe8726SEric B Munson 
2411a3437870SNishanth Aravamudan static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
2412a3437870SNishanth Aravamudan 		struct kobj_attribute *attr, const char *buf, size_t count)
2413a3437870SNishanth Aravamudan {
2414a3437870SNishanth Aravamudan 	int err;
2415a3437870SNishanth Aravamudan 	unsigned long input;
24169a305230SLee Schermerhorn 	struct hstate *h = kobj_to_hstate(kobj, NULL);
2417a3437870SNishanth Aravamudan 
2418bae7f4aeSLuiz Capitulino 	if (hstate_is_gigantic(h))
2419adbe8726SEric B Munson 		return -EINVAL;
2420adbe8726SEric B Munson 
24213dbb95f7SJingoo Han 	err = kstrtoul(buf, 10, &input);
2422a3437870SNishanth Aravamudan 	if (err)
242373ae31e5SEric B Munson 		return err;
2424a3437870SNishanth Aravamudan 
2425a3437870SNishanth Aravamudan 	spin_lock(&hugetlb_lock);
2426a3437870SNishanth Aravamudan 	h->nr_overcommit_huge_pages = input;
2427a3437870SNishanth Aravamudan 	spin_unlock(&hugetlb_lock);
2428a3437870SNishanth Aravamudan 
2429a3437870SNishanth Aravamudan 	return count;
2430a3437870SNishanth Aravamudan }
2431a3437870SNishanth Aravamudan HSTATE_ATTR(nr_overcommit_hugepages);
2432a3437870SNishanth Aravamudan 
2433a3437870SNishanth Aravamudan static ssize_t free_hugepages_show(struct kobject *kobj,
2434a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
2435a3437870SNishanth Aravamudan {
24369a305230SLee Schermerhorn 	struct hstate *h;
24379a305230SLee Schermerhorn 	unsigned long free_huge_pages;
24389a305230SLee Schermerhorn 	int nid;
24399a305230SLee Schermerhorn 
24409a305230SLee Schermerhorn 	h = kobj_to_hstate(kobj, &nid);
24419a305230SLee Schermerhorn 	if (nid == NUMA_NO_NODE)
24429a305230SLee Schermerhorn 		free_huge_pages = h->free_huge_pages;
24439a305230SLee Schermerhorn 	else
24449a305230SLee Schermerhorn 		free_huge_pages = h->free_huge_pages_node[nid];
24459a305230SLee Schermerhorn 
24469a305230SLee Schermerhorn 	return sprintf(buf, "%lu\n", free_huge_pages);
2447a3437870SNishanth Aravamudan }
2448a3437870SNishanth Aravamudan HSTATE_ATTR_RO(free_hugepages);
2449a3437870SNishanth Aravamudan 
2450a3437870SNishanth Aravamudan static ssize_t resv_hugepages_show(struct kobject *kobj,
2451a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
2452a3437870SNishanth Aravamudan {
24539a305230SLee Schermerhorn 	struct hstate *h = kobj_to_hstate(kobj, NULL);
2454a3437870SNishanth Aravamudan 	return sprintf(buf, "%lu\n", h->resv_huge_pages);
2455a3437870SNishanth Aravamudan }
2456a3437870SNishanth Aravamudan HSTATE_ATTR_RO(resv_hugepages);
2457a3437870SNishanth Aravamudan 
2458a3437870SNishanth Aravamudan static ssize_t surplus_hugepages_show(struct kobject *kobj,
2459a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
2460a3437870SNishanth Aravamudan {
24619a305230SLee Schermerhorn 	struct hstate *h;
24629a305230SLee Schermerhorn 	unsigned long surplus_huge_pages;
24639a305230SLee Schermerhorn 	int nid;
24649a305230SLee Schermerhorn 
24659a305230SLee Schermerhorn 	h = kobj_to_hstate(kobj, &nid);
24669a305230SLee Schermerhorn 	if (nid == NUMA_NO_NODE)
24679a305230SLee Schermerhorn 		surplus_huge_pages = h->surplus_huge_pages;
24689a305230SLee Schermerhorn 	else
24699a305230SLee Schermerhorn 		surplus_huge_pages = h->surplus_huge_pages_node[nid];
24709a305230SLee Schermerhorn 
24719a305230SLee Schermerhorn 	return sprintf(buf, "%lu\n", surplus_huge_pages);
2472a3437870SNishanth Aravamudan }
2473a3437870SNishanth Aravamudan HSTATE_ATTR_RO(surplus_hugepages);
2474a3437870SNishanth Aravamudan 
2475a3437870SNishanth Aravamudan static struct attribute *hstate_attrs[] = {
2476a3437870SNishanth Aravamudan 	&nr_hugepages_attr.attr,
2477a3437870SNishanth Aravamudan 	&nr_overcommit_hugepages_attr.attr,
2478a3437870SNishanth Aravamudan 	&free_hugepages_attr.attr,
2479a3437870SNishanth Aravamudan 	&resv_hugepages_attr.attr,
2480a3437870SNishanth Aravamudan 	&surplus_hugepages_attr.attr,
248106808b08SLee Schermerhorn #ifdef CONFIG_NUMA
248206808b08SLee Schermerhorn 	&nr_hugepages_mempolicy_attr.attr,
248306808b08SLee Schermerhorn #endif
2484a3437870SNishanth Aravamudan 	NULL,
2485a3437870SNishanth Aravamudan };
2486a3437870SNishanth Aravamudan 
2487a3437870SNishanth Aravamudan static struct attribute_group hstate_attr_group = {
2488a3437870SNishanth Aravamudan 	.attrs = hstate_attrs,
2489a3437870SNishanth Aravamudan };
2490a3437870SNishanth Aravamudan 
2491094e9539SJeff Mahoney static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
24929a305230SLee Schermerhorn 				    struct kobject **hstate_kobjs,
24939a305230SLee Schermerhorn 				    struct attribute_group *hstate_attr_group)
2494a3437870SNishanth Aravamudan {
2495a3437870SNishanth Aravamudan 	int retval;
2496972dc4deSAneesh Kumar K.V 	int hi = hstate_index(h);
2497a3437870SNishanth Aravamudan 
24989a305230SLee Schermerhorn 	hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
24999a305230SLee Schermerhorn 	if (!hstate_kobjs[hi])
2500a3437870SNishanth Aravamudan 		return -ENOMEM;
2501a3437870SNishanth Aravamudan 
25029a305230SLee Schermerhorn 	retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
2503a3437870SNishanth Aravamudan 	if (retval)
25049a305230SLee Schermerhorn 		kobject_put(hstate_kobjs[hi]);
2505a3437870SNishanth Aravamudan 
2506a3437870SNishanth Aravamudan 	return retval;
2507a3437870SNishanth Aravamudan }
2508a3437870SNishanth Aravamudan 
2509a3437870SNishanth Aravamudan static void __init hugetlb_sysfs_init(void)
2510a3437870SNishanth Aravamudan {
2511a3437870SNishanth Aravamudan 	struct hstate *h;
2512a3437870SNishanth Aravamudan 	int err;
2513a3437870SNishanth Aravamudan 
2514a3437870SNishanth Aravamudan 	hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
2515a3437870SNishanth Aravamudan 	if (!hugepages_kobj)
2516a3437870SNishanth Aravamudan 		return;
2517a3437870SNishanth Aravamudan 
2518a3437870SNishanth Aravamudan 	for_each_hstate(h) {
25199a305230SLee Schermerhorn 		err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
25209a305230SLee Schermerhorn 					 hstate_kobjs, &hstate_attr_group);
2521a3437870SNishanth Aravamudan 		if (err)
2522ffb22af5SAndrew Morton 			pr_err("Hugetlb: Unable to add hstate %s\n", h->name);
2523a3437870SNishanth Aravamudan 	}
2524a3437870SNishanth Aravamudan }
2525a3437870SNishanth Aravamudan 
25269a305230SLee Schermerhorn #ifdef CONFIG_NUMA
25279a305230SLee Schermerhorn 
25289a305230SLee Schermerhorn /*
25299a305230SLee Schermerhorn  * node_hstate/s - associate per node hstate attributes, via their kobjects,
253010fbcf4cSKay Sievers  * with node devices in node_devices[] using a parallel array.  The array
253110fbcf4cSKay Sievers  * index of a node device or node_hstate equals the node id.
253210fbcf4cSKay Sievers  * This is here to avoid any static dependency of the node device driver, in
25339a305230SLee Schermerhorn  * the base kernel, on the hugetlb module.
25349a305230SLee Schermerhorn  */
25359a305230SLee Schermerhorn struct node_hstate {
25369a305230SLee Schermerhorn 	struct kobject		*hugepages_kobj;
25379a305230SLee Schermerhorn 	struct kobject		*hstate_kobjs[HUGE_MAX_HSTATE];
25389a305230SLee Schermerhorn };
2539b4e289a6SAlexander Kuleshov static struct node_hstate node_hstates[MAX_NUMNODES];
25409a305230SLee Schermerhorn 
25419a305230SLee Schermerhorn /*
254210fbcf4cSKay Sievers  * A subset of global hstate attributes for node devices
25439a305230SLee Schermerhorn  */
25449a305230SLee Schermerhorn static struct attribute *per_node_hstate_attrs[] = {
25459a305230SLee Schermerhorn 	&nr_hugepages_attr.attr,
25469a305230SLee Schermerhorn 	&free_hugepages_attr.attr,
25479a305230SLee Schermerhorn 	&surplus_hugepages_attr.attr,
25489a305230SLee Schermerhorn 	NULL,
25499a305230SLee Schermerhorn };
25509a305230SLee Schermerhorn 
25519a305230SLee Schermerhorn static struct attribute_group per_node_hstate_attr_group = {
25529a305230SLee Schermerhorn 	.attrs = per_node_hstate_attrs,
25539a305230SLee Schermerhorn };
25549a305230SLee Schermerhorn 
25559a305230SLee Schermerhorn /*
255610fbcf4cSKay Sievers  * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
25579a305230SLee Schermerhorn  * Returns node id via non-NULL nidp.
25589a305230SLee Schermerhorn  */
25599a305230SLee Schermerhorn static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
25609a305230SLee Schermerhorn {
25619a305230SLee Schermerhorn 	int nid;
25629a305230SLee Schermerhorn 
25639a305230SLee Schermerhorn 	for (nid = 0; nid < nr_node_ids; nid++) {
25649a305230SLee Schermerhorn 		struct node_hstate *nhs = &node_hstates[nid];
25659a305230SLee Schermerhorn 		int i;
25669a305230SLee Schermerhorn 		for (i = 0; i < HUGE_MAX_HSTATE; i++)
25679a305230SLee Schermerhorn 			if (nhs->hstate_kobjs[i] == kobj) {
25689a305230SLee Schermerhorn 				if (nidp)
25699a305230SLee Schermerhorn 					*nidp = nid;
25709a305230SLee Schermerhorn 				return &hstates[i];
25719a305230SLee Schermerhorn 			}
25729a305230SLee Schermerhorn 	}
25739a305230SLee Schermerhorn 
25749a305230SLee Schermerhorn 	BUG();
25759a305230SLee Schermerhorn 	return NULL;
25769a305230SLee Schermerhorn }
25779a305230SLee Schermerhorn 
25789a305230SLee Schermerhorn /*
257910fbcf4cSKay Sievers  * Unregister hstate attributes from a single node device.
25809a305230SLee Schermerhorn  * No-op if no hstate attributes attached.
25819a305230SLee Schermerhorn  */
25823cd8b44fSClaudiu Ghioc static void hugetlb_unregister_node(struct node *node)
25839a305230SLee Schermerhorn {
25849a305230SLee Schermerhorn 	struct hstate *h;
258510fbcf4cSKay Sievers 	struct node_hstate *nhs = &node_hstates[node->dev.id];
25869a305230SLee Schermerhorn 
25879a305230SLee Schermerhorn 	if (!nhs->hugepages_kobj)
25889b5e5d0fSLee Schermerhorn 		return;		/* no hstate attributes */
25899a305230SLee Schermerhorn 
2590972dc4deSAneesh Kumar K.V 	for_each_hstate(h) {
2591972dc4deSAneesh Kumar K.V 		int idx = hstate_index(h);
2592972dc4deSAneesh Kumar K.V 		if (nhs->hstate_kobjs[idx]) {
2593972dc4deSAneesh Kumar K.V 			kobject_put(nhs->hstate_kobjs[idx]);
2594972dc4deSAneesh Kumar K.V 			nhs->hstate_kobjs[idx] = NULL;
2595972dc4deSAneesh Kumar K.V 		}
25969a305230SLee Schermerhorn 	}
25979a305230SLee Schermerhorn 
25989a305230SLee Schermerhorn 	kobject_put(nhs->hugepages_kobj);
25999a305230SLee Schermerhorn 	nhs->hugepages_kobj = NULL;
26009a305230SLee Schermerhorn }
26019a305230SLee Schermerhorn 
26029a305230SLee Schermerhorn 
26039a305230SLee Schermerhorn /*
260410fbcf4cSKay Sievers  * Register hstate attributes for a single node device.
26059a305230SLee Schermerhorn  * No-op if attributes already registered.
26069a305230SLee Schermerhorn  */
26073cd8b44fSClaudiu Ghioc static void hugetlb_register_node(struct node *node)
26089a305230SLee Schermerhorn {
26099a305230SLee Schermerhorn 	struct hstate *h;
261010fbcf4cSKay Sievers 	struct node_hstate *nhs = &node_hstates[node->dev.id];
26119a305230SLee Schermerhorn 	int err;
26129a305230SLee Schermerhorn 
26139a305230SLee Schermerhorn 	if (nhs->hugepages_kobj)
26149a305230SLee Schermerhorn 		return;		/* already allocated */
26159a305230SLee Schermerhorn 
26169a305230SLee Schermerhorn 	nhs->hugepages_kobj = kobject_create_and_add("hugepages",
261710fbcf4cSKay Sievers 							&node->dev.kobj);
26189a305230SLee Schermerhorn 	if (!nhs->hugepages_kobj)
26199a305230SLee Schermerhorn 		return;
26209a305230SLee Schermerhorn 
26219a305230SLee Schermerhorn 	for_each_hstate(h) {
26229a305230SLee Schermerhorn 		err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
26239a305230SLee Schermerhorn 						nhs->hstate_kobjs,
26249a305230SLee Schermerhorn 						&per_node_hstate_attr_group);
26259a305230SLee Schermerhorn 		if (err) {
2626ffb22af5SAndrew Morton 			pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
262710fbcf4cSKay Sievers 				h->name, node->dev.id);
26289a305230SLee Schermerhorn 			hugetlb_unregister_node(node);
26299a305230SLee Schermerhorn 			break;
26309a305230SLee Schermerhorn 		}
26319a305230SLee Schermerhorn 	}
26329a305230SLee Schermerhorn }
26339a305230SLee Schermerhorn 
26349a305230SLee Schermerhorn /*
26359b5e5d0fSLee Schermerhorn  * hugetlb init time:  register hstate attributes for all registered node
263610fbcf4cSKay Sievers  * devices of nodes that have memory.  All on-line nodes should have
263710fbcf4cSKay Sievers  * registered their associated device by this time.
26389a305230SLee Schermerhorn  */
26397d9ca000SLuiz Capitulino static void __init hugetlb_register_all_nodes(void)
26409a305230SLee Schermerhorn {
26419a305230SLee Schermerhorn 	int nid;
26429a305230SLee Schermerhorn 
26438cebfcd0SLai Jiangshan 	for_each_node_state(nid, N_MEMORY) {
26448732794bSWen Congyang 		struct node *node = node_devices[nid];
264510fbcf4cSKay Sievers 		if (node->dev.id == nid)
26469a305230SLee Schermerhorn 			hugetlb_register_node(node);
26479a305230SLee Schermerhorn 	}
26489a305230SLee Schermerhorn 
26499a305230SLee Schermerhorn 	/*
265010fbcf4cSKay Sievers 	 * Let the node device driver know we're here so it can
26519a305230SLee Schermerhorn 	 * [un]register hstate attributes on node hotplug.
26529a305230SLee Schermerhorn 	 */
26539a305230SLee Schermerhorn 	register_hugetlbfs_with_node(hugetlb_register_node,
26549a305230SLee Schermerhorn 				     hugetlb_unregister_node);
26559a305230SLee Schermerhorn }
26569a305230SLee Schermerhorn #else	/* !CONFIG_NUMA */
26579a305230SLee Schermerhorn 
26589a305230SLee Schermerhorn static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
26599a305230SLee Schermerhorn {
26609a305230SLee Schermerhorn 	BUG();
26619a305230SLee Schermerhorn 	if (nidp)
26629a305230SLee Schermerhorn 		*nidp = -1;
26639a305230SLee Schermerhorn 	return NULL;
26649a305230SLee Schermerhorn }
26659a305230SLee Schermerhorn 
26669a305230SLee Schermerhorn static void hugetlb_register_all_nodes(void) { }
26679a305230SLee Schermerhorn 
26689a305230SLee Schermerhorn #endif
26699a305230SLee Schermerhorn 
2670a3437870SNishanth Aravamudan static int __init hugetlb_init(void)
2671a3437870SNishanth Aravamudan {
26728382d914SDavidlohr Bueso 	int i;
26738382d914SDavidlohr Bueso 
2674457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
26750ef89d25SBenjamin Herrenschmidt 		return 0;
2676a3437870SNishanth Aravamudan 
2677e11bfbfcSNick Piggin 	if (!size_to_hstate(default_hstate_size)) {
2678e11bfbfcSNick Piggin 		default_hstate_size = HPAGE_SIZE;
2679e11bfbfcSNick Piggin 		if (!size_to_hstate(default_hstate_size))
2680a3437870SNishanth Aravamudan 			hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
2681a3437870SNishanth Aravamudan 	}
2682972dc4deSAneesh Kumar K.V 	default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
2683f8b74815SVaishali Thakkar 	if (default_hstate_max_huge_pages) {
2684f8b74815SVaishali Thakkar 		if (!default_hstate.max_huge_pages)
2685e11bfbfcSNick Piggin 			default_hstate.max_huge_pages = default_hstate_max_huge_pages;
2686f8b74815SVaishali Thakkar 	}
2687a3437870SNishanth Aravamudan 
2688a3437870SNishanth Aravamudan 	hugetlb_init_hstates();
2689aa888a74SAndi Kleen 	gather_bootmem_prealloc();
2690a3437870SNishanth Aravamudan 	report_hugepages();
2691a3437870SNishanth Aravamudan 
2692a3437870SNishanth Aravamudan 	hugetlb_sysfs_init();
26939a305230SLee Schermerhorn 	hugetlb_register_all_nodes();
26947179e7bfSJianguo Wu 	hugetlb_cgroup_file_init();
26959a305230SLee Schermerhorn 
26968382d914SDavidlohr Bueso #ifdef CONFIG_SMP
26978382d914SDavidlohr Bueso 	num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
26988382d914SDavidlohr Bueso #else
26998382d914SDavidlohr Bueso 	num_fault_mutexes = 1;
27008382d914SDavidlohr Bueso #endif
2701c672c7f2SMike Kravetz 	hugetlb_fault_mutex_table =
27028382d914SDavidlohr Bueso 		kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
2703c672c7f2SMike Kravetz 	BUG_ON(!hugetlb_fault_mutex_table);
27048382d914SDavidlohr Bueso 
27058382d914SDavidlohr Bueso 	for (i = 0; i < num_fault_mutexes; i++)
2706c672c7f2SMike Kravetz 		mutex_init(&hugetlb_fault_mutex_table[i]);
2707a3437870SNishanth Aravamudan 	return 0;
2708a3437870SNishanth Aravamudan }
27093e89e1c5SPaul Gortmaker subsys_initcall(hugetlb_init);
2710a3437870SNishanth Aravamudan 
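The fault-mutex table sized in hugetlb_init() scales with the CPU count and is rounded up to a power of two, which lets the fault hash be reduced with a mask rather than a division. A quick userspace check of the arithmetic; the CPU count is an assumption for the example:

#include <stdio.h>

/* Userspace stand-in for the kernel's roundup_pow_of_two(), for the demo. */
static unsigned long roundup_pow_of_two_demo(unsigned long n)
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	unsigned int num_possible_cpus = 6;	/* assumed CPU count */
	unsigned long num_fault_mutexes =
		roundup_pow_of_two_demo(8UL * num_possible_cpus);

	/* 8 * 6 = 48, rounded up to the next power of two: 64 mutexes. */
	printf("num_fault_mutexes = %lu\n", num_fault_mutexes);
	return 0;
}
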
2711a3437870SNishanth Aravamudan /* Should be called when processing an unsupported hugepagesz=... option */
27129fee021dSVaishali Thakkar void __init hugetlb_bad_size(void)
27139fee021dSVaishali Thakkar {
27149fee021dSVaishali Thakkar 	parsed_valid_hugepagesz = false;
27159fee021dSVaishali Thakkar }
27169fee021dSVaishali Thakkar 
2717d00181b9SKirill A. Shutemov void __init hugetlb_add_hstate(unsigned int order)
2718a3437870SNishanth Aravamudan {
2719a3437870SNishanth Aravamudan 	struct hstate *h;
27208faa8b07SAndi Kleen 	unsigned long i;
27218faa8b07SAndi Kleen 
2722a3437870SNishanth Aravamudan 	if (size_to_hstate(PAGE_SIZE << order)) {
2723598d8091SJoe Perches 		pr_warn("hugepagesz= specified twice, ignoring\n");
2724a3437870SNishanth Aravamudan 		return;
2725a3437870SNishanth Aravamudan 	}
272647d38344SAneesh Kumar K.V 	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
2727a3437870SNishanth Aravamudan 	BUG_ON(order == 0);
272847d38344SAneesh Kumar K.V 	h = &hstates[hugetlb_max_hstate++];
2729a3437870SNishanth Aravamudan 	h->order = order;
2730a3437870SNishanth Aravamudan 	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
27318faa8b07SAndi Kleen 	h->nr_huge_pages = 0;
27328faa8b07SAndi Kleen 	h->free_huge_pages = 0;
27338faa8b07SAndi Kleen 	for (i = 0; i < MAX_NUMNODES; ++i)
27348faa8b07SAndi Kleen 		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
27350edaecfaSAneesh Kumar K.V 	INIT_LIST_HEAD(&h->hugepage_activelist);
273654f18d35SAndrew Morton 	h->next_nid_to_alloc = first_memory_node;
273754f18d35SAndrew Morton 	h->next_nid_to_free = first_memory_node;
2738a3437870SNishanth Aravamudan 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
2739a3437870SNishanth Aravamudan 					huge_page_size(h)/1024);
27408faa8b07SAndi Kleen 
2741a3437870SNishanth Aravamudan 	parsed_hstate = h;
2742a3437870SNishanth Aravamudan }
2743a3437870SNishanth Aravamudan 
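The order-derived fields that hugetlb_add_hstate() fills in are easy to sanity-check with concrete numbers. A standalone sketch, assuming a 4 KB base page and order-9 (2 MB) huge pages:

#include <stdio.h>

int main(void)
{
	const unsigned int page_shift = 12;	/* assumed 4 KB base pages */
	const unsigned int order = 9;		/* 2 MB huge pages         */
	char name[32];

	unsigned long size = 1UL << (order + page_shift);
	unsigned long long mask = ~((1ULL << (order + page_shift)) - 1);

	snprintf(name, sizeof(name), "hugepages-%lukB", size / 1024);
	printf("huge_page_size = %lu bytes\n", size);	/* 2097152            */
	printf("h->mask        = 0x%llx\n", mask);	/* ~0x1fffff          */
	printf("h->name        = %s\n", name);		/* hugepages-2048kB   */
	return 0;
}
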
2744e11bfbfcSNick Piggin static int __init hugetlb_nrpages_setup(char *s)
2745a3437870SNishanth Aravamudan {
2746a3437870SNishanth Aravamudan 	unsigned long *mhp;
27478faa8b07SAndi Kleen 	static unsigned long *last_mhp;
2748a3437870SNishanth Aravamudan 
27499fee021dSVaishali Thakkar 	if (!parsed_valid_hugepagesz) {
27509fee021dSVaishali Thakkar 		pr_warn("hugepages = %s preceded by an unsupported hugepagesz, ignoring\n", s);
27529fee021dSVaishali Thakkar 		parsed_valid_hugepagesz = true;
27539fee021dSVaishali Thakkar 		return 1;
27549fee021dSVaishali Thakkar 	}
2755a3437870SNishanth Aravamudan 	/*
275647d38344SAneesh Kumar K.V 	 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
2757a3437870SNishanth Aravamudan 	 * so this hugepages= parameter goes to the "default hstate".
2758a3437870SNishanth Aravamudan 	 */
27599fee021dSVaishali Thakkar 	else if (!hugetlb_max_hstate)
2760a3437870SNishanth Aravamudan 		mhp = &default_hstate_max_huge_pages;
2761a3437870SNishanth Aravamudan 	else
2762a3437870SNishanth Aravamudan 		mhp = &parsed_hstate->max_huge_pages;
2763a3437870SNishanth Aravamudan 
27648faa8b07SAndi Kleen 	if (mhp == last_mhp) {
2765598d8091SJoe Perches 		pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n");
27668faa8b07SAndi Kleen 		return 1;
27678faa8b07SAndi Kleen 	}
27688faa8b07SAndi Kleen 
2769a3437870SNishanth Aravamudan 	if (sscanf(s, "%lu", mhp) <= 0)
2770a3437870SNishanth Aravamudan 		*mhp = 0;
2771a3437870SNishanth Aravamudan 
27728faa8b07SAndi Kleen 	/*
27738faa8b07SAndi Kleen 	 * Global state is always initialized later in hugetlb_init.
27748faa8b07SAndi Kleen 	 * But pages for >= MAX_ORDER hstates need to be allocated here early to still
27758faa8b07SAndi Kleen 	 * use the bootmem allocator.
27768faa8b07SAndi Kleen 	 */
277747d38344SAneesh Kumar K.V 	if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
27788faa8b07SAndi Kleen 		hugetlb_hstate_alloc_pages(parsed_hstate);
27798faa8b07SAndi Kleen 
27808faa8b07SAndi Kleen 	last_mhp = mhp;
27818faa8b07SAndi Kleen 
2782a3437870SNishanth Aravamudan 	return 1;
2783a3437870SNishanth Aravamudan }
2784e11bfbfcSNick Piggin __setup("hugepages=", hugetlb_nrpages_setup);
2785e11bfbfcSNick Piggin 
2786e11bfbfcSNick Piggin static int __init hugetlb_default_setup(char *s)
2787e11bfbfcSNick Piggin {
2788e11bfbfcSNick Piggin 	default_hstate_size = memparse(s, &s);
2789e11bfbfcSNick Piggin 	return 1;
2790e11bfbfcSNick Piggin }
2791e11bfbfcSNick Piggin __setup("default_hugepagesz=", hugetlb_default_setup);
2792a3437870SNishanth Aravamudan 
27938a213460SNishanth Aravamudan static unsigned int cpuset_mems_nr(unsigned int *array)
27948a213460SNishanth Aravamudan {
27958a213460SNishanth Aravamudan 	int node;
27968a213460SNishanth Aravamudan 	unsigned int nr = 0;
27978a213460SNishanth Aravamudan 
27988a213460SNishanth Aravamudan 	for_each_node_mask(node, cpuset_current_mems_allowed)
27998a213460SNishanth Aravamudan 		nr += array[node];
28008a213460SNishanth Aravamudan 
28018a213460SNishanth Aravamudan 	return nr;
28028a213460SNishanth Aravamudan }
28038a213460SNishanth Aravamudan 
28048a213460SNishanth Aravamudan #ifdef CONFIG_SYSCTL
280506808b08SLee Schermerhorn static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
280606808b08SLee Schermerhorn 			 struct ctl_table *table, int write,
280706808b08SLee Schermerhorn 			 void __user *buffer, size_t *length, loff_t *ppos)
28081da177e4SLinus Torvalds {
2809e5ff2159SAndi Kleen 	struct hstate *h = &default_hstate;
2810238d3c13SDavid Rientjes 	unsigned long tmp = h->max_huge_pages;
281108d4a246SMichal Hocko 	int ret;
2812e5ff2159SAndi Kleen 
2813457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
281486613628SJan Stancek 		return -EOPNOTSUPP;
2815457c1b27SNishanth Aravamudan 
2816e5ff2159SAndi Kleen 	table->data = &tmp;
2817e5ff2159SAndi Kleen 	table->maxlen = sizeof(unsigned long);
281808d4a246SMichal Hocko 	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
281908d4a246SMichal Hocko 	if (ret)
282008d4a246SMichal Hocko 		goto out;
2821e5ff2159SAndi Kleen 
2822238d3c13SDavid Rientjes 	if (write)
2823238d3c13SDavid Rientjes 		ret = __nr_hugepages_store_common(obey_mempolicy, h,
2824238d3c13SDavid Rientjes 						  NUMA_NO_NODE, tmp, *length);
282508d4a246SMichal Hocko out:
282608d4a246SMichal Hocko 	return ret;
28271da177e4SLinus Torvalds }
2828396faf03SMel Gorman 
282906808b08SLee Schermerhorn int hugetlb_sysctl_handler(struct ctl_table *table, int write,
283006808b08SLee Schermerhorn 			  void __user *buffer, size_t *length, loff_t *ppos)
283106808b08SLee Schermerhorn {
283206808b08SLee Schermerhorn 
283306808b08SLee Schermerhorn 	return hugetlb_sysctl_handler_common(false, table, write,
283406808b08SLee Schermerhorn 							buffer, length, ppos);
283506808b08SLee Schermerhorn }
283606808b08SLee Schermerhorn 
283706808b08SLee Schermerhorn #ifdef CONFIG_NUMA
283806808b08SLee Schermerhorn int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
283906808b08SLee Schermerhorn 			  void __user *buffer, size_t *length, loff_t *ppos)
284006808b08SLee Schermerhorn {
284106808b08SLee Schermerhorn 	return hugetlb_sysctl_handler_common(true, table, write,
284206808b08SLee Schermerhorn 							buffer, length, ppos);
284306808b08SLee Schermerhorn }
284406808b08SLee Schermerhorn #endif /* CONFIG_NUMA */
284506808b08SLee Schermerhorn 
2846a3d0c6aaSNishanth Aravamudan int hugetlb_overcommit_handler(struct ctl_table *table, int write,
28478d65af78SAlexey Dobriyan 			void __user *buffer,
2848a3d0c6aaSNishanth Aravamudan 			size_t *length, loff_t *ppos)
2849a3d0c6aaSNishanth Aravamudan {
2850a5516438SAndi Kleen 	struct hstate *h = &default_hstate;
2851e5ff2159SAndi Kleen 	unsigned long tmp;
285208d4a246SMichal Hocko 	int ret;
2853e5ff2159SAndi Kleen 
2854457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
285586613628SJan Stancek 		return -EOPNOTSUPP;
2856457c1b27SNishanth Aravamudan 
2857e5ff2159SAndi Kleen 	tmp = h->nr_overcommit_huge_pages;
2858e5ff2159SAndi Kleen 
2859bae7f4aeSLuiz Capitulino 	if (write && hstate_is_gigantic(h))
2860adbe8726SEric B Munson 		return -EINVAL;
2861adbe8726SEric B Munson 
2862e5ff2159SAndi Kleen 	table->data = &tmp;
2863e5ff2159SAndi Kleen 	table->maxlen = sizeof(unsigned long);
286408d4a246SMichal Hocko 	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
286508d4a246SMichal Hocko 	if (ret)
286608d4a246SMichal Hocko 		goto out;
2867e5ff2159SAndi Kleen 
2868e5ff2159SAndi Kleen 	if (write) {
2869064d9efeSNishanth Aravamudan 		spin_lock(&hugetlb_lock);
2870e5ff2159SAndi Kleen 		h->nr_overcommit_huge_pages = tmp;
2871a3d0c6aaSNishanth Aravamudan 		spin_unlock(&hugetlb_lock);
2872e5ff2159SAndi Kleen 	}
287308d4a246SMichal Hocko out:
287408d4a246SMichal Hocko 	return ret;
2875a3d0c6aaSNishanth Aravamudan }
2876a3d0c6aaSNishanth Aravamudan 
28771da177e4SLinus Torvalds #endif /* CONFIG_SYSCTL */
28781da177e4SLinus Torvalds 
2879e1759c21SAlexey Dobriyan void hugetlb_report_meminfo(struct seq_file *m)
28801da177e4SLinus Torvalds {
2881a5516438SAndi Kleen 	struct hstate *h = &default_hstate;
2882457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
2883457c1b27SNishanth Aravamudan 		return;
2884e1759c21SAlexey Dobriyan 	seq_printf(m,
28851da177e4SLinus Torvalds 			"HugePages_Total:   %5lu\n"
28861da177e4SLinus Torvalds 			"HugePages_Free:    %5lu\n"
2887b45b5bd6SDavid Gibson 			"HugePages_Rsvd:    %5lu\n"
28887893d1d5SAdam Litke 			"HugePages_Surp:    %5lu\n"
28894f98a2feSRik van Riel 			"Hugepagesize:   %8lu kB\n",
2890a5516438SAndi Kleen 			h->nr_huge_pages,
2891a5516438SAndi Kleen 			h->free_huge_pages,
2892a5516438SAndi Kleen 			h->resv_huge_pages,
2893a5516438SAndi Kleen 			h->surplus_huge_pages,
2894a5516438SAndi Kleen 			1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
28951da177e4SLinus Torvalds }
28961da177e4SLinus Torvalds 
28971da177e4SLinus Torvalds int hugetlb_report_node_meminfo(int nid, char *buf)
28981da177e4SLinus Torvalds {
2899a5516438SAndi Kleen 	struct hstate *h = &default_hstate;
2900457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
2901457c1b27SNishanth Aravamudan 		return 0;
29021da177e4SLinus Torvalds 	return sprintf(buf,
29031da177e4SLinus Torvalds 		"Node %d HugePages_Total: %5u\n"
2904a1de0919SNishanth Aravamudan 		"Node %d HugePages_Free:  %5u\n"
2905a1de0919SNishanth Aravamudan 		"Node %d HugePages_Surp:  %5u\n",
2906a5516438SAndi Kleen 		nid, h->nr_huge_pages_node[nid],
2907a5516438SAndi Kleen 		nid, h->free_huge_pages_node[nid],
2908a5516438SAndi Kleen 		nid, h->surplus_huge_pages_node[nid]);
29091da177e4SLinus Torvalds }
29101da177e4SLinus Torvalds 
2911949f7ec5SDavid Rientjes void hugetlb_show_meminfo(void)
2912949f7ec5SDavid Rientjes {
2913949f7ec5SDavid Rientjes 	struct hstate *h;
2914949f7ec5SDavid Rientjes 	int nid;
2915949f7ec5SDavid Rientjes 
2916457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
2917457c1b27SNishanth Aravamudan 		return;
2918457c1b27SNishanth Aravamudan 
2919949f7ec5SDavid Rientjes 	for_each_node_state(nid, N_MEMORY)
2920949f7ec5SDavid Rientjes 		for_each_hstate(h)
2921949f7ec5SDavid Rientjes 			pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
2922949f7ec5SDavid Rientjes 				nid,
2923949f7ec5SDavid Rientjes 				h->nr_huge_pages_node[nid],
2924949f7ec5SDavid Rientjes 				h->free_huge_pages_node[nid],
2925949f7ec5SDavid Rientjes 				h->surplus_huge_pages_node[nid],
2926949f7ec5SDavid Rientjes 				1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
2927949f7ec5SDavid Rientjes }
2928949f7ec5SDavid Rientjes 
29295d317b2bSNaoya Horiguchi void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
29305d317b2bSNaoya Horiguchi {
29315d317b2bSNaoya Horiguchi 	seq_printf(m, "HugetlbPages:\t%8lu kB\n",
29325d317b2bSNaoya Horiguchi 		   atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
29335d317b2bSNaoya Horiguchi }
29345d317b2bSNaoya Horiguchi 
29351da177e4SLinus Torvalds /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
29361da177e4SLinus Torvalds unsigned long hugetlb_total_pages(void)
29371da177e4SLinus Torvalds {
2938d0028588SWanpeng Li 	struct hstate *h;
2939d0028588SWanpeng Li 	unsigned long nr_total_pages = 0;
2940d0028588SWanpeng Li 
2941d0028588SWanpeng Li 	for_each_hstate(h)
2942d0028588SWanpeng Li 		nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
2943d0028588SWanpeng Li 	return nr_total_pages;
29441da177e4SLinus Torvalds }
29451da177e4SLinus Torvalds 
2946a5516438SAndi Kleen static int hugetlb_acct_memory(struct hstate *h, long delta)
2947fc1b8a73SMel Gorman {
2948fc1b8a73SMel Gorman 	int ret = -ENOMEM;
2949fc1b8a73SMel Gorman 
2950fc1b8a73SMel Gorman 	spin_lock(&hugetlb_lock);
2951fc1b8a73SMel Gorman 	/*
2952fc1b8a73SMel Gorman 	 * When cpuset is configured, it breaks the strict hugetlb page
2953fc1b8a73SMel Gorman 	 * reservation as the accounting is done on a global variable. Such
2954fc1b8a73SMel Gorman 	 * reservation is completely rubbish in the presence of cpuset because
2955fc1b8a73SMel Gorman 	 * the reservation is not checked against page availability for the
2956fc1b8a73SMel Gorman 	 * current cpuset. An application can still potentially be OOM'ed by
2957fc1b8a73SMel Gorman 	 * the kernel due to a lack of free hugetlb pages in the cpuset that
2958fc1b8a73SMel Gorman 	 * the task is in. Attempting to enforce strict accounting with
2959fc1b8a73SMel Gorman 	 * cpusets is almost impossible (or too ugly) because cpusets are so
2960fc1b8a73SMel Gorman 	 * fluid that tasks or memory nodes can be dynamically moved between them.
2961fc1b8a73SMel Gorman 	 *
2962fc1b8a73SMel Gorman 	 * The change of semantics for shared hugetlb mapping with cpuset is
2963fc1b8a73SMel Gorman 	 * undesirable. However, in order to preserve some of the semantics,
2964fc1b8a73SMel Gorman 	 * we fall back to checking against current free page availability as
2965fc1b8a73SMel Gorman 	 * a best-effort attempt, hopefully minimizing the impact of the
2966fc1b8a73SMel Gorman 	 * semantics change for cpuset users.
2967fc1b8a73SMel Gorman 	 */
2968fc1b8a73SMel Gorman 	if (delta > 0) {
2969a5516438SAndi Kleen 		if (gather_surplus_pages(h, delta) < 0)
2970fc1b8a73SMel Gorman 			goto out;
2971fc1b8a73SMel Gorman 
2972a5516438SAndi Kleen 		if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
2973a5516438SAndi Kleen 			return_unused_surplus_pages(h, delta);
2974fc1b8a73SMel Gorman 			goto out;
2975fc1b8a73SMel Gorman 		}
2976fc1b8a73SMel Gorman 	}
2977fc1b8a73SMel Gorman 
2978fc1b8a73SMel Gorman 	ret = 0;
2979fc1b8a73SMel Gorman 	if (delta < 0)
2980a5516438SAndi Kleen 		return_unused_surplus_pages(h, (unsigned long) -delta);
2981fc1b8a73SMel Gorman 
2982fc1b8a73SMel Gorman out:
2983fc1b8a73SMel Gorman 	spin_unlock(&hugetlb_lock);
2984fc1b8a73SMel Gorman 	return ret;
2985fc1b8a73SMel Gorman }
2986fc1b8a73SMel Gorman 
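The best-effort cpuset check above can be pictured with a couple of made-up numbers. A standalone sketch (the counters are hypothetical; in the kernel the free-page sum comes from cpuset_mems_nr()):

#include <stdio.h>

int main(void)
{
	/* Hypothetical request: reserve 5 more huge pages. */
	long delta = 5;

	/* Free huge pages on the nodes the current cpuset allows,
	 * i.e. what cpuset_mems_nr(h->free_huge_pages_node) would sum. */
	unsigned long cpuset_visible_free = 3;

	if (delta > 0 && (unsigned long)delta > cpuset_visible_free)
		printf("reservation of %ld pages backed out (only %lu visible)\n",
		       delta, cpuset_visible_free);
	else
		printf("reservation of %ld pages accepted\n", delta);
	return 0;
}
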
298784afd99bSAndy Whitcroft static void hugetlb_vm_op_open(struct vm_area_struct *vma)
298884afd99bSAndy Whitcroft {
2989f522c3acSJoonsoo Kim 	struct resv_map *resv = vma_resv_map(vma);
299084afd99bSAndy Whitcroft 
299184afd99bSAndy Whitcroft 	/*
299284afd99bSAndy Whitcroft 	 * This new VMA should share its sibling's reservation map if present.
299384afd99bSAndy Whitcroft 	 * The VMA will only ever have a valid reservation map pointer where
299484afd99bSAndy Whitcroft 	 * it is being copied for another still existing VMA.  As that VMA
299525985edcSLucas De Marchi 	 * has a reference to the reservation map it cannot disappear until
299684afd99bSAndy Whitcroft 	 * after this open call completes.  It is therefore safe to take a
299784afd99bSAndy Whitcroft 	 * new reference here without additional locking.
299884afd99bSAndy Whitcroft 	 */
29994e35f483SJoonsoo Kim 	if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
3000f522c3acSJoonsoo Kim 		kref_get(&resv->refs);
300184afd99bSAndy Whitcroft }
300284afd99bSAndy Whitcroft 
3003a1e78772SMel Gorman static void hugetlb_vm_op_close(struct vm_area_struct *vma)
3004a1e78772SMel Gorman {
3005a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
3006f522c3acSJoonsoo Kim 	struct resv_map *resv = vma_resv_map(vma);
300790481622SDavid Gibson 	struct hugepage_subpool *spool = subpool_vma(vma);
30084e35f483SJoonsoo Kim 	unsigned long reserve, start, end;
30091c5ecae3SMike Kravetz 	long gbl_reserve;
301084afd99bSAndy Whitcroft 
30114e35f483SJoonsoo Kim 	if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
30124e35f483SJoonsoo Kim 		return;
30134e35f483SJoonsoo Kim 
3014a5516438SAndi Kleen 	start = vma_hugecache_offset(h, vma, vma->vm_start);
3015a5516438SAndi Kleen 	end = vma_hugecache_offset(h, vma, vma->vm_end);
301684afd99bSAndy Whitcroft 
30174e35f483SJoonsoo Kim 	reserve = (end - start) - region_count(resv, start, end);
301884afd99bSAndy Whitcroft 
3019f031dd27SJoonsoo Kim 	kref_put(&resv->refs, resv_map_release);
302084afd99bSAndy Whitcroft 
30217251ff78SAdam Litke 	if (reserve) {
30221c5ecae3SMike Kravetz 		/*
30231c5ecae3SMike Kravetz 		 * Decrement reserve counts.  The global reserve count may be
30241c5ecae3SMike Kravetz 		 * adjusted if the subpool has a minimum size.
30251c5ecae3SMike Kravetz 		 */
30261c5ecae3SMike Kravetz 		gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
30271c5ecae3SMike Kravetz 		hugetlb_acct_memory(h, -gbl_reserve);
30287251ff78SAdam Litke 	}
3029a1e78772SMel Gorman }
3030a1e78772SMel Gorman 
30311da177e4SLinus Torvalds /*
30321da177e4SLinus Torvalds  * We cannot handle pagefaults against hugetlb pages at all.  They cause
30331da177e4SLinus Torvalds  * handle_mm_fault() to try to instantiate regular-sized pages in the
30341da177e4SLinus Torvalds  * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
30351da177e4SLinus Torvalds  * this far.
30361da177e4SLinus Torvalds  */
3037d0217ac0SNick Piggin static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
30381da177e4SLinus Torvalds {
30391da177e4SLinus Torvalds 	BUG();
3040d0217ac0SNick Piggin 	return 0;
30411da177e4SLinus Torvalds }
30421da177e4SLinus Torvalds 
3043f0f37e2fSAlexey Dobriyan const struct vm_operations_struct hugetlb_vm_ops = {
3044d0217ac0SNick Piggin 	.fault = hugetlb_vm_op_fault,
304584afd99bSAndy Whitcroft 	.open = hugetlb_vm_op_open,
3046a1e78772SMel Gorman 	.close = hugetlb_vm_op_close,
30471da177e4SLinus Torvalds };
30481da177e4SLinus Torvalds 
30491e8f889bSDavid Gibson static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
30501e8f889bSDavid Gibson 				int writable)
305163551ae0SDavid Gibson {
305263551ae0SDavid Gibson 	pte_t entry;
305363551ae0SDavid Gibson 
30541e8f889bSDavid Gibson 	if (writable) {
3055106c992aSGerald Schaefer 		entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
3056106c992aSGerald Schaefer 					 vma->vm_page_prot)));
305763551ae0SDavid Gibson 	} else {
3058106c992aSGerald Schaefer 		entry = huge_pte_wrprotect(mk_huge_pte(page,
3059106c992aSGerald Schaefer 					   vma->vm_page_prot));
306063551ae0SDavid Gibson 	}
306163551ae0SDavid Gibson 	entry = pte_mkyoung(entry);
306263551ae0SDavid Gibson 	entry = pte_mkhuge(entry);
3063d9ed9faaSChris Metcalf 	entry = arch_make_huge_pte(entry, vma, page, writable);
306463551ae0SDavid Gibson 
306563551ae0SDavid Gibson 	return entry;
306663551ae0SDavid Gibson }
306763551ae0SDavid Gibson 
30681e8f889bSDavid Gibson static void set_huge_ptep_writable(struct vm_area_struct *vma,
30691e8f889bSDavid Gibson 				   unsigned long address, pte_t *ptep)
30701e8f889bSDavid Gibson {
30711e8f889bSDavid Gibson 	pte_t entry;
30721e8f889bSDavid Gibson 
3073106c992aSGerald Schaefer 	entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
307432f84528SChris Forbes 	if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
30754b3073e1SRussell King 		update_mmu_cache(vma, address, ptep);
30761e8f889bSDavid Gibson }
30771e8f889bSDavid Gibson 
30784a705fefSNaoya Horiguchi static int is_hugetlb_entry_migration(pte_t pte)
30794a705fefSNaoya Horiguchi {
30804a705fefSNaoya Horiguchi 	swp_entry_t swp;
30814a705fefSNaoya Horiguchi 
30824a705fefSNaoya Horiguchi 	if (huge_pte_none(pte) || pte_present(pte))
30834a705fefSNaoya Horiguchi 		return 0;
30844a705fefSNaoya Horiguchi 	swp = pte_to_swp_entry(pte);
30854a705fefSNaoya Horiguchi 	if (non_swap_entry(swp) && is_migration_entry(swp))
30864a705fefSNaoya Horiguchi 		return 1;
30874a705fefSNaoya Horiguchi 	else
30884a705fefSNaoya Horiguchi 		return 0;
30894a705fefSNaoya Horiguchi }
30904a705fefSNaoya Horiguchi 
30914a705fefSNaoya Horiguchi static int is_hugetlb_entry_hwpoisoned(pte_t pte)
30924a705fefSNaoya Horiguchi {
30934a705fefSNaoya Horiguchi 	swp_entry_t swp;
30944a705fefSNaoya Horiguchi 
30954a705fefSNaoya Horiguchi 	if (huge_pte_none(pte) || pte_present(pte))
30964a705fefSNaoya Horiguchi 		return 0;
30974a705fefSNaoya Horiguchi 	swp = pte_to_swp_entry(pte);
30984a705fefSNaoya Horiguchi 	if (non_swap_entry(swp) && is_hwpoison_entry(swp))
30994a705fefSNaoya Horiguchi 		return 1;
31004a705fefSNaoya Horiguchi 	else
31014a705fefSNaoya Horiguchi 		return 0;
31024a705fefSNaoya Horiguchi }
31031e8f889bSDavid Gibson 
310463551ae0SDavid Gibson int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
310563551ae0SDavid Gibson 			    struct vm_area_struct *vma)
310663551ae0SDavid Gibson {
310763551ae0SDavid Gibson 	pte_t *src_pte, *dst_pte, entry;
310863551ae0SDavid Gibson 	struct page *ptepage;
31091c59827dSHugh Dickins 	unsigned long addr;
31101e8f889bSDavid Gibson 	int cow;
3111a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
3112a5516438SAndi Kleen 	unsigned long sz = huge_page_size(h);
3113e8569dd2SAndreas Sandberg 	unsigned long mmun_start;	/* For mmu_notifiers */
3114e8569dd2SAndreas Sandberg 	unsigned long mmun_end;		/* For mmu_notifiers */
3115e8569dd2SAndreas Sandberg 	int ret = 0;
31161e8f889bSDavid Gibson 
31171e8f889bSDavid Gibson 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
311863551ae0SDavid Gibson 
3119e8569dd2SAndreas Sandberg 	mmun_start = vma->vm_start;
3120e8569dd2SAndreas Sandberg 	mmun_end = vma->vm_end;
3121e8569dd2SAndreas Sandberg 	if (cow)
3122e8569dd2SAndreas Sandberg 		mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
3123e8569dd2SAndreas Sandberg 
3124a5516438SAndi Kleen 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
3125cb900f41SKirill A. Shutemov 		spinlock_t *src_ptl, *dst_ptl;
3126c74df32cSHugh Dickins 		src_pte = huge_pte_offset(src, addr);
3127c74df32cSHugh Dickins 		if (!src_pte)
3128c74df32cSHugh Dickins 			continue;
3129a5516438SAndi Kleen 		dst_pte = huge_pte_alloc(dst, addr, sz);
3130e8569dd2SAndreas Sandberg 		if (!dst_pte) {
3131e8569dd2SAndreas Sandberg 			ret = -ENOMEM;
3132e8569dd2SAndreas Sandberg 			break;
3133e8569dd2SAndreas Sandberg 		}
3134c5c99429SLarry Woodman 
3135c5c99429SLarry Woodman 		/* If the pagetables are shared don't copy or take references */
3136c5c99429SLarry Woodman 		if (dst_pte == src_pte)
3137c5c99429SLarry Woodman 			continue;
3138c5c99429SLarry Woodman 
3139cb900f41SKirill A. Shutemov 		dst_ptl = huge_pte_lock(h, dst, dst_pte);
3140cb900f41SKirill A. Shutemov 		src_ptl = huge_pte_lockptr(h, src, src_pte);
3141cb900f41SKirill A. Shutemov 		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
31424a705fefSNaoya Horiguchi 		entry = huge_ptep_get(src_pte);
31434a705fefSNaoya Horiguchi 		if (huge_pte_none(entry)) { /* skip none entry */
31444a705fefSNaoya Horiguchi 			;
31454a705fefSNaoya Horiguchi 		} else if (unlikely(is_hugetlb_entry_migration(entry) ||
31464a705fefSNaoya Horiguchi 				    is_hugetlb_entry_hwpoisoned(entry))) {
31474a705fefSNaoya Horiguchi 			swp_entry_t swp_entry = pte_to_swp_entry(entry);
31484a705fefSNaoya Horiguchi 
31494a705fefSNaoya Horiguchi 			if (is_write_migration_entry(swp_entry) && cow) {
31504a705fefSNaoya Horiguchi 				/*
31514a705fefSNaoya Horiguchi 				 * COW mappings require pages in both
31524a705fefSNaoya Horiguchi 				 * parent and child to be set to read.
31534a705fefSNaoya Horiguchi 				 */
31544a705fefSNaoya Horiguchi 				make_migration_entry_read(&swp_entry);
31554a705fefSNaoya Horiguchi 				entry = swp_entry_to_pte(swp_entry);
31564a705fefSNaoya Horiguchi 				set_huge_pte_at(src, addr, src_pte, entry);
31574a705fefSNaoya Horiguchi 			}
31584a705fefSNaoya Horiguchi 			set_huge_pte_at(dst, addr, dst_pte, entry);
31594a705fefSNaoya Horiguchi 		} else {
316034ee645eSJoerg Roedel 			if (cow) {
31617f2e9525SGerald Schaefer 				huge_ptep_set_wrprotect(src, addr, src_pte);
316234ee645eSJoerg Roedel 				mmu_notifier_invalidate_range(src, mmun_start,
316334ee645eSJoerg Roedel 								   mmun_end);
316434ee645eSJoerg Roedel 			}
31650253d634SNaoya Horiguchi 			entry = huge_ptep_get(src_pte);
316663551ae0SDavid Gibson 			ptepage = pte_page(entry);
316763551ae0SDavid Gibson 			get_page(ptepage);
316853f9263bSKirill A. Shutemov 			page_dup_rmap(ptepage, true);
316963551ae0SDavid Gibson 			set_huge_pte_at(dst, addr, dst_pte, entry);
31705d317b2bSNaoya Horiguchi 			hugetlb_count_add(pages_per_huge_page(h), dst);
31711c59827dSHugh Dickins 		}
3172cb900f41SKirill A. Shutemov 		spin_unlock(src_ptl);
3173cb900f41SKirill A. Shutemov 		spin_unlock(dst_ptl);
317463551ae0SDavid Gibson 	}
317563551ae0SDavid Gibson 
3176e8569dd2SAndreas Sandberg 	if (cow)
3177e8569dd2SAndreas Sandberg 		mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
3178e8569dd2SAndreas Sandberg 
3179e8569dd2SAndreas Sandberg 	return ret;
318063551ae0SDavid Gibson }
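/*
 * Note on the copy loop above: the destination page table lock is taken
 * first and the source lock is nested under it (SINGLE_DEPTH_NESTING),
 * shared page tables (dst_pte == src_pte) are skipped entirely, and for
 * private (COW) mappings the parent pte is write-protected before the
 * copy is installed in the child so that either side faults on its
 * first write.
 */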
318163551ae0SDavid Gibson 
318224669e58SAneesh Kumar K.V void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
318324669e58SAneesh Kumar K.V 			    unsigned long start, unsigned long end,
318424669e58SAneesh Kumar K.V 			    struct page *ref_page)
318563551ae0SDavid Gibson {
318663551ae0SDavid Gibson 	struct mm_struct *mm = vma->vm_mm;
318763551ae0SDavid Gibson 	unsigned long address;
3188c7546f8fSDavid Gibson 	pte_t *ptep;
318963551ae0SDavid Gibson 	pte_t pte;
3190cb900f41SKirill A. Shutemov 	spinlock_t *ptl;
319163551ae0SDavid Gibson 	struct page *page;
3192a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
3193a5516438SAndi Kleen 	unsigned long sz = huge_page_size(h);
31942ec74c3eSSagi Grimberg 	const unsigned long mmun_start = start;	/* For mmu_notifiers */
31952ec74c3eSSagi Grimberg 	const unsigned long mmun_end   = end;	/* For mmu_notifiers */
3196a5516438SAndi Kleen 
319763551ae0SDavid Gibson 	WARN_ON(!is_vm_hugetlb_page(vma));
3198a5516438SAndi Kleen 	BUG_ON(start & ~huge_page_mask(h));
3199a5516438SAndi Kleen 	BUG_ON(end & ~huge_page_mask(h));
320063551ae0SDavid Gibson 
320124669e58SAneesh Kumar K.V 	tlb_start_vma(tlb, vma);
32022ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
3203569f48b8SHillf Danton 	address = start;
3204569f48b8SHillf Danton 	for (; address < end; address += sz) {
3205c7546f8fSDavid Gibson 		ptep = huge_pte_offset(mm, address);
3206c7546f8fSDavid Gibson 		if (!ptep)
3207c7546f8fSDavid Gibson 			continue;
3208c7546f8fSDavid Gibson 
3209cb900f41SKirill A. Shutemov 		ptl = huge_pte_lock(h, mm, ptep);
321031d49da5SAneesh Kumar K.V 		if (huge_pmd_unshare(mm, &address, ptep)) {
321131d49da5SAneesh Kumar K.V 			spin_unlock(ptl);
321231d49da5SAneesh Kumar K.V 			continue;
321331d49da5SAneesh Kumar K.V 		}
321439dde65cSChen, Kenneth W 
32156629326bSHillf Danton 		pte = huge_ptep_get(ptep);
321631d49da5SAneesh Kumar K.V 		if (huge_pte_none(pte)) {
321731d49da5SAneesh Kumar K.V 			spin_unlock(ptl);
321831d49da5SAneesh Kumar K.V 			continue;
321931d49da5SAneesh Kumar K.V 		}
32206629326bSHillf Danton 
32216629326bSHillf Danton 		/*
32229fbc1f63SNaoya Horiguchi 		 * A migrating or HWPoisoned hugepage is already unmapped and
32239fbc1f63SNaoya Horiguchi 		 * its refcount has been dropped, so just clear the pte here.
32246629326bSHillf Danton 		 */
32259fbc1f63SNaoya Horiguchi 		if (unlikely(!pte_present(pte))) {
3226106c992aSGerald Schaefer 			huge_pte_clear(mm, address, ptep);
322731d49da5SAneesh Kumar K.V 			spin_unlock(ptl);
322831d49da5SAneesh Kumar K.V 			continue;
32298c4894c6SNaoya Horiguchi 		}
32306629326bSHillf Danton 
32316629326bSHillf Danton 		page = pte_page(pte);
323204f2cbe3SMel Gorman 		/*
323304f2cbe3SMel Gorman 		 * If a reference page is supplied, it is because a specific
323404f2cbe3SMel Gorman 		 * page is being unmapped, not a range. Ensure the page we
323504f2cbe3SMel Gorman 		 * are about to unmap is the actual page of interest.
323604f2cbe3SMel Gorman 		 */
323704f2cbe3SMel Gorman 		if (ref_page) {
323831d49da5SAneesh Kumar K.V 			if (page != ref_page) {
323931d49da5SAneesh Kumar K.V 				spin_unlock(ptl);
324031d49da5SAneesh Kumar K.V 				continue;
324131d49da5SAneesh Kumar K.V 			}
324204f2cbe3SMel Gorman 			/*
324304f2cbe3SMel Gorman 			 * Mark the VMA as having unmapped its page so that
324404f2cbe3SMel Gorman 			 * future faults in this VMA will fail rather than
324504f2cbe3SMel Gorman 			 * looking like data was lost
324604f2cbe3SMel Gorman 			 */
324704f2cbe3SMel Gorman 			set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
324804f2cbe3SMel Gorman 		}
324904f2cbe3SMel Gorman 
3250c7546f8fSDavid Gibson 		pte = huge_ptep_get_and_clear(mm, address, ptep);
325124669e58SAneesh Kumar K.V 		tlb_remove_tlb_entry(tlb, ptep, address);
3252106c992aSGerald Schaefer 		if (huge_pte_dirty(pte))
32536649a386SKen Chen 			set_page_dirty(page);
32549e81130bSHillf Danton 
32555d317b2bSNaoya Horiguchi 		hugetlb_count_sub(pages_per_huge_page(h), mm);
3256d281ee61SKirill A. Shutemov 		page_remove_rmap(page, true);
325731d49da5SAneesh Kumar K.V 
3258cb900f41SKirill A. Shutemov 		spin_unlock(ptl);
3259e77b0852SAneesh Kumar K.V 		tlb_remove_page_size(tlb, page, huge_page_size(h));
326024669e58SAneesh Kumar K.V 		/*
326131d49da5SAneesh Kumar K.V 		 * Bail out after unmapping reference page if supplied
326224669e58SAneesh Kumar K.V 		 */
326331d49da5SAneesh Kumar K.V 		if (ref_page)
326431d49da5SAneesh Kumar K.V 			break;
3265fe1668aeSChen, Kenneth W 	}
32662ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
326724669e58SAneesh Kumar K.V 	tlb_end_vma(tlb, vma);
32681da177e4SLinus Torvalds }
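/*
 * Note on the teardown loop above: each pte is handled under its own
 * page table lock, shared pmds are detached via huge_pmd_unshare()
 * without touching the underlying pages, and when ref_page is supplied
 * only that single page is unmapped before bailing out of the walk.
 */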
326963551ae0SDavid Gibson 
3270d833352aSMel Gorman void __unmap_hugepage_range_final(struct mmu_gather *tlb,
3271d833352aSMel Gorman 			  struct vm_area_struct *vma, unsigned long start,
3272d833352aSMel Gorman 			  unsigned long end, struct page *ref_page)
3273d833352aSMel Gorman {
3274d833352aSMel Gorman 	__unmap_hugepage_range(tlb, vma, start, end, ref_page);
3275d833352aSMel Gorman 
3276d833352aSMel Gorman 	/*
3277d833352aSMel Gorman 	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
3278d833352aSMel Gorman 	 * test will fail on a vma being torn down, and not grab a page table
3279d833352aSMel Gorman 	 * on its way out.  We're lucky that the flag has such an appropriate
3280d833352aSMel Gorman 	 * name, and can in fact be safely cleared here. We could clear it
3281d833352aSMel Gorman 	 * before the __unmap_hugepage_range above, but all that's necessary
3282c8c06efaSDavidlohr Bueso 	 * is to clear it before releasing the i_mmap_rwsem. This works
3283d833352aSMel Gorman 	 * because in the context this is called, the VMA is about to be
3284c8c06efaSDavidlohr Bueso 	 * destroyed and the i_mmap_rwsem is held.
3285d833352aSMel Gorman 	 */
3286d833352aSMel Gorman 	vma->vm_flags &= ~VM_MAYSHARE;
3287d833352aSMel Gorman }
3288d833352aSMel Gorman 
3289502717f4SChen, Kenneth W void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
329004f2cbe3SMel Gorman 			  unsigned long end, struct page *ref_page)
3291502717f4SChen, Kenneth W {
329224669e58SAneesh Kumar K.V 	struct mm_struct *mm;
329324669e58SAneesh Kumar K.V 	struct mmu_gather tlb;
329424669e58SAneesh Kumar K.V 
329524669e58SAneesh Kumar K.V 	mm = vma->vm_mm;
329624669e58SAneesh Kumar K.V 
32972b047252SLinus Torvalds 	tlb_gather_mmu(&tlb, mm, start, end);
329824669e58SAneesh Kumar K.V 	__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
329924669e58SAneesh Kumar K.V 	tlb_finish_mmu(&tlb, start, end);
3300502717f4SChen, Kenneth W }
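/*
 * unmap_hugepage_range() is the standalone entry point: it sets up a
 * local mmu_gather with tlb_gather_mmu(), delegates the work to
 * __unmap_hugepage_range(), and flushes with tlb_finish_mmu().  Callers
 * that already run inside an mmu_gather use __unmap_hugepage_range() or
 * __unmap_hugepage_range_final() directly instead.
 */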
3301502717f4SChen, Kenneth W 
330204f2cbe3SMel Gorman /*
330304f2cbe3SMel Gorman  * This is called when the original mapper is failing to COW a MAP_PRIVATE
330404f2cbe3SMel Gorman  * mapping it owns the reserve page for. The intention is to unmap the page
330504f2cbe3SMel Gorman  * from other VMAs and let the children be SIGKILLed if they are faulting the
330604f2cbe3SMel Gorman  * same region.
330704f2cbe3SMel Gorman  */
33082f4612afSDavidlohr Bueso static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
33092a4b3dedSHarvey Harrison 			      struct page *page, unsigned long address)
331004f2cbe3SMel Gorman {
33117526674dSAdam Litke 	struct hstate *h = hstate_vma(vma);
331204f2cbe3SMel Gorman 	struct vm_area_struct *iter_vma;
331304f2cbe3SMel Gorman 	struct address_space *mapping;
331404f2cbe3SMel Gorman 	pgoff_t pgoff;
331504f2cbe3SMel Gorman 
331604f2cbe3SMel Gorman 	/*
331704f2cbe3SMel Gorman 	 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
331804f2cbe3SMel Gorman 	 * from page cache lookup which is in HPAGE_SIZE units.
331904f2cbe3SMel Gorman 	 */
33207526674dSAdam Litke 	address = address & huge_page_mask(h);
332136e4f20aSMichal Hocko 	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
332236e4f20aSMichal Hocko 			vma->vm_pgoff;
332393c76a3dSAl Viro 	mapping = vma->vm_file->f_mapping;
332404f2cbe3SMel Gorman 
33254eb2b1dcSMel Gorman 	/*
33264eb2b1dcSMel Gorman 	 * Take the mapping lock for the duration of the table walk. As
33274eb2b1dcSMel Gorman 	 * this mapping should be shared between all the VMAs,
33284eb2b1dcSMel Gorman 	 * unmap_hugepage_range() is called while the lock is already held.
33294eb2b1dcSMel Gorman 	 */
333083cde9e8SDavidlohr Bueso 	i_mmap_lock_write(mapping);
33316b2dbba8SMichel Lespinasse 	vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
333204f2cbe3SMel Gorman 		/* Do not unmap the current VMA */
333304f2cbe3SMel Gorman 		if (iter_vma == vma)
333404f2cbe3SMel Gorman 			continue;
333504f2cbe3SMel Gorman 
333604f2cbe3SMel Gorman 		/*
33372f84a899SMel Gorman 		 * Shared VMAs have their own reserves and do not affect
33382f84a899SMel Gorman 		 * MAP_PRIVATE accounting but it is possible that a shared
33392f84a899SMel Gorman 		 * VMA is using the same page so check and skip such VMAs.
33402f84a899SMel Gorman 		 */
33412f84a899SMel Gorman 		if (iter_vma->vm_flags & VM_MAYSHARE)
33422f84a899SMel Gorman 			continue;
33432f84a899SMel Gorman 
33442f84a899SMel Gorman 		/*
334504f2cbe3SMel Gorman 		 * Unmap the page from other VMAs without their own reserves.
334604f2cbe3SMel Gorman 		 * They get marked to be SIGKILLed if they fault in these
334704f2cbe3SMel Gorman 		 * areas. This is because a future no-page fault on this VMA
334804f2cbe3SMel Gorman 		 * could insert a zeroed page instead of the data existing
334904f2cbe3SMel Gorman 		 * from the time of fork. This would look like data corruption
335004f2cbe3SMel Gorman 		 */
335104f2cbe3SMel Gorman 		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
335224669e58SAneesh Kumar K.V 			unmap_hugepage_range(iter_vma, address,
335324669e58SAneesh Kumar K.V 					     address + huge_page_size(h), page);
335404f2cbe3SMel Gorman 	}
335583cde9e8SDavidlohr Bueso 	i_mmap_unlock_write(mapping);
335604f2cbe3SMel Gorman }
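/*
 * Note: the reverse lookup above walks every VMA mapping the same file
 * offset via the i_mmap interval tree, with i_mmap_rwsem held for write.
 * Only VMAs that are not the faulting VMA, are not VM_MAYSHARE, and do
 * not own their own reserves (HPAGE_RESV_OWNER) have the page unmapped
 * from under them.
 */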
335704f2cbe3SMel Gorman 
33580fe6e20bSNaoya Horiguchi /*
33590fe6e20bSNaoya Horiguchi  * Hugetlb_cow() should be called with page lock of the original hugepage held.
3360ef009b25SMichal Hocko  * Called with hugetlb_instantiation_mutex held and pte_page locked so we
3361ef009b25SMichal Hocko  * cannot race with other handlers or page migration.
3362ef009b25SMichal Hocko  * Keep the pte_same checks anyway to make transition from the mutex easier.
33630fe6e20bSNaoya Horiguchi  */
33641e8f889bSDavid Gibson static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
336504f2cbe3SMel Gorman 			unsigned long address, pte_t *ptep, pte_t pte,
3366cb900f41SKirill A. Shutemov 			struct page *pagecache_page, spinlock_t *ptl)
33671e8f889bSDavid Gibson {
3368a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
33691e8f889bSDavid Gibson 	struct page *old_page, *new_page;
3370ad4404a2SDavidlohr Bueso 	int ret = 0, outside_reserve = 0;
33712ec74c3eSSagi Grimberg 	unsigned long mmun_start;	/* For mmu_notifiers */
33722ec74c3eSSagi Grimberg 	unsigned long mmun_end;		/* For mmu_notifiers */
33731e8f889bSDavid Gibson 
33741e8f889bSDavid Gibson 	old_page = pte_page(pte);
33751e8f889bSDavid Gibson 
337604f2cbe3SMel Gorman retry_avoidcopy:
33771e8f889bSDavid Gibson 	/* If no-one else is actually using this page, avoid the copy
33781e8f889bSDavid Gibson 	 * and just make the page writable */
337937a2140dSJoonsoo Kim 	if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
33805a49973dSHugh Dickins 		page_move_anon_rmap(old_page, vma);
33811e8f889bSDavid Gibson 		set_huge_ptep_writable(vma, address, ptep);
338283c54070SNick Piggin 		return 0;
33831e8f889bSDavid Gibson 	}
33841e8f889bSDavid Gibson 
338504f2cbe3SMel Gorman 	/*
338604f2cbe3SMel Gorman 	 * If the process that created a MAP_PRIVATE mapping is about to
338704f2cbe3SMel Gorman 	 * perform a COW due to a shared page count, attempt to satisfy
338804f2cbe3SMel Gorman 	 * the allocation without using the existing reserves. The pagecache
338904f2cbe3SMel Gorman 	 * page is used to determine if the reserve at this address was
339004f2cbe3SMel Gorman 	 * consumed or not. If reserves were used, a partial faulted mapping
339104f2cbe3SMel Gorman 	 * at the time of fork() could consume its reserves on COW instead
339204f2cbe3SMel Gorman 	 * of the full address range.
339304f2cbe3SMel Gorman 	 */
33945944d011SJoonsoo Kim 	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
339504f2cbe3SMel Gorman 			old_page != pagecache_page)
339604f2cbe3SMel Gorman 		outside_reserve = 1;
339704f2cbe3SMel Gorman 
339809cbfeafSKirill A. Shutemov 	get_page(old_page);
3399b76c8cfbSLarry Woodman 
3400ad4404a2SDavidlohr Bueso 	/*
3401ad4404a2SDavidlohr Bueso 	 * Drop page table lock as buddy allocator may be called. It will
3402ad4404a2SDavidlohr Bueso 	 * be acquired again before returning to the caller, as expected.
3403ad4404a2SDavidlohr Bueso 	 */
3404cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
340504f2cbe3SMel Gorman 	new_page = alloc_huge_page(vma, address, outside_reserve);
34061e8f889bSDavid Gibson 
34072fc39cecSAdam Litke 	if (IS_ERR(new_page)) {
340804f2cbe3SMel Gorman 		/*
340904f2cbe3SMel Gorman 		 * If a process owning a MAP_PRIVATE mapping fails to COW,
341004f2cbe3SMel Gorman 		 * it is due to references held by a child and an insufficient
341104f2cbe3SMel Gorman 		 * huge page pool. To guarantee the original mappers
341204f2cbe3SMel Gorman 		 * reliability, unmap the page from child processes. The child
341304f2cbe3SMel Gorman 		 * may get SIGKILLed if it later faults.
341404f2cbe3SMel Gorman 		 */
341504f2cbe3SMel Gorman 		if (outside_reserve) {
341609cbfeafSKirill A. Shutemov 			put_page(old_page);
341704f2cbe3SMel Gorman 			BUG_ON(huge_pte_none(pte));
34182f4612afSDavidlohr Bueso 			unmap_ref_private(mm, vma, old_page, address);
341904f2cbe3SMel Gorman 			BUG_ON(huge_pte_none(pte));
3420cb900f41SKirill A. Shutemov 			spin_lock(ptl);
3421a734bcc8SHillf Danton 			ptep = huge_pte_offset(mm, address & huge_page_mask(h));
3422a9af0c5dSNaoya Horiguchi 			if (likely(ptep &&
3423a9af0c5dSNaoya Horiguchi 				   pte_same(huge_ptep_get(ptep), pte)))
342404f2cbe3SMel Gorman 				goto retry_avoidcopy;
3425a734bcc8SHillf Danton 			/*
3426cb900f41SKirill A. Shutemov 			 * A race occurred while re-acquiring the page
3427cb900f41SKirill A. Shutemov 			 * table lock, and our job is done.
3428a734bcc8SHillf Danton 			 */
3429a734bcc8SHillf Danton 			return 0;
343004f2cbe3SMel Gorman 		}
343104f2cbe3SMel Gorman 
3432ad4404a2SDavidlohr Bueso 		ret = (PTR_ERR(new_page) == -ENOMEM) ?
3433ad4404a2SDavidlohr Bueso 			VM_FAULT_OOM : VM_FAULT_SIGBUS;
3434ad4404a2SDavidlohr Bueso 		goto out_release_old;
34351e8f889bSDavid Gibson 	}
34361e8f889bSDavid Gibson 
34370fe6e20bSNaoya Horiguchi 	/*
34380fe6e20bSNaoya Horiguchi 	 * When the original hugepage is shared one, it does not have
34390fe6e20bSNaoya Horiguchi 	 * anon_vma prepared.
34400fe6e20bSNaoya Horiguchi 	 */
344144e2aa93SDean Nelson 	if (unlikely(anon_vma_prepare(vma))) {
3442ad4404a2SDavidlohr Bueso 		ret = VM_FAULT_OOM;
3443ad4404a2SDavidlohr Bueso 		goto out_release_all;
344444e2aa93SDean Nelson 	}
34450fe6e20bSNaoya Horiguchi 
344647ad8475SAndrea Arcangeli 	copy_user_huge_page(new_page, old_page, address, vma,
344747ad8475SAndrea Arcangeli 			    pages_per_huge_page(h));
34480ed361deSNick Piggin 	__SetPageUptodate(new_page);
3449bcc54222SNaoya Horiguchi 	set_page_huge_active(new_page);
34501e8f889bSDavid Gibson 
34512ec74c3eSSagi Grimberg 	mmun_start = address & huge_page_mask(h);
34522ec74c3eSSagi Grimberg 	mmun_end = mmun_start + huge_page_size(h);
34532ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
3454ad4404a2SDavidlohr Bueso 
3455b76c8cfbSLarry Woodman 	/*
3456cb900f41SKirill A. Shutemov 	 * Retake the page table lock to check for racing updates
3457b76c8cfbSLarry Woodman 	 * before the page tables are altered
3458b76c8cfbSLarry Woodman 	 */
3459cb900f41SKirill A. Shutemov 	spin_lock(ptl);
3460a5516438SAndi Kleen 	ptep = huge_pte_offset(mm, address & huge_page_mask(h));
3461a9af0c5dSNaoya Horiguchi 	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
346207443a85SJoonsoo Kim 		ClearPagePrivate(new_page);
346307443a85SJoonsoo Kim 
34641e8f889bSDavid Gibson 		/* Break COW */
34658fe627ecSGerald Schaefer 		huge_ptep_clear_flush(vma, address, ptep);
346634ee645eSJoerg Roedel 		mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
34671e8f889bSDavid Gibson 		set_huge_pte_at(mm, address, ptep,
34681e8f889bSDavid Gibson 				make_huge_pte(vma, new_page, 1));
3469d281ee61SKirill A. Shutemov 		page_remove_rmap(old_page, true);
3470cd67f0d2SNaoya Horiguchi 		hugepage_add_new_anon_rmap(new_page, vma, address);
34711e8f889bSDavid Gibson 		/* Make the old page be freed below */
34721e8f889bSDavid Gibson 		new_page = old_page;
34731e8f889bSDavid Gibson 	}
3474cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
34752ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
3476ad4404a2SDavidlohr Bueso out_release_all:
347709cbfeafSKirill A. Shutemov 	put_page(new_page);
3478ad4404a2SDavidlohr Bueso out_release_old:
347909cbfeafSKirill A. Shutemov 	put_page(old_page);
34808312034fSJoonsoo Kim 
3481ad4404a2SDavidlohr Bueso 	spin_lock(ptl); /* Caller expects lock to be held */
3482ad4404a2SDavidlohr Bueso 	return ret;
34831e8f889bSDavid Gibson }
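/*
 * Locking in hugetlb_cow() above: the caller holds ptl, the function
 * drops it around alloc_huge_page() and the page copy, then retakes it
 * and re-checks pte_same() before breaking COW.  If a racing update is
 * detected, the new page is simply released via out_release_all.  The
 * lock is always held again on return, as callers expect.
 */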
34841e8f889bSDavid Gibson 
348504f2cbe3SMel Gorman /* Return the pagecache page at a given address within a VMA */
3486a5516438SAndi Kleen static struct page *hugetlbfs_pagecache_page(struct hstate *h,
3487a5516438SAndi Kleen 			struct vm_area_struct *vma, unsigned long address)
348804f2cbe3SMel Gorman {
348904f2cbe3SMel Gorman 	struct address_space *mapping;
3490e7c4b0bfSAndy Whitcroft 	pgoff_t idx;
349104f2cbe3SMel Gorman 
349204f2cbe3SMel Gorman 	mapping = vma->vm_file->f_mapping;
3493a5516438SAndi Kleen 	idx = vma_hugecache_offset(h, vma, address);
349404f2cbe3SMel Gorman 
349504f2cbe3SMel Gorman 	return find_lock_page(mapping, idx);
349604f2cbe3SMel Gorman }
349704f2cbe3SMel Gorman 
34983ae77f43SHugh Dickins /*
34993ae77f43SHugh Dickins  * Return whether there is a pagecache page to back given address within VMA.
35003ae77f43SHugh Dickins  * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
35013ae77f43SHugh Dickins  */
35023ae77f43SHugh Dickins static bool hugetlbfs_pagecache_present(struct hstate *h,
35032a15efc9SHugh Dickins 			struct vm_area_struct *vma, unsigned long address)
35042a15efc9SHugh Dickins {
35052a15efc9SHugh Dickins 	struct address_space *mapping;
35062a15efc9SHugh Dickins 	pgoff_t idx;
35072a15efc9SHugh Dickins 	struct page *page;
35082a15efc9SHugh Dickins 
35092a15efc9SHugh Dickins 	mapping = vma->vm_file->f_mapping;
35102a15efc9SHugh Dickins 	idx = vma_hugecache_offset(h, vma, address);
35112a15efc9SHugh Dickins 
35122a15efc9SHugh Dickins 	page = find_get_page(mapping, idx);
35132a15efc9SHugh Dickins 	if (page)
35142a15efc9SHugh Dickins 		put_page(page);
35152a15efc9SHugh Dickins 	return page != NULL;
35162a15efc9SHugh Dickins }
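/*
 * Note: hugetlbfs_pagecache_present() deliberately uses find_get_page()
 * and drops the reference straight away; its caller holds the page
 * table lock, so taking the page lock here (as
 * hugetlbfs_pagecache_page() does) would not be safe.
 */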
35172a15efc9SHugh Dickins 
3518ab76ad54SMike Kravetz int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
3519ab76ad54SMike Kravetz 			   pgoff_t idx)
3520ab76ad54SMike Kravetz {
3521ab76ad54SMike Kravetz 	struct inode *inode = mapping->host;
3522ab76ad54SMike Kravetz 	struct hstate *h = hstate_inode(inode);
3523ab76ad54SMike Kravetz 	int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
3524ab76ad54SMike Kravetz 
3525ab76ad54SMike Kravetz 	if (err)
3526ab76ad54SMike Kravetz 		return err;
3527ab76ad54SMike Kravetz 	ClearPagePrivate(page);
3528ab76ad54SMike Kravetz 
3529ab76ad54SMike Kravetz 	spin_lock(&inode->i_lock);
3530ab76ad54SMike Kravetz 	inode->i_blocks += blocks_per_huge_page(h);
3531ab76ad54SMike Kravetz 	spin_unlock(&inode->i_lock);
3532ab76ad54SMike Kravetz 	return 0;
3533ab76ad54SMike Kravetz }
3534ab76ad54SMike Kravetz 
3535a1ed3ddaSRobert P. J. Day static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
35368382d914SDavidlohr Bueso 			   struct address_space *mapping, pgoff_t idx,
3537788c7df4SHugh Dickins 			   unsigned long address, pte_t *ptep, unsigned int flags)
3538ac9b9c66SHugh Dickins {
3539a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
3540ac9b9c66SHugh Dickins 	int ret = VM_FAULT_SIGBUS;
3541409eb8c2SHillf Danton 	int anon_rmap = 0;
35424c887265SAdam Litke 	unsigned long size;
35434c887265SAdam Litke 	struct page *page;
35441e8f889bSDavid Gibson 	pte_t new_pte;
3545cb900f41SKirill A. Shutemov 	spinlock_t *ptl;
35464c887265SAdam Litke 
354704f2cbe3SMel Gorman 	/*
354804f2cbe3SMel Gorman 	 * Currently, we are forced to kill the process in the event the
354904f2cbe3SMel Gorman 	 * original mapper has unmapped pages from the child due to a failed
355025985edcSLucas De Marchi 	 * COW. Warn that such a situation has occurred as it may not be obvious.
355104f2cbe3SMel Gorman 	 */
355204f2cbe3SMel Gorman 	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
3553910154d5SGeoffrey Thomas 		pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
355404f2cbe3SMel Gorman 			   current->pid);
355504f2cbe3SMel Gorman 		return ret;
355604f2cbe3SMel Gorman 	}
355704f2cbe3SMel Gorman 
35584c887265SAdam Litke 	/*
35594c887265SAdam Litke 	 * Use page lock to guard against racing truncation
35604c887265SAdam Litke 	 * before we get page_table_lock.
35614c887265SAdam Litke 	 */
35626bda666aSChristoph Lameter retry:
35636bda666aSChristoph Lameter 	page = find_lock_page(mapping, idx);
35646bda666aSChristoph Lameter 	if (!page) {
3565a5516438SAndi Kleen 		size = i_size_read(mapping->host) >> huge_page_shift(h);
3566ebed4bfcSHugh Dickins 		if (idx >= size)
3567ebed4bfcSHugh Dickins 			goto out;
356804f2cbe3SMel Gorman 		page = alloc_huge_page(vma, address, 0);
35692fc39cecSAdam Litke 		if (IS_ERR(page)) {
357076dcee75SAneesh Kumar K.V 			ret = PTR_ERR(page);
357176dcee75SAneesh Kumar K.V 			if (ret == -ENOMEM)
357276dcee75SAneesh Kumar K.V 				ret = VM_FAULT_OOM;
357376dcee75SAneesh Kumar K.V 			else
357476dcee75SAneesh Kumar K.V 				ret = VM_FAULT_SIGBUS;
35756bda666aSChristoph Lameter 			goto out;
35766bda666aSChristoph Lameter 		}
357747ad8475SAndrea Arcangeli 		clear_huge_page(page, address, pages_per_huge_page(h));
35780ed361deSNick Piggin 		__SetPageUptodate(page);
3579bcc54222SNaoya Horiguchi 		set_page_huge_active(page);
3580ac9b9c66SHugh Dickins 
3581f83a275dSMel Gorman 		if (vma->vm_flags & VM_MAYSHARE) {
3582ab76ad54SMike Kravetz 			int err = huge_add_to_page_cache(page, mapping, idx);
35836bda666aSChristoph Lameter 			if (err) {
35846bda666aSChristoph Lameter 				put_page(page);
35856bda666aSChristoph Lameter 				if (err == -EEXIST)
35866bda666aSChristoph Lameter 					goto retry;
35876bda666aSChristoph Lameter 				goto out;
35886bda666aSChristoph Lameter 			}
358923be7468SMel Gorman 		} else {
35906bda666aSChristoph Lameter 			lock_page(page);
35910fe6e20bSNaoya Horiguchi 			if (unlikely(anon_vma_prepare(vma))) {
35920fe6e20bSNaoya Horiguchi 				ret = VM_FAULT_OOM;
35930fe6e20bSNaoya Horiguchi 				goto backout_unlocked;
359423be7468SMel Gorman 			}
3595409eb8c2SHillf Danton 			anon_rmap = 1;
35960fe6e20bSNaoya Horiguchi 		}
35970fe6e20bSNaoya Horiguchi 	} else {
359857303d80SAndy Whitcroft 		/*
3599998b4382SNaoya Horiguchi 		 * If a memory error occurs between mmap() and fault, some processes
3600998b4382SNaoya Horiguchi 		 * don't have a hwpoisoned swap entry for the errored virtual address.
3601998b4382SNaoya Horiguchi 		 * So we need to block hugepage fault by PG_hwpoison bit check.
3602fd6a03edSNaoya Horiguchi 		 */
3603fd6a03edSNaoya Horiguchi 		if (unlikely(PageHWPoison(page))) {
3604aa50d3a7SAndi Kleen 			ret = VM_FAULT_HWPOISON |
3605972dc4deSAneesh Kumar K.V 				VM_FAULT_SET_HINDEX(hstate_index(h));
3606fd6a03edSNaoya Horiguchi 			goto backout_unlocked;
36076bda666aSChristoph Lameter 		}
3608998b4382SNaoya Horiguchi 	}
36091e8f889bSDavid Gibson 
361057303d80SAndy Whitcroft 	/*
361157303d80SAndy Whitcroft 	 * If we are going to COW a private mapping later, we examine the
361257303d80SAndy Whitcroft 	 * pending reservations for this page now. This will ensure that
361357303d80SAndy Whitcroft 	 * any allocations necessary to record that reservation occur outside
361457303d80SAndy Whitcroft 	 * the spinlock.
361557303d80SAndy Whitcroft 	 */
36165e911373SMike Kravetz 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
36172b26736cSAndy Whitcroft 		if (vma_needs_reservation(h, vma, address) < 0) {
36182b26736cSAndy Whitcroft 			ret = VM_FAULT_OOM;
36192b26736cSAndy Whitcroft 			goto backout_unlocked;
36202b26736cSAndy Whitcroft 		}
36215e911373SMike Kravetz 		/* Just decrements count, does not deallocate */
3622feba16e2SMike Kravetz 		vma_end_reservation(h, vma, address);
36235e911373SMike Kravetz 	}
362457303d80SAndy Whitcroft 
3625cb900f41SKirill A. Shutemov 	ptl = huge_pte_lockptr(h, mm, ptep);
3626cb900f41SKirill A. Shutemov 	spin_lock(ptl);
3627a5516438SAndi Kleen 	size = i_size_read(mapping->host) >> huge_page_shift(h);
36284c887265SAdam Litke 	if (idx >= size)
36294c887265SAdam Litke 		goto backout;
36304c887265SAdam Litke 
363183c54070SNick Piggin 	ret = 0;
36327f2e9525SGerald Schaefer 	if (!huge_pte_none(huge_ptep_get(ptep)))
36334c887265SAdam Litke 		goto backout;
36344c887265SAdam Litke 
363507443a85SJoonsoo Kim 	if (anon_rmap) {
363607443a85SJoonsoo Kim 		ClearPagePrivate(page);
3637409eb8c2SHillf Danton 		hugepage_add_new_anon_rmap(page, vma, address);
3638ac714904SChoi Gi-yong 	} else
363953f9263bSKirill A. Shutemov 		page_dup_rmap(page, true);
36401e8f889bSDavid Gibson 	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
36411e8f889bSDavid Gibson 				&& (vma->vm_flags & VM_SHARED)));
36421e8f889bSDavid Gibson 	set_huge_pte_at(mm, address, ptep, new_pte);
36431e8f889bSDavid Gibson 
36445d317b2bSNaoya Horiguchi 	hugetlb_count_add(pages_per_huge_page(h), mm);
3645788c7df4SHugh Dickins 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
36461e8f889bSDavid Gibson 		/* Optimization, do the COW without a second fault */
3647cb900f41SKirill A. Shutemov 		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
36481e8f889bSDavid Gibson 	}
36491e8f889bSDavid Gibson 
3650cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
36514c887265SAdam Litke 	unlock_page(page);
36524c887265SAdam Litke out:
3653ac9b9c66SHugh Dickins 	return ret;
36544c887265SAdam Litke 
36554c887265SAdam Litke backout:
3656cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
36572b26736cSAndy Whitcroft backout_unlocked:
36584c887265SAdam Litke 	unlock_page(page);
36594c887265SAdam Litke 	put_page(page);
36604c887265SAdam Litke 	goto out;
3661ac9b9c66SHugh Dickins }
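/*
 * Note on hugetlb_no_page() above: a freshly allocated page is either
 * added to the page cache (shared mappings, retrying on -EEXIST) or kept
 * anonymous, pending reservations for a later private COW are examined
 * before ptl is taken so any allocation happens outside the lock, and
 * the i_size and pte-none checks are repeated under ptl so a racing
 * truncation or fault backs out cleanly.
 */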
3662ac9b9c66SHugh Dickins 
36638382d914SDavidlohr Bueso #ifdef CONFIG_SMP
3664c672c7f2SMike Kravetz u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
36658382d914SDavidlohr Bueso 			    struct vm_area_struct *vma,
36668382d914SDavidlohr Bueso 			    struct address_space *mapping,
36678382d914SDavidlohr Bueso 			    pgoff_t idx, unsigned long address)
36688382d914SDavidlohr Bueso {
36698382d914SDavidlohr Bueso 	unsigned long key[2];
36708382d914SDavidlohr Bueso 	u32 hash;
36718382d914SDavidlohr Bueso 
36728382d914SDavidlohr Bueso 	if (vma->vm_flags & VM_SHARED) {
36738382d914SDavidlohr Bueso 		key[0] = (unsigned long) mapping;
36748382d914SDavidlohr Bueso 		key[1] = idx;
36758382d914SDavidlohr Bueso 	} else {
36768382d914SDavidlohr Bueso 		key[0] = (unsigned long) mm;
36778382d914SDavidlohr Bueso 		key[1] = address >> huge_page_shift(h);
36788382d914SDavidlohr Bueso 	}
36798382d914SDavidlohr Bueso 
36808382d914SDavidlohr Bueso 	hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
36818382d914SDavidlohr Bueso 
36828382d914SDavidlohr Bueso 	return hash & (num_fault_mutexes - 1);
36838382d914SDavidlohr Bueso }
36848382d914SDavidlohr Bueso #else
36858382d914SDavidlohr Bueso /*
36868382d914SDavidlohr Bueso  * For uniprocessor systems we always use a single mutex, so just
36878382d914SDavidlohr Bueso  * return 0 and avoid the hashing overhead.
36888382d914SDavidlohr Bueso  */
3689c672c7f2SMike Kravetz u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
36908382d914SDavidlohr Bueso 			    struct vm_area_struct *vma,
36918382d914SDavidlohr Bueso 			    struct address_space *mapping,
36928382d914SDavidlohr Bueso 			    pgoff_t idx, unsigned long address)
36938382d914SDavidlohr Bueso {
36948382d914SDavidlohr Bueso 	return 0;
36958382d914SDavidlohr Bueso }
36968382d914SDavidlohr Bueso #endif
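/*
 * The hash above is only ever used to index hugetlb_fault_mutex_table.
 * Shared mappings key on (mapping, index) so every process faulting the
 * same file page serializes on one mutex; private mappings key on
 * (mm, address >> huge_page_shift).  A rough sketch of the expected
 * usage, as done by hugetlb_fault() below:
 *
 *	hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
 *	mutex_lock(&hugetlb_fault_mutex_table[hash]);
 *	... handle the fault ...
 *	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 *
 * Masking with (num_fault_mutexes - 1) assumes the table size is a
 * power of two, as set up at init time.
 */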
36978382d914SDavidlohr Bueso 
369886e5216fSAdam Litke int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3699788c7df4SHugh Dickins 			unsigned long address, unsigned int flags)
370086e5216fSAdam Litke {
37018382d914SDavidlohr Bueso 	pte_t *ptep, entry;
3702cb900f41SKirill A. Shutemov 	spinlock_t *ptl;
37031e8f889bSDavid Gibson 	int ret;
37048382d914SDavidlohr Bueso 	u32 hash;
37058382d914SDavidlohr Bueso 	pgoff_t idx;
37060fe6e20bSNaoya Horiguchi 	struct page *page = NULL;
370757303d80SAndy Whitcroft 	struct page *pagecache_page = NULL;
3708a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
37098382d914SDavidlohr Bueso 	struct address_space *mapping;
37100f792cf9SNaoya Horiguchi 	int need_wait_lock = 0;
371186e5216fSAdam Litke 
37121e16a539SKAMEZAWA Hiroyuki 	address &= huge_page_mask(h);
37131e16a539SKAMEZAWA Hiroyuki 
3714fd6a03edSNaoya Horiguchi 	ptep = huge_pte_offset(mm, address);
3715fd6a03edSNaoya Horiguchi 	if (ptep) {
3716fd6a03edSNaoya Horiguchi 		entry = huge_ptep_get(ptep);
3717290408d4SNaoya Horiguchi 		if (unlikely(is_hugetlb_entry_migration(entry))) {
3718cb900f41SKirill A. Shutemov 			migration_entry_wait_huge(vma, mm, ptep);
3719290408d4SNaoya Horiguchi 			return 0;
3720290408d4SNaoya Horiguchi 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
3721aa50d3a7SAndi Kleen 			return VM_FAULT_HWPOISON_LARGE |
3722972dc4deSAneesh Kumar K.V 				VM_FAULT_SET_HINDEX(hstate_index(h));
37230d777df5SNaoya Horiguchi 	} else {
3724a5516438SAndi Kleen 		ptep = huge_pte_alloc(mm, address, huge_page_size(h));
372586e5216fSAdam Litke 		if (!ptep)
372686e5216fSAdam Litke 			return VM_FAULT_OOM;
37270d777df5SNaoya Horiguchi 	}
372886e5216fSAdam Litke 
37298382d914SDavidlohr Bueso 	mapping = vma->vm_file->f_mapping;
37308382d914SDavidlohr Bueso 	idx = vma_hugecache_offset(h, vma, address);
37318382d914SDavidlohr Bueso 
37323935baa9SDavid Gibson 	/*
37333935baa9SDavid Gibson 	 * Serialize hugepage allocation and instantiation, so that we don't
37343935baa9SDavid Gibson 	 * get spurious allocation failures if two CPUs race to instantiate
37353935baa9SDavid Gibson 	 * the same page in the page cache.
37363935baa9SDavid Gibson 	 */
3737c672c7f2SMike Kravetz 	hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
3738c672c7f2SMike Kravetz 	mutex_lock(&hugetlb_fault_mutex_table[hash]);
37398382d914SDavidlohr Bueso 
37407f2e9525SGerald Schaefer 	entry = huge_ptep_get(ptep);
37417f2e9525SGerald Schaefer 	if (huge_pte_none(entry)) {
37428382d914SDavidlohr Bueso 		ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
3743b4d1d99fSDavid Gibson 		goto out_mutex;
37443935baa9SDavid Gibson 	}
374586e5216fSAdam Litke 
374683c54070SNick Piggin 	ret = 0;
37471e8f889bSDavid Gibson 
374857303d80SAndy Whitcroft 	/*
37490f792cf9SNaoya Horiguchi 	 * entry could be a migration/hwpoison entry at this point, so this
37500f792cf9SNaoya Horiguchi 	 * check prevents the kernel from going below assuming that we have
37510f792cf9SNaoya Horiguchi 	 * an active hugepage in pagecache. This goto expects the 2nd page fault,
37520f792cf9SNaoya Horiguchi 	 * and the is_hugetlb_entry_(migration|hwpoisoned) check will properly
37530f792cf9SNaoya Horiguchi 	 * handle it.
37540f792cf9SNaoya Horiguchi 	 */
37550f792cf9SNaoya Horiguchi 	if (!pte_present(entry))
37560f792cf9SNaoya Horiguchi 		goto out_mutex;
37570f792cf9SNaoya Horiguchi 
37580f792cf9SNaoya Horiguchi 	/*
375957303d80SAndy Whitcroft 	 * If we are going to COW the mapping later, we examine the pending
376057303d80SAndy Whitcroft 	 * reservations for this page now. This will ensure that any
376157303d80SAndy Whitcroft 	 * allocations necessary to record that reservation occur outside the
376257303d80SAndy Whitcroft 	 * spinlock. For private mappings, we also lookup the pagecache
376357303d80SAndy Whitcroft 	 * page now as it is used to determine if a reservation has been
376457303d80SAndy Whitcroft 	 * consumed.
376557303d80SAndy Whitcroft 	 */
3766106c992aSGerald Schaefer 	if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
37672b26736cSAndy Whitcroft 		if (vma_needs_reservation(h, vma, address) < 0) {
37682b26736cSAndy Whitcroft 			ret = VM_FAULT_OOM;
3769b4d1d99fSDavid Gibson 			goto out_mutex;
37702b26736cSAndy Whitcroft 		}
37715e911373SMike Kravetz 		/* Just decrements count, does not deallocate */
3772feba16e2SMike Kravetz 		vma_end_reservation(h, vma, address);
377357303d80SAndy Whitcroft 
3774f83a275dSMel Gorman 		if (!(vma->vm_flags & VM_MAYSHARE))
377557303d80SAndy Whitcroft 			pagecache_page = hugetlbfs_pagecache_page(h,
377657303d80SAndy Whitcroft 								vma, address);
377757303d80SAndy Whitcroft 	}
377857303d80SAndy Whitcroft 
37790f792cf9SNaoya Horiguchi 	ptl = huge_pte_lock(h, mm, ptep);
37800fe6e20bSNaoya Horiguchi 
37811e8f889bSDavid Gibson 	/* Check for a racing update before calling hugetlb_cow */
3782b4d1d99fSDavid Gibson 	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
3783cb900f41SKirill A. Shutemov 		goto out_ptl;
3784b4d1d99fSDavid Gibson 
37850f792cf9SNaoya Horiguchi 	/*
37860f792cf9SNaoya Horiguchi 	 * hugetlb_cow() requires page locks of pte_page(entry) and
37870f792cf9SNaoya Horiguchi 	 * pagecache_page, so here we need take the former one
37880f792cf9SNaoya Horiguchi 	 * when page != pagecache_page or !pagecache_page.
37890f792cf9SNaoya Horiguchi 	 */
37900f792cf9SNaoya Horiguchi 	page = pte_page(entry);
37910f792cf9SNaoya Horiguchi 	if (page != pagecache_page)
37920f792cf9SNaoya Horiguchi 		if (!trylock_page(page)) {
37930f792cf9SNaoya Horiguchi 			need_wait_lock = 1;
37940f792cf9SNaoya Horiguchi 			goto out_ptl;
37950f792cf9SNaoya Horiguchi 		}
37960f792cf9SNaoya Horiguchi 
37970f792cf9SNaoya Horiguchi 	get_page(page);
3798b4d1d99fSDavid Gibson 
3799788c7df4SHugh Dickins 	if (flags & FAULT_FLAG_WRITE) {
3800106c992aSGerald Schaefer 		if (!huge_pte_write(entry)) {
380157303d80SAndy Whitcroft 			ret = hugetlb_cow(mm, vma, address, ptep, entry,
3802cb900f41SKirill A. Shutemov 					pagecache_page, ptl);
38030f792cf9SNaoya Horiguchi 			goto out_put_page;
3804b4d1d99fSDavid Gibson 		}
3805106c992aSGerald Schaefer 		entry = huge_pte_mkdirty(entry);
3806b4d1d99fSDavid Gibson 	}
3807b4d1d99fSDavid Gibson 	entry = pte_mkyoung(entry);
3808788c7df4SHugh Dickins 	if (huge_ptep_set_access_flags(vma, address, ptep, entry,
3809788c7df4SHugh Dickins 						flags & FAULT_FLAG_WRITE))
38104b3073e1SRussell King 		update_mmu_cache(vma, address, ptep);
38110f792cf9SNaoya Horiguchi out_put_page:
38120f792cf9SNaoya Horiguchi 	if (page != pagecache_page)
38130f792cf9SNaoya Horiguchi 		unlock_page(page);
38140f792cf9SNaoya Horiguchi 	put_page(page);
3815cb900f41SKirill A. Shutemov out_ptl:
3816cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
381757303d80SAndy Whitcroft 
381857303d80SAndy Whitcroft 	if (pagecache_page) {
381957303d80SAndy Whitcroft 		unlock_page(pagecache_page);
382057303d80SAndy Whitcroft 		put_page(pagecache_page);
382157303d80SAndy Whitcroft 	}
3822b4d1d99fSDavid Gibson out_mutex:
3823c672c7f2SMike Kravetz 	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
38240f792cf9SNaoya Horiguchi 	/*
38250f792cf9SNaoya Horiguchi 	 * Generally it's safe to hold a refcount while waiting for a page lock,
38260f792cf9SNaoya Horiguchi 	 * but here we only wait to defer the next page fault and avoid a busy
38270f792cf9SNaoya Horiguchi 	 * loop; the page is not used after being unlocked and before the
38280f792cf9SNaoya Horiguchi 	 * current page fault returns. So we are safe from accessing a freed
38290f792cf9SNaoya Horiguchi 	 * page, even if we wait here without taking a refcount.
38300f792cf9SNaoya Horiguchi 	 */
38310f792cf9SNaoya Horiguchi 	if (need_wait_lock)
38320f792cf9SNaoya Horiguchi 		wait_on_page_locked(page);
38331e8f889bSDavid Gibson 	return ret;
383486e5216fSAdam Litke }
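/*
 * Lock ordering in hugetlb_fault() above: the per-hash fault mutex is
 * taken first, the pagecache page (if any) is locked by
 * hugetlbfs_pagecache_page(), then ptl is taken, and finally the faulted
 * page itself is trylocked when it differs from the pagecache page.
 * Everything is released on the out_* paths, and need_wait_lock makes us
 * wait for the page lock before returning so the next fault does not
 * busy-loop.
 */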
383586e5216fSAdam Litke 
383628a35716SMichel Lespinasse long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
383763551ae0SDavid Gibson 			 struct page **pages, struct vm_area_struct **vmas,
383828a35716SMichel Lespinasse 			 unsigned long *position, unsigned long *nr_pages,
383928a35716SMichel Lespinasse 			 long i, unsigned int flags)
384063551ae0SDavid Gibson {
3841d5d4b0aaSChen, Kenneth W 	unsigned long pfn_offset;
3842d5d4b0aaSChen, Kenneth W 	unsigned long vaddr = *position;
384328a35716SMichel Lespinasse 	unsigned long remainder = *nr_pages;
3844a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
384563551ae0SDavid Gibson 
384663551ae0SDavid Gibson 	while (vaddr < vma->vm_end && remainder) {
384763551ae0SDavid Gibson 		pte_t *pte;
3848cb900f41SKirill A. Shutemov 		spinlock_t *ptl = NULL;
38492a15efc9SHugh Dickins 		int absent;
385063551ae0SDavid Gibson 		struct page *page;
385163551ae0SDavid Gibson 
38524c887265SAdam Litke 		/*
385302057967SDavid Rientjes 		 * If we have a pending SIGKILL, don't keep faulting pages and
385402057967SDavid Rientjes 		 * potentially allocating memory.
385502057967SDavid Rientjes 		 */
385602057967SDavid Rientjes 		if (unlikely(fatal_signal_pending(current))) {
385702057967SDavid Rientjes 			remainder = 0;
385802057967SDavid Rientjes 			break;
385902057967SDavid Rientjes 		}
386002057967SDavid Rientjes 
386102057967SDavid Rientjes 		/*
38624c887265SAdam Litke 		 * Some archs (sparc64, sh*) have multiple pte_ts to
38632a15efc9SHugh Dickins 		 * each hugepage.  We have to make sure we get the
38644c887265SAdam Litke 		 * first, for the page indexing below to work.
3865cb900f41SKirill A. Shutemov 		 *
3866cb900f41SKirill A. Shutemov 		 * Note that page table lock is not held when pte is null.
38674c887265SAdam Litke 		 */
3868a5516438SAndi Kleen 		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
3869cb900f41SKirill A. Shutemov 		if (pte)
3870cb900f41SKirill A. Shutemov 			ptl = huge_pte_lock(h, mm, pte);
38712a15efc9SHugh Dickins 		absent = !pte || huge_pte_none(huge_ptep_get(pte));
387263551ae0SDavid Gibson 
38732a15efc9SHugh Dickins 		/*
38742a15efc9SHugh Dickins 		 * When coredumping, it suits get_dump_page if we just return
38753ae77f43SHugh Dickins 		 * an error where there's an empty slot with no huge pagecache
38763ae77f43SHugh Dickins 		 * to back it.  This way, we avoid allocating a hugepage, and
38773ae77f43SHugh Dickins 		 * the sparse dumpfile avoids allocating disk blocks, but its
38783ae77f43SHugh Dickins 		 * huge holes still show up with zeroes where they need to be.
38792a15efc9SHugh Dickins 		 */
38803ae77f43SHugh Dickins 		if (absent && (flags & FOLL_DUMP) &&
38813ae77f43SHugh Dickins 		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
3882cb900f41SKirill A. Shutemov 			if (pte)
3883cb900f41SKirill A. Shutemov 				spin_unlock(ptl);
38842a15efc9SHugh Dickins 			remainder = 0;
38852a15efc9SHugh Dickins 			break;
38862a15efc9SHugh Dickins 		}
38872a15efc9SHugh Dickins 
38889cc3a5bdSNaoya Horiguchi 		/*
38899cc3a5bdSNaoya Horiguchi 		 * We need to call hugetlb_fault for both hugepages under migration
38909cc3a5bdSNaoya Horiguchi 		 * (in which case hugetlb_fault waits for the migration) and
38919cc3a5bdSNaoya Horiguchi 		 * hwpoisoned hugepages (in which case we need to prevent the
38929cc3a5bdSNaoya Horiguchi 		 * caller from accessing them). In order to do this, we use
38939cc3a5bdSNaoya Horiguchi 		 * is_swap_pte here instead of is_hugetlb_entry_migration and
38949cc3a5bdSNaoya Horiguchi 		 * is_hugetlb_entry_hwpoisoned, because it covers both cases and
38959cc3a5bdSNaoya Horiguchi 		 * because we can't follow correct pages directly from any kind
38969cc3a5bdSNaoya Horiguchi 		 * of swap entry.
38979cc3a5bdSNaoya Horiguchi 		 */
38989cc3a5bdSNaoya Horiguchi 		if (absent || is_swap_pte(huge_ptep_get(pte)) ||
3899106c992aSGerald Schaefer 		    ((flags & FOLL_WRITE) &&
3900106c992aSGerald Schaefer 		      !huge_pte_write(huge_ptep_get(pte)))) {
39014c887265SAdam Litke 			int ret;
39024c887265SAdam Litke 
3903cb900f41SKirill A. Shutemov 			if (pte)
3904cb900f41SKirill A. Shutemov 				spin_unlock(ptl);
39052a15efc9SHugh Dickins 			ret = hugetlb_fault(mm, vma, vaddr,
39062a15efc9SHugh Dickins 				(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
3907a89182c7SAdam Litke 			if (!(ret & VM_FAULT_ERROR))
39084c887265SAdam Litke 				continue;
39094c887265SAdam Litke 
39101c59827dSHugh Dickins 			remainder = 0;
39111c59827dSHugh Dickins 			break;
39121c59827dSHugh Dickins 		}
391363551ae0SDavid Gibson 
3914a5516438SAndi Kleen 		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
39157f2e9525SGerald Schaefer 		page = pte_page(huge_ptep_get(pte));
3916d5d4b0aaSChen, Kenneth W same_page:
3917d6692183SChen, Kenneth W 		if (pages) {
391869d177c2SAndy Whitcroft 			pages[i] = mem_map_offset(page, pfn_offset);
3919ddc58f27SKirill A. Shutemov 			get_page(pages[i]);
3920d6692183SChen, Kenneth W 		}
392163551ae0SDavid Gibson 
392263551ae0SDavid Gibson 		if (vmas)
392363551ae0SDavid Gibson 			vmas[i] = vma;
392463551ae0SDavid Gibson 
392563551ae0SDavid Gibson 		vaddr += PAGE_SIZE;
3926d5d4b0aaSChen, Kenneth W 		++pfn_offset;
392763551ae0SDavid Gibson 		--remainder;
392863551ae0SDavid Gibson 		++i;
3929d5d4b0aaSChen, Kenneth W 		if (vaddr < vma->vm_end && remainder &&
3930a5516438SAndi Kleen 				pfn_offset < pages_per_huge_page(h)) {
3931d5d4b0aaSChen, Kenneth W 			/*
3932d5d4b0aaSChen, Kenneth W 			 * We use pfn_offset to avoid touching the pageframes
3933d5d4b0aaSChen, Kenneth W 			 * of this compound page.
3934d5d4b0aaSChen, Kenneth W 			 */
3935d5d4b0aaSChen, Kenneth W 			goto same_page;
3936d5d4b0aaSChen, Kenneth W 		}
3937cb900f41SKirill A. Shutemov 		spin_unlock(ptl);
393863551ae0SDavid Gibson 	}
393928a35716SMichel Lespinasse 	*nr_pages = remainder;
394063551ae0SDavid Gibson 	*position = vaddr;
394163551ae0SDavid Gibson 
39422a15efc9SHugh Dickins 	return i ? i : -EFAULT;
394363551ae0SDavid Gibson }
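/*
 * Note on the GUP loop above: a huge page is handed out PAGE_SIZE at a
 * time, with pfn_offset indexing the subpages of one compound page via
 * the same_page label so the page tables are walked only once per huge
 * page.  Absent ptes (outside the FOLL_DUMP shortcut), swap-format ptes
 * (migration or hwpoison) and write accesses to read-only ptes are all
 * punted to hugetlb_fault().
 */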
39448f860591SZhang, Yanmin 
39457da4d641SPeter Zijlstra unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
39468f860591SZhang, Yanmin 		unsigned long address, unsigned long end, pgprot_t newprot)
39478f860591SZhang, Yanmin {
39488f860591SZhang, Yanmin 	struct mm_struct *mm = vma->vm_mm;
39498f860591SZhang, Yanmin 	unsigned long start = address;
39508f860591SZhang, Yanmin 	pte_t *ptep;
39518f860591SZhang, Yanmin 	pte_t pte;
3952a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
39537da4d641SPeter Zijlstra 	unsigned long pages = 0;
39548f860591SZhang, Yanmin 
39558f860591SZhang, Yanmin 	BUG_ON(address >= end);
39568f860591SZhang, Yanmin 	flush_cache_range(vma, address, end);
39578f860591SZhang, Yanmin 
3958a5338093SRik van Riel 	mmu_notifier_invalidate_range_start(mm, start, end);
395983cde9e8SDavidlohr Bueso 	i_mmap_lock_write(vma->vm_file->f_mapping);
3960a5516438SAndi Kleen 	for (; address < end; address += huge_page_size(h)) {
3961cb900f41SKirill A. Shutemov 		spinlock_t *ptl;
39628f860591SZhang, Yanmin 		ptep = huge_pte_offset(mm, address);
39638f860591SZhang, Yanmin 		if (!ptep)
39648f860591SZhang, Yanmin 			continue;
3965cb900f41SKirill A. Shutemov 		ptl = huge_pte_lock(h, mm, ptep);
39667da4d641SPeter Zijlstra 		if (huge_pmd_unshare(mm, &address, ptep)) {
39677da4d641SPeter Zijlstra 			pages++;
3968cb900f41SKirill A. Shutemov 			spin_unlock(ptl);
396939dde65cSChen, Kenneth W 			continue;
39707da4d641SPeter Zijlstra 		}
3971a8bda28dSNaoya Horiguchi 		pte = huge_ptep_get(ptep);
3972a8bda28dSNaoya Horiguchi 		if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
3973a8bda28dSNaoya Horiguchi 			spin_unlock(ptl);
3974a8bda28dSNaoya Horiguchi 			continue;
3975a8bda28dSNaoya Horiguchi 		}
3976a8bda28dSNaoya Horiguchi 		if (unlikely(is_hugetlb_entry_migration(pte))) {
3977a8bda28dSNaoya Horiguchi 			swp_entry_t entry = pte_to_swp_entry(pte);
3978a8bda28dSNaoya Horiguchi 
3979a8bda28dSNaoya Horiguchi 			if (is_write_migration_entry(entry)) {
3980a8bda28dSNaoya Horiguchi 				pte_t newpte;
3981a8bda28dSNaoya Horiguchi 
3982a8bda28dSNaoya Horiguchi 				make_migration_entry_read(&entry);
3983a8bda28dSNaoya Horiguchi 				newpte = swp_entry_to_pte(entry);
3984a8bda28dSNaoya Horiguchi 				set_huge_pte_at(mm, address, ptep, newpte);
3985a8bda28dSNaoya Horiguchi 				pages++;
3986a8bda28dSNaoya Horiguchi 			}
3987a8bda28dSNaoya Horiguchi 			spin_unlock(ptl);
3988a8bda28dSNaoya Horiguchi 			continue;
3989a8bda28dSNaoya Horiguchi 		}
3990a8bda28dSNaoya Horiguchi 		if (!huge_pte_none(pte)) {
39918f860591SZhang, Yanmin 			pte = huge_ptep_get_and_clear(mm, address, ptep);
3992106c992aSGerald Schaefer 			pte = pte_mkhuge(huge_pte_modify(pte, newprot));
3993be7517d6STony Lu 			pte = arch_make_huge_pte(pte, vma, NULL, 0);
39948f860591SZhang, Yanmin 			set_huge_pte_at(mm, address, ptep, pte);
39957da4d641SPeter Zijlstra 			pages++;
39968f860591SZhang, Yanmin 		}
3997cb900f41SKirill A. Shutemov 		spin_unlock(ptl);
39988f860591SZhang, Yanmin 	}
3999d833352aSMel Gorman 	/*
4000c8c06efaSDavidlohr Bueso 	 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
4001d833352aSMel Gorman 	 * may have cleared our pud entry and done put_page on the page table:
4002c8c06efaSDavidlohr Bueso 	 * once we release i_mmap_rwsem, another task can do the final put_page
4003d833352aSMel Gorman 	 * and that page table be reused and filled with junk.
4004d833352aSMel Gorman 	 */
40058f860591SZhang, Yanmin 	flush_tlb_range(vma, start, end);
400634ee645eSJoerg Roedel 	mmu_notifier_invalidate_range(mm, start, end);
400783cde9e8SDavidlohr Bueso 	i_mmap_unlock_write(vma->vm_file->f_mapping);
4008a5338093SRik van Riel 	mmu_notifier_invalidate_range_end(mm, start, end);
40097da4d641SPeter Zijlstra 
40107da4d641SPeter Zijlstra 	return pages << h->order;
40118f860591SZhang, Yanmin }
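/*
 * Note: the count above is kept per huge page and scaled by h->order on
 * return, so hugetlb_change_protection() reports the number of changed
 * base pages.  Unshared pmds and write migration entries downgraded to
 * read are counted as changed as well.
 */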
40128f860591SZhang, Yanmin 
4013a1e78772SMel Gorman int hugetlb_reserve_pages(struct inode *inode,
4014a1e78772SMel Gorman 					long from, long to,
40155a6fe125SMel Gorman 					struct vm_area_struct *vma,
4016ca16d140SKOSAKI Motohiro 					vm_flags_t vm_flags)
4017e4e574b7SAdam Litke {
401817c9d12eSMel Gorman 	long ret, chg;
4019a5516438SAndi Kleen 	struct hstate *h = hstate_inode(inode);
402090481622SDavid Gibson 	struct hugepage_subpool *spool = subpool_inode(inode);
40219119a41eSJoonsoo Kim 	struct resv_map *resv_map;
40221c5ecae3SMike Kravetz 	long gbl_reserve;
4023e4e574b7SAdam Litke 
4024a1e78772SMel Gorman 	/*
402517c9d12eSMel Gorman 	 * Only apply hugepage reservation if asked. At fault time, an
402617c9d12eSMel Gorman 	 * attempt will be made for VM_NORESERVE to allocate a page
402790481622SDavid Gibson 	 * without using reserves
402817c9d12eSMel Gorman 	 */
4029ca16d140SKOSAKI Motohiro 	if (vm_flags & VM_NORESERVE)
403017c9d12eSMel Gorman 		return 0;
403117c9d12eSMel Gorman 
403217c9d12eSMel Gorman 	/*
4033a1e78772SMel Gorman 	 * Shared mappings base their reservation on the number of pages that
4034a1e78772SMel Gorman 	 * are already allocated on behalf of the file. Private mappings need
4035a1e78772SMel Gorman 	 * to reserve the full area even if read-only as mprotect() may be
4036a1e78772SMel Gorman 	 * called to make the mapping read-write. Assume !vma is a shm mapping
4037a1e78772SMel Gorman 	 */
40389119a41eSJoonsoo Kim 	if (!vma || vma->vm_flags & VM_MAYSHARE) {
40394e35f483SJoonsoo Kim 		resv_map = inode_resv_map(inode);
40409119a41eSJoonsoo Kim 
40411406ec9bSJoonsoo Kim 		chg = region_chg(resv_map, from, to);
40429119a41eSJoonsoo Kim 
40439119a41eSJoonsoo Kim 	} else {
40449119a41eSJoonsoo Kim 		resv_map = resv_map_alloc();
40455a6fe125SMel Gorman 		if (!resv_map)
40465a6fe125SMel Gorman 			return -ENOMEM;
40475a6fe125SMel Gorman 
404817c9d12eSMel Gorman 		chg = to - from;
404917c9d12eSMel Gorman 
40505a6fe125SMel Gorman 		set_vma_resv_map(vma, resv_map);
40515a6fe125SMel Gorman 		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
40525a6fe125SMel Gorman 	}
40535a6fe125SMel Gorman 
4054c50ac050SDave Hansen 	if (chg < 0) {
4055c50ac050SDave Hansen 		ret = chg;
4056c50ac050SDave Hansen 		goto out_err;
4057c50ac050SDave Hansen 	}
405817c9d12eSMel Gorman 
40591c5ecae3SMike Kravetz 	/*
40601c5ecae3SMike Kravetz 	 * There must be enough pages in the subpool for the mapping. If
40611c5ecae3SMike Kravetz 	 * the subpool has a minimum size, there may be some global
40621c5ecae3SMike Kravetz 	 * reservations already in place (gbl_reserve).
40631c5ecae3SMike Kravetz 	 */
40641c5ecae3SMike Kravetz 	gbl_reserve = hugepage_subpool_get_pages(spool, chg);
40651c5ecae3SMike Kravetz 	if (gbl_reserve < 0) {
4066c50ac050SDave Hansen 		ret = -ENOSPC;
4067c50ac050SDave Hansen 		goto out_err;
4068c50ac050SDave Hansen 	}
406917c9d12eSMel Gorman 
407017c9d12eSMel Gorman 	/*
407117c9d12eSMel Gorman 	 * Check enough hugepages are available for the reservation.
407290481622SDavid Gibson 	 * Hand the pages back to the subpool if there are not
407317c9d12eSMel Gorman 	 */
40741c5ecae3SMike Kravetz 	ret = hugetlb_acct_memory(h, gbl_reserve);
407517c9d12eSMel Gorman 	if (ret < 0) {
40761c5ecae3SMike Kravetz 		/* put back original number of pages, chg */
40771c5ecae3SMike Kravetz 		(void)hugepage_subpool_put_pages(spool, chg);
4078c50ac050SDave Hansen 		goto out_err;
407917c9d12eSMel Gorman 	}
408017c9d12eSMel Gorman 
408117c9d12eSMel Gorman 	/*
408217c9d12eSMel Gorman 	 * Account for the reservations made. Shared mappings record regions
408317c9d12eSMel Gorman 	 * that have reservations as they are shared by multiple VMAs.
408417c9d12eSMel Gorman 	 * When the last VMA disappears, the region map says how much
408517c9d12eSMel Gorman 	 * the reservation was and the page cache tells how much of
408617c9d12eSMel Gorman 	 * the reservation was consumed. Private mappings are per-VMA and
408717c9d12eSMel Gorman 	 * only the consumed reservations are tracked. When the VMA
408817c9d12eSMel Gorman 	 * disappears, the original reservation is the VMA size and the
408917c9d12eSMel Gorman 	 * consumed reservations are stored in the map. Hence, nothing
409017c9d12eSMel Gorman 	 * else has to be done for private mappings here
409117c9d12eSMel Gorman 	 */
409233039678SMike Kravetz 	if (!vma || vma->vm_flags & VM_MAYSHARE) {
409333039678SMike Kravetz 		long add = region_add(resv_map, from, to);
409433039678SMike Kravetz 
409533039678SMike Kravetz 		if (unlikely(chg > add)) {
409633039678SMike Kravetz 			/*
409733039678SMike Kravetz 			 * pages in this range were added to the reserve
409833039678SMike Kravetz 			 * map between region_chg and region_add.  This
409933039678SMike Kravetz 			 * indicates a race with alloc_huge_page.  Adjust
410033039678SMike Kravetz 			 * the subpool and reserve counts modified above
410133039678SMike Kravetz 			 * based on the difference.
410233039678SMike Kravetz 			 */
410333039678SMike Kravetz 			long rsv_adjust;
410433039678SMike Kravetz 
410533039678SMike Kravetz 			rsv_adjust = hugepage_subpool_put_pages(spool,
410633039678SMike Kravetz 								chg - add);
410733039678SMike Kravetz 			hugetlb_acct_memory(h, -rsv_adjust);
410833039678SMike Kravetz 		}
410933039678SMike Kravetz 	}
4110a43a8c39SChen, Kenneth W 	return 0;
4111c50ac050SDave Hansen out_err:
41125e911373SMike Kravetz 	if (!vma || vma->vm_flags & VM_MAYSHARE)
41135e911373SMike Kravetz 		region_abort(resv_map, from, to);
4114f031dd27SJoonsoo Kim 	if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
4115f031dd27SJoonsoo Kim 		kref_put(&resv_map->refs, resv_map_release);
4116c50ac050SDave Hansen 	return ret;
4117a43a8c39SChen, Kenneth W }
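/*
 * Reservation flow in hugetlb_reserve_pages() above: shared mappings go
 * through the two-stage region_chg()/region_add() protocol on the
 * inode's resv_map, private mappings get a fresh resv_map charged for
 * the whole range, the subpool and then the global pool are charged, and
 * any pages reserved by racing allocations between region_chg and
 * region_add are handed back via the rsv_adjust path.
 */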
4118a43a8c39SChen, Kenneth W 
4119b5cec28dSMike Kravetz long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
4120b5cec28dSMike Kravetz 								long freed)
4121a43a8c39SChen, Kenneth W {
4122a5516438SAndi Kleen 	struct hstate *h = hstate_inode(inode);
41234e35f483SJoonsoo Kim 	struct resv_map *resv_map = inode_resv_map(inode);
41249119a41eSJoonsoo Kim 	long chg = 0;
412590481622SDavid Gibson 	struct hugepage_subpool *spool = subpool_inode(inode);
41261c5ecae3SMike Kravetz 	long gbl_reserve;
412745c682a6SKen Chen 
4128b5cec28dSMike Kravetz 	if (resv_map) {
4129b5cec28dSMike Kravetz 		chg = region_del(resv_map, start, end);
4130b5cec28dSMike Kravetz 		/*
4131b5cec28dSMike Kravetz 		 * region_del() can fail in the rare case where a region
4132b5cec28dSMike Kravetz 		 * must be split and another region descriptor can not be
4133b5cec28dSMike Kravetz 		 * allocated.  If end == LONG_MAX, it will not fail.
4134b5cec28dSMike Kravetz 		 */
4135b5cec28dSMike Kravetz 		if (chg < 0)
4136b5cec28dSMike Kravetz 			return chg;
4137b5cec28dSMike Kravetz 	}
4138b5cec28dSMike Kravetz 
413945c682a6SKen Chen 	spin_lock(&inode->i_lock);
4140e4c6f8beSEric Sandeen 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
414145c682a6SKen Chen 	spin_unlock(&inode->i_lock);
414245c682a6SKen Chen 
41431c5ecae3SMike Kravetz 	/*
41441c5ecae3SMike Kravetz 	 * If the subpool has a minimum size, the number of global
41451c5ecae3SMike Kravetz 	 * reservations to be released may be adjusted.
41461c5ecae3SMike Kravetz 	 */
41471c5ecae3SMike Kravetz 	gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
41481c5ecae3SMike Kravetz 	hugetlb_acct_memory(h, -gbl_reserve);
4149b5cec28dSMike Kravetz 
4150b5cec28dSMike Kravetz 	return 0;
4151a43a8c39SChen, Kenneth W }
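/*
 * Note: on truncate or hole punch the region map is trimmed first with
 * region_del(), the inode block count is reduced by the pages actually
 * freed, and only the difference (chg - freed) is returned to the
 * subpool and, from there, to the global reservation pool.
 */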
415293f70f90SNaoya Horiguchi 
41533212b535SSteve Capper #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
41543212b535SSteve Capper static unsigned long page_table_shareable(struct vm_area_struct *svma,
41553212b535SSteve Capper 				struct vm_area_struct *vma,
41563212b535SSteve Capper 				unsigned long addr, pgoff_t idx)
41573212b535SSteve Capper {
41583212b535SSteve Capper 	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
41593212b535SSteve Capper 				svma->vm_start;
41603212b535SSteve Capper 	unsigned long sbase = saddr & PUD_MASK;
41613212b535SSteve Capper 	unsigned long s_end = sbase + PUD_SIZE;
41623212b535SSteve Capper 
41633212b535SSteve Capper 	/* Allow segments to share if only one is marked locked */
4164de60f5f1SEric B Munson 	unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
4165de60f5f1SEric B Munson 	unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
41663212b535SSteve Capper 
41673212b535SSteve Capper 	/*
41683212b535SSteve Capper 	 * match the virtual addresses, permission and the alignment of the
41693212b535SSteve Capper 	 * page table page.
41703212b535SSteve Capper 	 */
41713212b535SSteve Capper 	if (pmd_index(addr) != pmd_index(saddr) ||
41723212b535SSteve Capper 	    vm_flags != svm_flags ||
41733212b535SSteve Capper 	    sbase < svma->vm_start || svma->vm_end < s_end)
41743212b535SSteve Capper 		return 0;
41753212b535SSteve Capper 
41763212b535SSteve Capper 	return saddr;
41773212b535SSteve Capper }
41783212b535SSteve Capper 
417931aafb45SNicholas Krause static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
41803212b535SSteve Capper {
41813212b535SSteve Capper 	unsigned long base = addr & PUD_MASK;
41823212b535SSteve Capper 	unsigned long end = base + PUD_SIZE;
41833212b535SSteve Capper 
41843212b535SSteve Capper 	/*
41853212b535SSteve Capper 	 * Check for proper vm_flags and page table alignment.
41863212b535SSteve Capper 	 */
41873212b535SSteve Capper 	if (vma->vm_flags & VM_MAYSHARE &&
41883212b535SSteve Capper 	    vma->vm_start <= base && end <= vma->vm_end)
418931aafb45SNicholas Krause 		return true;
419031aafb45SNicholas Krause 	return false;
41913212b535SSteve Capper }
41923212b535SSteve Capper 
41933212b535SSteve Capper /*
41943212b535SSteve Capper  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
41953212b535SSteve Capper  * and returns the corresponding pte. While this is not necessary for the
41963212b535SSteve Capper  * !shared pmd case because we can allocate the pmd later as well, it makes the
41973212b535SSteve Capper  * code much cleaner. pmd allocation is essential for the shared case because
4198c8c06efaSDavidlohr Bueso  * pud has to be populated inside the same i_mmap_rwsem section - otherwise
41993212b535SSteve Capper  * racing tasks could either miss the sharing (see huge_pte_offset) or select a
42003212b535SSteve Capper  * bad pmd for sharing.
42013212b535SSteve Capper  */
42023212b535SSteve Capper pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
42033212b535SSteve Capper {
42043212b535SSteve Capper 	struct vm_area_struct *vma = find_vma(mm, addr);
42053212b535SSteve Capper 	struct address_space *mapping = vma->vm_file->f_mapping;
42063212b535SSteve Capper 	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
42073212b535SSteve Capper 			vma->vm_pgoff;
42083212b535SSteve Capper 	struct vm_area_struct *svma;
42093212b535SSteve Capper 	unsigned long saddr;
42103212b535SSteve Capper 	pte_t *spte = NULL;
42113212b535SSteve Capper 	pte_t *pte;
4212cb900f41SKirill A. Shutemov 	spinlock_t *ptl;
42133212b535SSteve Capper 
42143212b535SSteve Capper 	if (!vma_shareable(vma, addr))
42153212b535SSteve Capper 		return (pte_t *)pmd_alloc(mm, pud, addr);
42163212b535SSteve Capper 
421783cde9e8SDavidlohr Bueso 	i_mmap_lock_write(mapping);
42183212b535SSteve Capper 	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
42193212b535SSteve Capper 		if (svma == vma)
42203212b535SSteve Capper 			continue;
42213212b535SSteve Capper 
42223212b535SSteve Capper 		saddr = page_table_shareable(svma, vma, addr, idx);
42233212b535SSteve Capper 		if (saddr) {
42243212b535SSteve Capper 			spte = huge_pte_offset(svma->vm_mm, saddr);
42253212b535SSteve Capper 			if (spte) {
42263212b535SSteve Capper 				get_page(virt_to_page(spte));
42273212b535SSteve Capper 				break;
42283212b535SSteve Capper 			}
42293212b535SSteve Capper 		}
42303212b535SSteve Capper 	}
42313212b535SSteve Capper 
42323212b535SSteve Capper 	if (!spte)
42333212b535SSteve Capper 		goto out;
42343212b535SSteve Capper 
4235cb900f41SKirill A. Shutemov 	ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
4236cb900f41SKirill A. Shutemov 	spin_lock(ptl);
4237dc6c9a35SKirill A. Shutemov 	if (pud_none(*pud)) {
42383212b535SSteve Capper 		pud_populate(mm, pud,
42393212b535SSteve Capper 				(pmd_t *)((unsigned long)spte & PAGE_MASK));
4240c17b1f42SKirill A. Shutemov 		mm_inc_nr_pmds(mm);
4241dc6c9a35SKirill A. Shutemov 	} else {
42423212b535SSteve Capper 		put_page(virt_to_page(spte));
4243dc6c9a35SKirill A. Shutemov 	}
4244cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
42453212b535SSteve Capper out:
42463212b535SSteve Capper 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
424783cde9e8SDavidlohr Bueso 	i_mmap_unlock_write(mapping);
42483212b535SSteve Capper 	return pte;
42493212b535SSteve Capper }
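/*
 * Illustrative only: when two processes map the same hugetlbfs file with
 * VM_MAYSHARE mappings that are PUD_SIZE-aligned and span a whole PUD_SIZE
 * range, the second process to fault finds the first one's pmd page through
 * the interval tree walk above and just takes a reference on it, so the
 * page_count() of that pmd page tracks the number of mms sharing it.
 */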
42503212b535SSteve Capper 
42513212b535SSteve Capper /*
42523212b535SSteve Capper  * unmap huge page backed by shared pte.
42533212b535SSteve Capper  *
42543212b535SSteve Capper  * The hugetlb pte page is ref counted at the time of mapping.  If the pte is
42553212b535SSteve Capper  * shared, as indicated by page_count > 1, unmap is achieved by clearing the
42563212b535SSteve Capper  * pud and decrementing the ref count.  If count == 1, the pte page is not shared.
42573212b535SSteve Capper  *
4258cb900f41SKirill A. Shutemov  * called with page table lock held.
42593212b535SSteve Capper  *
42603212b535SSteve Capper  * returns: 1 successfully unmapped a shared pte page
42613212b535SSteve Capper  *	    0 the underlying pte page is not shared, or it is the last user
42623212b535SSteve Capper  */
42633212b535SSteve Capper int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
42643212b535SSteve Capper {
42653212b535SSteve Capper 	pgd_t *pgd = pgd_offset(mm, *addr);
42663212b535SSteve Capper 	pud_t *pud = pud_offset(pgd, *addr);
42673212b535SSteve Capper 
42683212b535SSteve Capper 	BUG_ON(page_count(virt_to_page(ptep)) == 0);
42693212b535SSteve Capper 	if (page_count(virt_to_page(ptep)) == 1)
42703212b535SSteve Capper 		return 0;
42713212b535SSteve Capper 
42723212b535SSteve Capper 	pud_clear(pud);
42733212b535SSteve Capper 	put_page(virt_to_page(ptep));
4274dc6c9a35SKirill A. Shutemov 	mm_dec_nr_pmds(mm);
42753212b535SSteve Capper 	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
42763212b535SSteve Capper 	return 1;
42773212b535SSteve Capper }
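/*
 * A minimal sketch of a huge_pmd_unshare() caller (loosely modeled on
 * callers such as __unmap_hugepage_range(); locking, TLB and accounting
 * details are elided, and this is not actual kernel code):
 *
 *	for (address = start; address < end; address += sz) {
 *		ptep = huge_pte_offset(mm, address);
 *		if (!ptep)
 *			continue;
 *		ptl = huge_pte_lock(h, mm, ptep);
 *		if (huge_pmd_unshare(mm, &address, ptep)) {
 *			spin_unlock(ptl);
 *			continue;
 *		}
 *		...
 *		spin_unlock(ptl);
 *	}
 *
 * The *addr adjustment above is what makes the caller's "address += sz"
 * land on the next PUD_SIZE boundary after a successful unshare.
 */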
42789e5fc74cSSteve Capper #define want_pmd_share()	(1)
42799e5fc74cSSteve Capper #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
42809e5fc74cSSteve Capper pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
42819e5fc74cSSteve Capper {
42829e5fc74cSSteve Capper 	return NULL;
42839e5fc74cSSteve Capper }
4284e81f2d22SZhang Zhen 
4285e81f2d22SZhang Zhen int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
4286e81f2d22SZhang Zhen {
4287e81f2d22SZhang Zhen 	return 0;
4288e81f2d22SZhang Zhen }
42899e5fc74cSSteve Capper #define want_pmd_share()	(0)
42903212b535SSteve Capper #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
42913212b535SSteve Capper 
42929e5fc74cSSteve Capper #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
42939e5fc74cSSteve Capper pte_t *huge_pte_alloc(struct mm_struct *mm,
42949e5fc74cSSteve Capper 			unsigned long addr, unsigned long sz)
42959e5fc74cSSteve Capper {
42969e5fc74cSSteve Capper 	pgd_t *pgd;
42979e5fc74cSSteve Capper 	pud_t *pud;
42989e5fc74cSSteve Capper 	pte_t *pte = NULL;
42999e5fc74cSSteve Capper 
43009e5fc74cSSteve Capper 	pgd = pgd_offset(mm, addr);
43019e5fc74cSSteve Capper 	pud = pud_alloc(mm, pgd, addr);
43029e5fc74cSSteve Capper 	if (pud) {
43039e5fc74cSSteve Capper 		if (sz == PUD_SIZE) {
43049e5fc74cSSteve Capper 			pte = (pte_t *)pud;
43059e5fc74cSSteve Capper 		} else {
43069e5fc74cSSteve Capper 			BUG_ON(sz != PMD_SIZE);
43079e5fc74cSSteve Capper 			if (want_pmd_share() && pud_none(*pud))
43089e5fc74cSSteve Capper 				pte = huge_pmd_share(mm, addr, pud);
43099e5fc74cSSteve Capper 			else
43109e5fc74cSSteve Capper 				pte = (pte_t *)pmd_alloc(mm, pud, addr);
43119e5fc74cSSteve Capper 		}
43129e5fc74cSSteve Capper 	}
43139e5fc74cSSteve Capper 	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
43149e5fc74cSSteve Capper 
43159e5fc74cSSteve Capper 	return pte;
43169e5fc74cSSteve Capper }
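/*
 * Note: for sz == PUD_SIZE the "pte" returned above is really the pud slot
 * itself (such pages are mapped at the pud level), while PMD_SIZE pages get
 * a pmd that may be shared with other mappings when want_pmd_share() allows.
 */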
43179e5fc74cSSteve Capper 
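/*
 * Walk the page tables for @addr: returns NULL when no pud is present, the
 * pud itself when a PUD-sized page is mapped there, and otherwise the pmd
 * slot, which the caller must still test (it may be empty, a huge mapping,
 * or a swap/migration entry).
 */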
43189e5fc74cSSteve Capper pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
43199e5fc74cSSteve Capper {
43209e5fc74cSSteve Capper 	pgd_t *pgd;
43219e5fc74cSSteve Capper 	pud_t *pud;
43229e5fc74cSSteve Capper 	pmd_t *pmd = NULL;
43239e5fc74cSSteve Capper 
43249e5fc74cSSteve Capper 	pgd = pgd_offset(mm, addr);
43259e5fc74cSSteve Capper 	if (pgd_present(*pgd)) {
43269e5fc74cSSteve Capper 		pud = pud_offset(pgd, addr);
43279e5fc74cSSteve Capper 		if (pud_present(*pud)) {
43289e5fc74cSSteve Capper 			if (pud_huge(*pud))
43299e5fc74cSSteve Capper 				return (pte_t *)pud;
43309e5fc74cSSteve Capper 			pmd = pmd_offset(pud, addr);
43319e5fc74cSSteve Capper 		}
43329e5fc74cSSteve Capper 	}
43339e5fc74cSSteve Capper 	return (pte_t *) pmd;
43349e5fc74cSSteve Capper }
43359e5fc74cSSteve Capper 
433661f77edaSNaoya Horiguchi #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
433761f77edaSNaoya Horiguchi 
433861f77edaSNaoya Horiguchi /*
433961f77edaSNaoya Horiguchi  * These functions can be overridden if your architecture needs its own
434061f77edaSNaoya Horiguchi  * behavior.
434161f77edaSNaoya Horiguchi  */
434261f77edaSNaoya Horiguchi struct page * __weak
434361f77edaSNaoya Horiguchi follow_huge_addr(struct mm_struct *mm, unsigned long address,
434461f77edaSNaoya Horiguchi 			      int write)
434561f77edaSNaoya Horiguchi {
434661f77edaSNaoya Horiguchi 	return ERR_PTR(-EINVAL);
434761f77edaSNaoya Horiguchi }
434861f77edaSNaoya Horiguchi 
434961f77edaSNaoya Horiguchi struct page * __weak
43509e5fc74cSSteve Capper follow_huge_pmd(struct mm_struct *mm, unsigned long address,
4351e66f17ffSNaoya Horiguchi 		pmd_t *pmd, int flags)
43529e5fc74cSSteve Capper {
4353e66f17ffSNaoya Horiguchi 	struct page *page = NULL;
4354e66f17ffSNaoya Horiguchi 	spinlock_t *ptl;
4355e66f17ffSNaoya Horiguchi retry:
4356e66f17ffSNaoya Horiguchi 	ptl = pmd_lockptr(mm, pmd);
4357e66f17ffSNaoya Horiguchi 	spin_lock(ptl);
4358e66f17ffSNaoya Horiguchi 	/*
4359e66f17ffSNaoya Horiguchi 	 * Make sure that the address range covered by this pmd is not
4360e66f17ffSNaoya Horiguchi 	 * unmapped by other threads while we inspect it.
4361e66f17ffSNaoya Horiguchi 	 */
4362e66f17ffSNaoya Horiguchi 	if (!pmd_huge(*pmd))
4363e66f17ffSNaoya Horiguchi 		goto out;
4364e66f17ffSNaoya Horiguchi 	if (pmd_present(*pmd)) {
436597534127SGerald Schaefer 		page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
4366e66f17ffSNaoya Horiguchi 		if (flags & FOLL_GET)
4367e66f17ffSNaoya Horiguchi 			get_page(page);
4368e66f17ffSNaoya Horiguchi 	} else {
4369e66f17ffSNaoya Horiguchi 		if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) {
4370e66f17ffSNaoya Horiguchi 			spin_unlock(ptl);
4371e66f17ffSNaoya Horiguchi 			__migration_entry_wait(mm, (pte_t *)pmd, ptl);
4372e66f17ffSNaoya Horiguchi 			goto retry;
4373e66f17ffSNaoya Horiguchi 		}
4374e66f17ffSNaoya Horiguchi 		/*
4375e66f17ffSNaoya Horiguchi 		 * hwpoisoned entry is treated as no_page_table in
4376e66f17ffSNaoya Horiguchi 		 * follow_page_mask().
4377e66f17ffSNaoya Horiguchi 		 */
4378e66f17ffSNaoya Horiguchi 	}
4379e66f17ffSNaoya Horiguchi out:
4380e66f17ffSNaoya Horiguchi 	spin_unlock(ptl);
43819e5fc74cSSteve Capper 	return page;
43829e5fc74cSSteve Capper }
43839e5fc74cSSteve Capper 
438461f77edaSNaoya Horiguchi struct page * __weak
43859e5fc74cSSteve Capper follow_huge_pud(struct mm_struct *mm, unsigned long address,
4386e66f17ffSNaoya Horiguchi 		pud_t *pud, int flags)
43879e5fc74cSSteve Capper {
4388e66f17ffSNaoya Horiguchi 	if (flags & FOLL_GET)
4389e66f17ffSNaoya Horiguchi 		return NULL;
43909e5fc74cSSteve Capper 
4391e66f17ffSNaoya Horiguchi 	return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
43929e5fc74cSSteve Capper }
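/*
 * Note: FOLL_GET is refused above because this generic version does not
 * implement taking a reference on a PUD-mapped gigantic page from the
 * follow path; architectures that need that can override these __weak
 * definitions.
 */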
43939e5fc74cSSteve Capper 
4394d5bd9106SAndi Kleen #ifdef CONFIG_MEMORY_FAILURE
4395d5bd9106SAndi Kleen 
439693f70f90SNaoya Horiguchi /*
439793f70f90SNaoya Horiguchi  * Called from the memory-failure code to take a free hwpoisoned hugepage out of the pool so it cannot be allocated again.
439893f70f90SNaoya Horiguchi  */
43996de2b1aaSNaoya Horiguchi int dequeue_hwpoisoned_huge_page(struct page *hpage)
440093f70f90SNaoya Horiguchi {
440193f70f90SNaoya Horiguchi 	struct hstate *h = page_hstate(hpage);
440293f70f90SNaoya Horiguchi 	int nid = page_to_nid(hpage);
44036de2b1aaSNaoya Horiguchi 	int ret = -EBUSY;
440493f70f90SNaoya Horiguchi 
440593f70f90SNaoya Horiguchi 	spin_lock(&hugetlb_lock);
44067e1f049eSNaoya Horiguchi 	/*
44077e1f049eSNaoya Horiguchi 	 * Just checking !page_huge_active is not enough, because that could be
44087e1f049eSNaoya Horiguchi 	 * an isolated/hwpoisoned hugepage (which has a refcount > 0).
44097e1f049eSNaoya Horiguchi 	 */
44107e1f049eSNaoya Horiguchi 	if (!page_huge_active(hpage) && !page_count(hpage)) {
441156f2fb14SNaoya Horiguchi 		/*
441256f2fb14SNaoya Horiguchi 		 * Hwpoisoned hugepage isn't linked to activelist or freelist,
441356f2fb14SNaoya Horiguchi 		 * but dangling hpage->lru can trigger list-debug warnings
441456f2fb14SNaoya Horiguchi 		 * (this happens when we call unpoison_memory() on it),
441556f2fb14SNaoya Horiguchi 		 * so let it point to itself with list_del_init().
441656f2fb14SNaoya Horiguchi 		 */
441756f2fb14SNaoya Horiguchi 		list_del_init(&hpage->lru);
44188c6c2ecbSNaoya Horiguchi 		set_page_refcounted(hpage);
441993f70f90SNaoya Horiguchi 		h->free_huge_pages--;
442093f70f90SNaoya Horiguchi 		h->free_huge_pages_node[nid]--;
44216de2b1aaSNaoya Horiguchi 		ret = 0;
442293f70f90SNaoya Horiguchi 	}
44236de2b1aaSNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
44246de2b1aaSNaoya Horiguchi 	return ret;
44256de2b1aaSNaoya Horiguchi }
44266de2b1aaSNaoya Horiguchi #endif
442731caf665SNaoya Horiguchi 
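/*
 * Isolate an active (in-use) hugepage for migration: take a reference and
 * move it from its hstate's active list onto the caller's @list.  Returns
 * false if the page is not active or its refcount has already dropped to
 * zero.
 */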
442831caf665SNaoya Horiguchi bool isolate_huge_page(struct page *page, struct list_head *list)
442931caf665SNaoya Horiguchi {
4430bcc54222SNaoya Horiguchi 	bool ret = true;
4431bcc54222SNaoya Horiguchi 
4432309381feSSasha Levin 	VM_BUG_ON_PAGE(!PageHead(page), page);
443331caf665SNaoya Horiguchi 	spin_lock(&hugetlb_lock);
4434bcc54222SNaoya Horiguchi 	if (!page_huge_active(page) || !get_page_unless_zero(page)) {
4435bcc54222SNaoya Horiguchi 		ret = false;
4436bcc54222SNaoya Horiguchi 		goto unlock;
4437bcc54222SNaoya Horiguchi 	}
4438bcc54222SNaoya Horiguchi 	clear_page_huge_active(page);
443931caf665SNaoya Horiguchi 	list_move_tail(&page->lru, list);
4440bcc54222SNaoya Horiguchi unlock:
444131caf665SNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
4442bcc54222SNaoya Horiguchi 	return ret;
444331caf665SNaoya Horiguchi }
444431caf665SNaoya Horiguchi 
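/*
 * Undo isolate_huge_page(): mark the page active again, move it back onto
 * its hstate's active list, and drop the reference taken at isolation time.
 */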
444531caf665SNaoya Horiguchi void putback_active_hugepage(struct page *page)
444631caf665SNaoya Horiguchi {
4447309381feSSasha Levin 	VM_BUG_ON_PAGE(!PageHead(page), page);
444831caf665SNaoya Horiguchi 	spin_lock(&hugetlb_lock);
4449bcc54222SNaoya Horiguchi 	set_page_huge_active(page);
445031caf665SNaoya Horiguchi 	list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
445131caf665SNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
445231caf665SNaoya Horiguchi 	put_page(page);
445331caf665SNaoya Horiguchi }
4454