xref: /openbmc/linux/mm/hugetlb.c (revision 1dd308a7)
11da177e4SLinus Torvalds /*
21da177e4SLinus Torvalds  * Generic hugetlb support.
36d49e352SNadia Yvette Chambers  * (C) Nadia Yvette Chambers, April 2004
41da177e4SLinus Torvalds  */
51da177e4SLinus Torvalds #include <linux/list.h>
61da177e4SLinus Torvalds #include <linux/init.h>
71da177e4SLinus Torvalds #include <linux/module.h>
81da177e4SLinus Torvalds #include <linux/mm.h>
9e1759c21SAlexey Dobriyan #include <linux/seq_file.h>
101da177e4SLinus Torvalds #include <linux/sysctl.h>
111da177e4SLinus Torvalds #include <linux/highmem.h>
12cddb8a5cSAndrea Arcangeli #include <linux/mmu_notifier.h>
131da177e4SLinus Torvalds #include <linux/nodemask.h>
1463551ae0SDavid Gibson #include <linux/pagemap.h>
155da7ca86SChristoph Lameter #include <linux/mempolicy.h>
163b32123dSGideon Israel Dsouza #include <linux/compiler.h>
17aea47ff3SChristoph Lameter #include <linux/cpuset.h>
183935baa9SDavid Gibson #include <linux/mutex.h>
19aa888a74SAndi Kleen #include <linux/bootmem.h>
20a3437870SNishanth Aravamudan #include <linux/sysfs.h>
215a0e3ad6STejun Heo #include <linux/slab.h>
220fe6e20bSNaoya Horiguchi #include <linux/rmap.h>
23fd6a03edSNaoya Horiguchi #include <linux/swap.h>
24fd6a03edSNaoya Horiguchi #include <linux/swapops.h>
25c8721bbbSNaoya Horiguchi #include <linux/page-isolation.h>
268382d914SDavidlohr Bueso #include <linux/jhash.h>
27d6606683SLinus Torvalds 
2863551ae0SDavid Gibson #include <asm/page.h>
2963551ae0SDavid Gibson #include <asm/pgtable.h>
3024669e58SAneesh Kumar K.V #include <asm/tlb.h>
3163551ae0SDavid Gibson 
3224669e58SAneesh Kumar K.V #include <linux/io.h>
3363551ae0SDavid Gibson #include <linux/hugetlb.h>
349dd540e2SAneesh Kumar K.V #include <linux/hugetlb_cgroup.h>
359a305230SLee Schermerhorn #include <linux/node.h>
367835e98bSNick Piggin #include "internal.h"
371da177e4SLinus Torvalds 
38753162cdSAndrey Ryabinin int hugepages_treat_as_movable;
39a5516438SAndi Kleen 
40c3f38a38SAneesh Kumar K.V int hugetlb_max_hstate __read_mostly;
41e5ff2159SAndi Kleen unsigned int default_hstate_idx;
42e5ff2159SAndi Kleen struct hstate hstates[HUGE_MAX_HSTATE];
43641844f5SNaoya Horiguchi /*
44641844f5SNaoya Horiguchi  * Minimum page order among possible hugepage sizes, set to a proper value
45641844f5SNaoya Horiguchi  * at boot time.
46641844f5SNaoya Horiguchi  */
47641844f5SNaoya Horiguchi static unsigned int minimum_order __read_mostly = UINT_MAX;
48e5ff2159SAndi Kleen 
4953ba51d2SJon Tollefson __initdata LIST_HEAD(huge_boot_pages);
5053ba51d2SJon Tollefson 
51e5ff2159SAndi Kleen /* for command line parsing */
52e5ff2159SAndi Kleen static struct hstate * __initdata parsed_hstate;
53e5ff2159SAndi Kleen static unsigned long __initdata default_hstate_max_huge_pages;
54e11bfbfcSNick Piggin static unsigned long __initdata default_hstate_size;
55e5ff2159SAndi Kleen 
563935baa9SDavid Gibson /*
5731caf665SNaoya Horiguchi  * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
5831caf665SNaoya Horiguchi  * free_huge_pages, and surplus_huge_pages.
593935baa9SDavid Gibson  */
60c3f38a38SAneesh Kumar K.V DEFINE_SPINLOCK(hugetlb_lock);
610bd0f9fbSEric Paris 
628382d914SDavidlohr Bueso /*
638382d914SDavidlohr Bueso  * Serializes faults on the same logical page.  This is used to
648382d914SDavidlohr Bueso  * prevent spurious OOMs when the hugepage pool is fully utilized.
658382d914SDavidlohr Bueso  */
668382d914SDavidlohr Bueso static int num_fault_mutexes;
678382d914SDavidlohr Bueso static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
688382d914SDavidlohr Bueso 
697ca02d0aSMike Kravetz /* Forward declaration */
707ca02d0aSMike Kravetz static int hugetlb_acct_memory(struct hstate *h, long delta);
717ca02d0aSMike Kravetz 
7290481622SDavid Gibson static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
7390481622SDavid Gibson {
7490481622SDavid Gibson 	bool free = (spool->count == 0) && (spool->used_hpages == 0);
7590481622SDavid Gibson 
7690481622SDavid Gibson 	spin_unlock(&spool->lock);
7790481622SDavid Gibson 
7890481622SDavid Gibson 	/* If no pages are used, and no other handles to the subpool
797ca02d0aSMike Kravetz 	 * remain, give up any reservations based on minimum size and
807ca02d0aSMike Kravetz 	 * free the subpool */
817ca02d0aSMike Kravetz 	if (free) {
827ca02d0aSMike Kravetz 		if (spool->min_hpages != -1)
837ca02d0aSMike Kravetz 			hugetlb_acct_memory(spool->hstate,
847ca02d0aSMike Kravetz 						-spool->min_hpages);
8590481622SDavid Gibson 		kfree(spool);
8690481622SDavid Gibson 	}
877ca02d0aSMike Kravetz }
8890481622SDavid Gibson 
897ca02d0aSMike Kravetz struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
907ca02d0aSMike Kravetz 						long min_hpages)
9190481622SDavid Gibson {
9290481622SDavid Gibson 	struct hugepage_subpool *spool;
9390481622SDavid Gibson 
94c6a91820SMike Kravetz 	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
9590481622SDavid Gibson 	if (!spool)
9690481622SDavid Gibson 		return NULL;
9790481622SDavid Gibson 
9890481622SDavid Gibson 	spin_lock_init(&spool->lock);
9990481622SDavid Gibson 	spool->count = 1;
1007ca02d0aSMike Kravetz 	spool->max_hpages = max_hpages;
1017ca02d0aSMike Kravetz 	spool->hstate = h;
1027ca02d0aSMike Kravetz 	spool->min_hpages = min_hpages;
1037ca02d0aSMike Kravetz 
1047ca02d0aSMike Kravetz 	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
1057ca02d0aSMike Kravetz 		kfree(spool);
1067ca02d0aSMike Kravetz 		return NULL;
1077ca02d0aSMike Kravetz 	}
1087ca02d0aSMike Kravetz 	spool->rsv_hpages = min_hpages;
10990481622SDavid Gibson 
11090481622SDavid Gibson 	return spool;
11190481622SDavid Gibson }
11290481622SDavid Gibson 
11390481622SDavid Gibson void hugepage_put_subpool(struct hugepage_subpool *spool)
11490481622SDavid Gibson {
11590481622SDavid Gibson 	spin_lock(&spool->lock);
11690481622SDavid Gibson 	BUG_ON(!spool->count);
11790481622SDavid Gibson 	spool->count--;
11890481622SDavid Gibson 	unlock_or_release_subpool(spool);
11990481622SDavid Gibson }
12090481622SDavid Gibson 
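/*
 * Illustrative usage sketch (editor's addition, not part of the original
 * source): hugetlbfs creates one subpool per mount and drops it again at
 * umount time, roughly:
 *
 *	spool = hugepage_new_subpool(h, max_hpages, min_hpages);
 *	if (!spool)
 *		return -ENOMEM;
 *	...
 *	hugepage_put_subpool(spool);	(the final put frees the subpool)
 */
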
1211c5ecae3SMike Kravetz /*
1221c5ecae3SMike Kravetz  * Subpool accounting for allocating and reserving pages.
1231c5ecae3SMike Kravetz  * Return -ENOMEM if there are not enough resources to satisfy the
1241c5ecae3SMike Kravetz  * request.  Otherwise, return the number of pages by which the
1251c5ecae3SMike Kravetz  * global pools must be adjusted (upward).  The returned value may
1261c5ecae3SMike Kravetz  * only be different than the passed value (delta) in the case where
1271c5ecae3SMike Kravetz  * a subpool minimum size must be maintained.
1281c5ecae3SMike Kravetz  */
1291c5ecae3SMike Kravetz static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
13090481622SDavid Gibson 				      long delta)
13190481622SDavid Gibson {
1321c5ecae3SMike Kravetz 	long ret = delta;
13390481622SDavid Gibson 
13490481622SDavid Gibson 	if (!spool)
1351c5ecae3SMike Kravetz 		return ret;
13690481622SDavid Gibson 
13790481622SDavid Gibson 	spin_lock(&spool->lock);
13890481622SDavid Gibson 
1391c5ecae3SMike Kravetz 	if (spool->max_hpages != -1) {		/* maximum size accounting */
1401c5ecae3SMike Kravetz 		if ((spool->used_hpages + delta) <= spool->max_hpages)
1411c5ecae3SMike Kravetz 			spool->used_hpages += delta;
1421c5ecae3SMike Kravetz 		else {
1431c5ecae3SMike Kravetz 			ret = -ENOMEM;
1441c5ecae3SMike Kravetz 			goto unlock_ret;
1451c5ecae3SMike Kravetz 		}
1461c5ecae3SMike Kravetz 	}
1471c5ecae3SMike Kravetz 
1481c5ecae3SMike Kravetz 	if (spool->min_hpages != -1) {		/* minimum size accounting */
1491c5ecae3SMike Kravetz 		if (delta > spool->rsv_hpages) {
1501c5ecae3SMike Kravetz 			/*
1511c5ecae3SMike Kravetz 			 * Asking for more reserves than those already taken on
1521c5ecae3SMike Kravetz 			 * behalf of subpool.  Return difference.
1531c5ecae3SMike Kravetz 			 */
1541c5ecae3SMike Kravetz 			ret = delta - spool->rsv_hpages;
1551c5ecae3SMike Kravetz 			spool->rsv_hpages = 0;
1561c5ecae3SMike Kravetz 		} else {
1571c5ecae3SMike Kravetz 			ret = 0;	/* reserves already accounted for */
1581c5ecae3SMike Kravetz 			spool->rsv_hpages -= delta;
1591c5ecae3SMike Kravetz 		}
1601c5ecae3SMike Kravetz 	}
1611c5ecae3SMike Kravetz 
1621c5ecae3SMike Kravetz unlock_ret:
1631c5ecae3SMike Kravetz 	spin_unlock(&spool->lock);
16490481622SDavid Gibson 	return ret;
16590481622SDavid Gibson }
16690481622SDavid Gibson 
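/*
 * Worked example (editor's addition, not part of the original source):
 * with min_hpages == 10 and rsv_hpages == 10, a request for delta == 3
 * is satisfied entirely from the subpool reserve, so rsv_hpages becomes
 * 7 and 0 is returned (no global pool adjustment needed).  A later
 * request for delta == 9 exceeds the remaining reserve of 7, so
 * rsv_hpages drops to 0 and 9 - 7 == 2 is returned for the caller to
 * charge against the global pool.
 */
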
1671c5ecae3SMike Kravetz /*
1681c5ecae3SMike Kravetz  * Subpool accounting for freeing and unreserving pages.
1691c5ecae3SMike Kravetz  * Return the number of global page reservations that must be dropped.
1701c5ecae3SMike Kravetz  * The return value may only be different than the passed value (delta)
1711c5ecae3SMike Kravetz  * in the case where a subpool minimum size must be maintained.
1721c5ecae3SMike Kravetz  */
1731c5ecae3SMike Kravetz static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
17490481622SDavid Gibson 				       long delta)
17590481622SDavid Gibson {
1761c5ecae3SMike Kravetz 	long ret = delta;
1771c5ecae3SMike Kravetz 
17890481622SDavid Gibson 	if (!spool)
1791c5ecae3SMike Kravetz 		return delta;
18090481622SDavid Gibson 
18190481622SDavid Gibson 	spin_lock(&spool->lock);
1821c5ecae3SMike Kravetz 
1831c5ecae3SMike Kravetz 	if (spool->max_hpages != -1)		/* maximum size accounting */
18490481622SDavid Gibson 		spool->used_hpages -= delta;
1851c5ecae3SMike Kravetz 
1861c5ecae3SMike Kravetz 	if (spool->min_hpages != -1) {		/* minimum size accounting */
1871c5ecae3SMike Kravetz 		if (spool->rsv_hpages + delta <= spool->min_hpages)
1881c5ecae3SMike Kravetz 			ret = 0;
1891c5ecae3SMike Kravetz 		else
1901c5ecae3SMike Kravetz 			ret = spool->rsv_hpages + delta - spool->min_hpages;
1911c5ecae3SMike Kravetz 
1921c5ecae3SMike Kravetz 		spool->rsv_hpages += delta;
1931c5ecae3SMike Kravetz 		if (spool->rsv_hpages > spool->min_hpages)
1941c5ecae3SMike Kravetz 			spool->rsv_hpages = spool->min_hpages;
1951c5ecae3SMike Kravetz 	}
1961c5ecae3SMike Kravetz 
1971c5ecae3SMike Kravetz 	/*
1981c5ecae3SMike Kravetz 	 * If hugetlbfs_put_super couldn't free spool due to an outstanding
1991c5ecae3SMike Kravetz 	 * quota reference, free it now.
2001c5ecae3SMike Kravetz 	 */
20190481622SDavid Gibson 	unlock_or_release_subpool(spool);
2021c5ecae3SMike Kravetz 
2031c5ecae3SMike Kravetz 	return ret;
20490481622SDavid Gibson }
20590481622SDavid Gibson 
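/*
 * Worked example (editor's addition), continuing the one above: putting
 * back delta == 9 pages with min_hpages == 10 and rsv_hpages == 0 gives
 * rsv_hpages + delta == 9 <= min_hpages, so all 9 pages refill the
 * subpool reserve and 0 is returned.  Putting back 3 more would
 * overshoot the minimum by 2, so rsv_hpages is capped at 10 and 2 is
 * returned as the number of global reservations to drop.
 */
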
20690481622SDavid Gibson static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
20790481622SDavid Gibson {
20890481622SDavid Gibson 	return HUGETLBFS_SB(inode->i_sb)->spool;
20990481622SDavid Gibson }
21090481622SDavid Gibson 
21190481622SDavid Gibson static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
21290481622SDavid Gibson {
213496ad9aaSAl Viro 	return subpool_inode(file_inode(vma->vm_file));
21490481622SDavid Gibson }
21590481622SDavid Gibson 
216e7c4b0bfSAndy Whitcroft /*
21796822904SAndy Whitcroft  * Region tracking -- allows tracking of reservations and instantiated pages
21896822904SAndy Whitcroft  *                    across the pages in a mapping.
21984afd99bSAndy Whitcroft  *
2201dd308a7SMike Kravetz  * The region data structures are embedded into a resv_map and protected
2211dd308a7SMike Kravetz  * by a resv_map's lock.  The set of regions within the resv_map represent
2221dd308a7SMike Kravetz  * reservations for huge pages, or huge pages that have already been
2231dd308a7SMike Kravetz  * instantiated within the map.  The from and to elements are huge page
2241dd308a7SMike Kravetz  * indices into the associated mapping.  from indicates the starting index
2251dd308a7SMike Kravetz  * of the region.  to represents the first index past the end of the region.
2261dd308a7SMike Kravetz  *
2271dd308a7SMike Kravetz  * For example, a file region structure with from == 0 and to == 4 represents
2281dd308a7SMike Kravetz  * four huge pages in a mapping.  It is important to note that the to element
2291dd308a7SMike Kravetz  * represents the first element past the end of the region. This is used in
2301dd308a7SMike Kravetz  * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
2311dd308a7SMike Kravetz  *
2321dd308a7SMike Kravetz  * Interval notation of the form [from, to) will be used to indicate that
2331dd308a7SMike Kravetz  * the endpoint from is inclusive and to is exclusive.
23496822904SAndy Whitcroft  */
23596822904SAndy Whitcroft struct file_region {
23696822904SAndy Whitcroft 	struct list_head link;
23796822904SAndy Whitcroft 	long from;
23896822904SAndy Whitcroft 	long to;
23996822904SAndy Whitcroft };
24096822904SAndy Whitcroft 
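/*
 * Illustrative example (editor's addition, not part of the original
 * source): a reserve map covering huge page indices 0-1 and 4-5 of a
 * mapping holds two file_region entries on resv->regions, kept sorted
 * by offset:
 *
 *	{ .from = 0, .to = 2 } -> { .from = 4, .to = 6 }
 */
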
2411dd308a7SMike Kravetz /*
2421dd308a7SMike Kravetz  * Add the huge page range represented by [f, t) to the reserve
2431dd308a7SMike Kravetz  * map.  Existing regions will be expanded to accommodate the
2441dd308a7SMike Kravetz  * specified range.  We know only existing regions need to be
2451dd308a7SMike Kravetz  * expanded, because region_add is only called after region_chg
2461dd308a7SMike Kravetz  * with the same range.  If a new file_region structure must
2471dd308a7SMike Kravetz  * be allocated, it is done in region_chg.
2481dd308a7SMike Kravetz  */
2491406ec9bSJoonsoo Kim static long region_add(struct resv_map *resv, long f, long t)
25096822904SAndy Whitcroft {
2511406ec9bSJoonsoo Kim 	struct list_head *head = &resv->regions;
25296822904SAndy Whitcroft 	struct file_region *rg, *nrg, *trg;
25396822904SAndy Whitcroft 
2547b24d861SDavidlohr Bueso 	spin_lock(&resv->lock);
25596822904SAndy Whitcroft 	/* Locate the region we are either in or before. */
25696822904SAndy Whitcroft 	list_for_each_entry(rg, head, link)
25796822904SAndy Whitcroft 		if (f <= rg->to)
25896822904SAndy Whitcroft 			break;
25996822904SAndy Whitcroft 
26096822904SAndy Whitcroft 	/* Round our left edge to the current segment if it encloses us. */
26196822904SAndy Whitcroft 	if (f > rg->from)
26296822904SAndy Whitcroft 		f = rg->from;
26396822904SAndy Whitcroft 
26496822904SAndy Whitcroft 	/* Check for and consume any regions we now overlap with. */
26596822904SAndy Whitcroft 	nrg = rg;
26696822904SAndy Whitcroft 	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
26796822904SAndy Whitcroft 		if (&rg->link == head)
26896822904SAndy Whitcroft 			break;
26996822904SAndy Whitcroft 		if (rg->from > t)
27096822904SAndy Whitcroft 			break;
27196822904SAndy Whitcroft 
27296822904SAndy Whitcroft 		/* If this area reaches higher, extend our area to
27396822904SAndy Whitcroft 		 * include it completely.  If this is not the first area
27496822904SAndy Whitcroft 		 * which we intend to reuse, free it. */
27596822904SAndy Whitcroft 		if (rg->to > t)
27696822904SAndy Whitcroft 			t = rg->to;
27796822904SAndy Whitcroft 		if (rg != nrg) {
27896822904SAndy Whitcroft 			list_del(&rg->link);
27996822904SAndy Whitcroft 			kfree(rg);
28096822904SAndy Whitcroft 		}
28196822904SAndy Whitcroft 	}
28296822904SAndy Whitcroft 	nrg->from = f;
28396822904SAndy Whitcroft 	nrg->to = t;
2847b24d861SDavidlohr Bueso 	spin_unlock(&resv->lock);
28596822904SAndy Whitcroft 	return 0;
28696822904SAndy Whitcroft }
28796822904SAndy Whitcroft 
2881dd308a7SMike Kravetz /*
2891dd308a7SMike Kravetz  * Examine the existing reserve map and determine how many
2901dd308a7SMike Kravetz  * huge pages in the specified range [f, t) are NOT currently
2911dd308a7SMike Kravetz  * represented.  This routine is called before a subsequent
2921dd308a7SMike Kravetz  * call to region_add that will actually modify the reserve
2931dd308a7SMike Kravetz  * map to add the specified range [f, t).  region_chg does
2941dd308a7SMike Kravetz  * not change the number of huge pages represented by the
2951dd308a7SMike Kravetz  * map.  However, if the existing regions in the map cannot
2961dd308a7SMike Kravetz  * be expanded to represent the new range, a new file_region
2971dd308a7SMike Kravetz  * structure is added to the map as a placeholder.  This is
2981dd308a7SMike Kravetz  * so that the subsequent region_add call will have all the
2991dd308a7SMike Kravetz  * regions it needs and will not fail.
3001dd308a7SMike Kravetz  *
3011dd308a7SMike Kravetz  * Returns the number of huge pages that need to be added
3021dd308a7SMike Kravetz  * to the existing reservation map for the range [f, t).
3031dd308a7SMike Kravetz  * This number is greater than or equal to zero.  -ENOMEM is
3041dd308a7SMike Kravetz  * returned if a new file_region structure is needed and cannot
3051dd308a7SMike Kravetz  * be allocated.
3061dd308a7SMike Kravetz  */
3071406ec9bSJoonsoo Kim static long region_chg(struct resv_map *resv, long f, long t)
30896822904SAndy Whitcroft {
3091406ec9bSJoonsoo Kim 	struct list_head *head = &resv->regions;
3107b24d861SDavidlohr Bueso 	struct file_region *rg, *nrg = NULL;
31196822904SAndy Whitcroft 	long chg = 0;
31296822904SAndy Whitcroft 
3137b24d861SDavidlohr Bueso retry:
3147b24d861SDavidlohr Bueso 	spin_lock(&resv->lock);
31596822904SAndy Whitcroft 	/* Locate the region we are before or in. */
31696822904SAndy Whitcroft 	list_for_each_entry(rg, head, link)
31796822904SAndy Whitcroft 		if (f <= rg->to)
31896822904SAndy Whitcroft 			break;
31996822904SAndy Whitcroft 
32096822904SAndy Whitcroft 	/* If we are below the current region then a new region is required.
32196822904SAndy Whitcroft 	 * Subtle: allocate a new region at the position but make it zero
32296822904SAndy Whitcroft 	 * size such that we can guarantee to record the reservation. */
32396822904SAndy Whitcroft 	if (&rg->link == head || t < rg->from) {
3247b24d861SDavidlohr Bueso 		if (!nrg) {
3257b24d861SDavidlohr Bueso 			spin_unlock(&resv->lock);
32696822904SAndy Whitcroft 			nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
32796822904SAndy Whitcroft 			if (!nrg)
32896822904SAndy Whitcroft 				return -ENOMEM;
3297b24d861SDavidlohr Bueso 
33096822904SAndy Whitcroft 			nrg->from = f;
33196822904SAndy Whitcroft 			nrg->to   = f;
33296822904SAndy Whitcroft 			INIT_LIST_HEAD(&nrg->link);
3337b24d861SDavidlohr Bueso 			goto retry;
3347b24d861SDavidlohr Bueso 		}
33596822904SAndy Whitcroft 
3367b24d861SDavidlohr Bueso 		list_add(&nrg->link, rg->link.prev);
3377b24d861SDavidlohr Bueso 		chg = t - f;
3387b24d861SDavidlohr Bueso 		goto out_nrg;
33996822904SAndy Whitcroft 	}
34096822904SAndy Whitcroft 
34196822904SAndy Whitcroft 	/* Round our left edge to the current segment if it encloses us. */
34296822904SAndy Whitcroft 	if (f > rg->from)
34396822904SAndy Whitcroft 		f = rg->from;
34496822904SAndy Whitcroft 	chg = t - f;
34596822904SAndy Whitcroft 
34696822904SAndy Whitcroft 	/* Check for and consume any regions we now overlap with. */
34796822904SAndy Whitcroft 	list_for_each_entry(rg, rg->link.prev, link) {
34896822904SAndy Whitcroft 		if (&rg->link == head)
34996822904SAndy Whitcroft 			break;
35096822904SAndy Whitcroft 		if (rg->from > t)
3517b24d861SDavidlohr Bueso 			goto out;
35296822904SAndy Whitcroft 
35425985edcSLucas De Marchi 		/* We overlap with this area; if it extends further than
35496822904SAndy Whitcroft 		 * us, we must extend ourselves.  Account for its
35596822904SAndy Whitcroft 		 * existing reservation. */
35696822904SAndy Whitcroft 		if (rg->to > t) {
35796822904SAndy Whitcroft 			chg += rg->to - t;
35896822904SAndy Whitcroft 			t = rg->to;
35996822904SAndy Whitcroft 		}
36096822904SAndy Whitcroft 		chg -= rg->to - rg->from;
36196822904SAndy Whitcroft 	}
3627b24d861SDavidlohr Bueso 
3637b24d861SDavidlohr Bueso out:
3647b24d861SDavidlohr Bueso 	spin_unlock(&resv->lock);
3657b24d861SDavidlohr Bueso 	/* We already know we raced and no longer need the new region */
3667b24d861SDavidlohr Bueso 	kfree(nrg);
3677b24d861SDavidlohr Bueso 	return chg;
3687b24d861SDavidlohr Bueso out_nrg:
3697b24d861SDavidlohr Bueso 	spin_unlock(&resv->lock);
37096822904SAndy Whitcroft 	return chg;
37196822904SAndy Whitcroft }
37296822904SAndy Whitcroft 
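/*
 * Usage sketch (editor's addition, not part of the original source):
 * callers pair the two routines, computing the needed reservation first
 * and committing it only once the pages have been charged:
 *
 *	chg = region_chg(resv, f, t);	(may allocate a placeholder entry)
 *	if (chg < 0)
 *		return chg;
 *	... charge chg huge pages against the global pool ...
 *	region_add(resv, f, t);		(cannot fail; placeholder exists)
 */
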
3731dd308a7SMike Kravetz /*
3741dd308a7SMike Kravetz  * Truncate the reserve map at index 'end'.  Modify/truncate any
3751dd308a7SMike Kravetz  * region which contains end.  Delete any regions past end.
3761dd308a7SMike Kravetz  * Return the number of huge pages removed from the map.
3771dd308a7SMike Kravetz  */
3781406ec9bSJoonsoo Kim static long region_truncate(struct resv_map *resv, long end)
37996822904SAndy Whitcroft {
3801406ec9bSJoonsoo Kim 	struct list_head *head = &resv->regions;
38196822904SAndy Whitcroft 	struct file_region *rg, *trg;
38296822904SAndy Whitcroft 	long chg = 0;
38396822904SAndy Whitcroft 
3847b24d861SDavidlohr Bueso 	spin_lock(&resv->lock);
38596822904SAndy Whitcroft 	/* Locate the region we are either in or before. */
38696822904SAndy Whitcroft 	list_for_each_entry(rg, head, link)
38796822904SAndy Whitcroft 		if (end <= rg->to)
38896822904SAndy Whitcroft 			break;
38996822904SAndy Whitcroft 	if (&rg->link == head)
3907b24d861SDavidlohr Bueso 		goto out;
39196822904SAndy Whitcroft 
39296822904SAndy Whitcroft 	/* If we are in the middle of a region then adjust it. */
39396822904SAndy Whitcroft 	if (end > rg->from) {
39496822904SAndy Whitcroft 		chg = rg->to - end;
39596822904SAndy Whitcroft 		rg->to = end;
39696822904SAndy Whitcroft 		rg = list_entry(rg->link.next, typeof(*rg), link);
39796822904SAndy Whitcroft 	}
39896822904SAndy Whitcroft 
39996822904SAndy Whitcroft 	/* Drop any remaining regions. */
40096822904SAndy Whitcroft 	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
40196822904SAndy Whitcroft 		if (&rg->link == head)
40296822904SAndy Whitcroft 			break;
40396822904SAndy Whitcroft 		chg += rg->to - rg->from;
40496822904SAndy Whitcroft 		list_del(&rg->link);
40596822904SAndy Whitcroft 		kfree(rg);
40696822904SAndy Whitcroft 	}
4077b24d861SDavidlohr Bueso 
4087b24d861SDavidlohr Bueso out:
4097b24d861SDavidlohr Bueso 	spin_unlock(&resv->lock);
41096822904SAndy Whitcroft 	return chg;
41196822904SAndy Whitcroft }
41296822904SAndy Whitcroft 
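/*
 * Worked example (editor's addition, not part of the original source):
 * truncating a map holding [0, 4) and [6, 10) at end == 2 shrinks the
 * first region to [0, 2) and deletes the second, returning
 * 2 + 4 == 6 huge pages removed.
 */
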
4131dd308a7SMike Kravetz /*
4141dd308a7SMike Kravetz  * Count and return the number of huge pages in the reserve map
4151dd308a7SMike Kravetz  * that intersect with the range [f, t).
4161dd308a7SMike Kravetz  */
4171406ec9bSJoonsoo Kim static long region_count(struct resv_map *resv, long f, long t)
41884afd99bSAndy Whitcroft {
4191406ec9bSJoonsoo Kim 	struct list_head *head = &resv->regions;
42084afd99bSAndy Whitcroft 	struct file_region *rg;
42184afd99bSAndy Whitcroft 	long chg = 0;
42284afd99bSAndy Whitcroft 
4237b24d861SDavidlohr Bueso 	spin_lock(&resv->lock);
42484afd99bSAndy Whitcroft 	/* Locate each segment we overlap with, and count that overlap. */
42584afd99bSAndy Whitcroft 	list_for_each_entry(rg, head, link) {
426f2135a4aSWang Sheng-Hui 		long seg_from;
427f2135a4aSWang Sheng-Hui 		long seg_to;
42884afd99bSAndy Whitcroft 
42984afd99bSAndy Whitcroft 		if (rg->to <= f)
43084afd99bSAndy Whitcroft 			continue;
43184afd99bSAndy Whitcroft 		if (rg->from >= t)
43284afd99bSAndy Whitcroft 			break;
43384afd99bSAndy Whitcroft 
43484afd99bSAndy Whitcroft 		seg_from = max(rg->from, f);
43584afd99bSAndy Whitcroft 		seg_to = min(rg->to, t);
43684afd99bSAndy Whitcroft 
43784afd99bSAndy Whitcroft 		chg += seg_to - seg_from;
43884afd99bSAndy Whitcroft 	}
4397b24d861SDavidlohr Bueso 	spin_unlock(&resv->lock);
44084afd99bSAndy Whitcroft 
44184afd99bSAndy Whitcroft 	return chg;
44284afd99bSAndy Whitcroft }
44384afd99bSAndy Whitcroft 
44496822904SAndy Whitcroft /*
445e7c4b0bfSAndy Whitcroft  * Convert the address within this vma to the page offset within
446e7c4b0bfSAndy Whitcroft  * the mapping, in pagecache page units; huge pages here.
447e7c4b0bfSAndy Whitcroft  */
448a5516438SAndi Kleen static pgoff_t vma_hugecache_offset(struct hstate *h,
449a5516438SAndi Kleen 			struct vm_area_struct *vma, unsigned long address)
450e7c4b0bfSAndy Whitcroft {
451a5516438SAndi Kleen 	return ((address - vma->vm_start) >> huge_page_shift(h)) +
452a5516438SAndi Kleen 			(vma->vm_pgoff >> huge_page_order(h));
453e7c4b0bfSAndy Whitcroft }
454e7c4b0bfSAndy Whitcroft 
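/*
 * Worked example (editor's addition, not part of the original source):
 * for a 2MB hstate (huge_page_shift == 21, huge_page_order == 9), a VMA
 * with vm_start == 0x40000000 and vm_pgoff == 0 maps address 0x40400000
 * to ((0x40400000 - 0x40000000) >> 21) + (0 >> 9) == 2, i.e. the third
 * huge-page-sized unit of the backing file.
 */
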
4550fe6e20bSNaoya Horiguchi pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
4560fe6e20bSNaoya Horiguchi 				     unsigned long address)
4570fe6e20bSNaoya Horiguchi {
4580fe6e20bSNaoya Horiguchi 	return vma_hugecache_offset(hstate_vma(vma), vma, address);
4590fe6e20bSNaoya Horiguchi }
4600fe6e20bSNaoya Horiguchi 
46184afd99bSAndy Whitcroft /*
46208fba699SMel Gorman  * Return the size of the pages allocated when backing a VMA. In the majority
46308fba699SMel Gorman  * of cases this will be the same size as used by the page table entries.
46408fba699SMel Gorman  */
46508fba699SMel Gorman unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
46608fba699SMel Gorman {
46708fba699SMel Gorman 	struct hstate *hstate;
46808fba699SMel Gorman 
46908fba699SMel Gorman 	if (!is_vm_hugetlb_page(vma))
47008fba699SMel Gorman 		return PAGE_SIZE;
47108fba699SMel Gorman 
47208fba699SMel Gorman 	hstate = hstate_vma(vma);
47308fba699SMel Gorman 
4742415cf12SWanpeng Li 	return 1UL << huge_page_shift(hstate);
47508fba699SMel Gorman }
476f340ca0fSJoerg Roedel EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
47708fba699SMel Gorman 
47808fba699SMel Gorman /*
4793340289dSMel Gorman  * Return the page size being used by the MMU to back a VMA. In the majority
4803340289dSMel Gorman  * of cases, the page size used by the kernel matches the MMU size. On
4813340289dSMel Gorman  * architectures where it differs, an architecture-specific version of this
4823340289dSMel Gorman  * function is required.
4833340289dSMel Gorman  */
4843340289dSMel Gorman #ifndef vma_mmu_pagesize
4853340289dSMel Gorman unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
4863340289dSMel Gorman {
4873340289dSMel Gorman 	return vma_kernel_pagesize(vma);
4883340289dSMel Gorman }
4893340289dSMel Gorman #endif
4903340289dSMel Gorman 
4913340289dSMel Gorman /*
49284afd99bSAndy Whitcroft  * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
49384afd99bSAndy Whitcroft  * bits of the reservation map pointer, which are always clear due to
49484afd99bSAndy Whitcroft  * alignment.
49584afd99bSAndy Whitcroft  */
49684afd99bSAndy Whitcroft #define HPAGE_RESV_OWNER    (1UL << 0)
49784afd99bSAndy Whitcroft #define HPAGE_RESV_UNMAPPED (1UL << 1)
49804f2cbe3SMel Gorman #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
49984afd99bSAndy Whitcroft 
500a1e78772SMel Gorman /*
501a1e78772SMel Gorman  * These helpers are used to track how many pages are reserved for
502a1e78772SMel Gorman  * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
503a1e78772SMel Gorman  * is guaranteed to have its future faults succeed.
504a1e78772SMel Gorman  *
505a1e78772SMel Gorman  * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
506a1e78772SMel Gorman  * the reserve counters are updated with the hugetlb_lock held. It is safe
507a1e78772SMel Gorman  * to reset the VMA at fork() time as it is not in use yet and there is no
508a1e78772SMel Gorman  * chance of the global counters getting corrupted as a result of the values.
50984afd99bSAndy Whitcroft  *
51084afd99bSAndy Whitcroft  * The private mapping reservation is represented in a subtly different
51184afd99bSAndy Whitcroft  * manner to a shared mapping.  A shared mapping has a region map associated
51284afd99bSAndy Whitcroft  * with the underlying file; this region map represents the backing file
51384afd99bSAndy Whitcroft  * pages which have ever had a reservation assigned, and this persists even
51484afd99bSAndy Whitcroft  * after the page is instantiated.  A private mapping has a region map
51584afd99bSAndy Whitcroft  * associated with the original mmap which is attached to all VMAs which
51684afd99bSAndy Whitcroft  * reference it; this region map represents those offsets which have consumed
51784afd99bSAndy Whitcroft  * reservation, i.e. where pages have been instantiated.
518a1e78772SMel Gorman  */
519e7c4b0bfSAndy Whitcroft static unsigned long get_vma_private_data(struct vm_area_struct *vma)
520e7c4b0bfSAndy Whitcroft {
521e7c4b0bfSAndy Whitcroft 	return (unsigned long)vma->vm_private_data;
522e7c4b0bfSAndy Whitcroft }
523e7c4b0bfSAndy Whitcroft 
524e7c4b0bfSAndy Whitcroft static void set_vma_private_data(struct vm_area_struct *vma,
525e7c4b0bfSAndy Whitcroft 							unsigned long value)
526e7c4b0bfSAndy Whitcroft {
527e7c4b0bfSAndy Whitcroft 	vma->vm_private_data = (void *)value;
528e7c4b0bfSAndy Whitcroft }
529e7c4b0bfSAndy Whitcroft 
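/*
 * Illustrative note (editor's addition, not part of the original
 * source): for a MAP_PRIVATE mapping, vm_private_data packs a
 * struct resv_map pointer together with the HPAGE_RESV_* flags defined
 * above, e.g.
 *
 *	set_vma_private_data(vma, (unsigned long)resv_map | HPAGE_RESV_OWNER);
 *
 * which works because the pointer is at least word aligned, so its
 * bottom two bits are always clear.
 */
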
5309119a41eSJoonsoo Kim struct resv_map *resv_map_alloc(void)
53184afd99bSAndy Whitcroft {
53284afd99bSAndy Whitcroft 	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
53384afd99bSAndy Whitcroft 	if (!resv_map)
53484afd99bSAndy Whitcroft 		return NULL;
53584afd99bSAndy Whitcroft 
53684afd99bSAndy Whitcroft 	kref_init(&resv_map->refs);
5377b24d861SDavidlohr Bueso 	spin_lock_init(&resv_map->lock);
53884afd99bSAndy Whitcroft 	INIT_LIST_HEAD(&resv_map->regions);
53984afd99bSAndy Whitcroft 
54084afd99bSAndy Whitcroft 	return resv_map;
54184afd99bSAndy Whitcroft }
54284afd99bSAndy Whitcroft 
5439119a41eSJoonsoo Kim void resv_map_release(struct kref *ref)
54484afd99bSAndy Whitcroft {
54584afd99bSAndy Whitcroft 	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
54684afd99bSAndy Whitcroft 
54784afd99bSAndy Whitcroft 	/* Clear out any active regions before we release the map. */
5481406ec9bSJoonsoo Kim 	region_truncate(resv_map, 0);
54984afd99bSAndy Whitcroft 	kfree(resv_map);
55084afd99bSAndy Whitcroft }
55184afd99bSAndy Whitcroft 
5524e35f483SJoonsoo Kim static inline struct resv_map *inode_resv_map(struct inode *inode)
5534e35f483SJoonsoo Kim {
5544e35f483SJoonsoo Kim 	return inode->i_mapping->private_data;
5554e35f483SJoonsoo Kim }
5564e35f483SJoonsoo Kim 
55784afd99bSAndy Whitcroft static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
558a1e78772SMel Gorman {
55981d1b09cSSasha Levin 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
5604e35f483SJoonsoo Kim 	if (vma->vm_flags & VM_MAYSHARE) {
5614e35f483SJoonsoo Kim 		struct address_space *mapping = vma->vm_file->f_mapping;
5624e35f483SJoonsoo Kim 		struct inode *inode = mapping->host;
5634e35f483SJoonsoo Kim 
5644e35f483SJoonsoo Kim 		return inode_resv_map(inode);
5654e35f483SJoonsoo Kim 
5664e35f483SJoonsoo Kim 	} else {
56784afd99bSAndy Whitcroft 		return (struct resv_map *)(get_vma_private_data(vma) &
56884afd99bSAndy Whitcroft 							~HPAGE_RESV_MASK);
5694e35f483SJoonsoo Kim 	}
570a1e78772SMel Gorman }
571a1e78772SMel Gorman 
57284afd99bSAndy Whitcroft static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
573a1e78772SMel Gorman {
57481d1b09cSSasha Levin 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
57581d1b09cSSasha Levin 	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
576a1e78772SMel Gorman 
57784afd99bSAndy Whitcroft 	set_vma_private_data(vma, (get_vma_private_data(vma) &
57884afd99bSAndy Whitcroft 				HPAGE_RESV_MASK) | (unsigned long)map);
57904f2cbe3SMel Gorman }
58004f2cbe3SMel Gorman 
58104f2cbe3SMel Gorman static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
58204f2cbe3SMel Gorman {
58381d1b09cSSasha Levin 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
58481d1b09cSSasha Levin 	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
585e7c4b0bfSAndy Whitcroft 
586e7c4b0bfSAndy Whitcroft 	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
58704f2cbe3SMel Gorman }
58804f2cbe3SMel Gorman 
58904f2cbe3SMel Gorman static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
59004f2cbe3SMel Gorman {
59181d1b09cSSasha Levin 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
592e7c4b0bfSAndy Whitcroft 
593e7c4b0bfSAndy Whitcroft 	return (get_vma_private_data(vma) & flag) != 0;
594a1e78772SMel Gorman }
595a1e78772SMel Gorman 
59604f2cbe3SMel Gorman /* Reset counters to 0 and clear all HPAGE_RESV_* flags */
597a1e78772SMel Gorman void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
598a1e78772SMel Gorman {
59981d1b09cSSasha Levin 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
600f83a275dSMel Gorman 	if (!(vma->vm_flags & VM_MAYSHARE))
601a1e78772SMel Gorman 		vma->vm_private_data = (void *)0;
602a1e78772SMel Gorman }
603a1e78772SMel Gorman 
604a1e78772SMel Gorman /* Returns true if the VMA has associated reserve pages */
605af0ed73eSJoonsoo Kim static int vma_has_reserves(struct vm_area_struct *vma, long chg)
606a1e78772SMel Gorman {
607af0ed73eSJoonsoo Kim 	if (vma->vm_flags & VM_NORESERVE) {
608af0ed73eSJoonsoo Kim 		/*
609af0ed73eSJoonsoo Kim 		 * This address is already reserved by another process (chg == 0),
610af0ed73eSJoonsoo Kim 		 * so we should decrement the reserved count. Without decrementing,
611af0ed73eSJoonsoo Kim 		 * the reserve count remains after releasing the inode, because this
612af0ed73eSJoonsoo Kim 		 * allocated page will go into the page cache and is regarded as
613af0ed73eSJoonsoo Kim 		 * coming from the reserved pool in the releasing step.  Currently, we
614af0ed73eSJoonsoo Kim 		 * don't have any other solution to deal with this situation
615af0ed73eSJoonsoo Kim 		 * properly, so add work-around here.
616af0ed73eSJoonsoo Kim 		 */
617af0ed73eSJoonsoo Kim 		if (vma->vm_flags & VM_MAYSHARE && chg == 0)
618af0ed73eSJoonsoo Kim 			return 1;
619af0ed73eSJoonsoo Kim 		else
62072231b03SJoonsoo Kim 			return 0;
621af0ed73eSJoonsoo Kim 	}
622a63884e9SJoonsoo Kim 
623a63884e9SJoonsoo Kim 	/* Shared mappings always use reserves */
624f83a275dSMel Gorman 	if (vma->vm_flags & VM_MAYSHARE)
625a1e78772SMel Gorman 		return 1;
626a63884e9SJoonsoo Kim 
627a63884e9SJoonsoo Kim 	/*
628a63884e9SJoonsoo Kim 	 * Only the process that called mmap() has reserves for
629a63884e9SJoonsoo Kim 	 * private mappings.
630a63884e9SJoonsoo Kim 	 */
6317f09ca51SMel Gorman 	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
6327f09ca51SMel Gorman 		return 1;
633a63884e9SJoonsoo Kim 
6347f09ca51SMel Gorman 	return 0;
635a1e78772SMel Gorman }
636a1e78772SMel Gorman 
637a5516438SAndi Kleen static void enqueue_huge_page(struct hstate *h, struct page *page)
6381da177e4SLinus Torvalds {
6391da177e4SLinus Torvalds 	int nid = page_to_nid(page);
6400edaecfaSAneesh Kumar K.V 	list_move(&page->lru, &h->hugepage_freelists[nid]);
641a5516438SAndi Kleen 	h->free_huge_pages++;
642a5516438SAndi Kleen 	h->free_huge_pages_node[nid]++;
6431da177e4SLinus Torvalds }
6441da177e4SLinus Torvalds 
645bf50bab2SNaoya Horiguchi static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
646bf50bab2SNaoya Horiguchi {
647bf50bab2SNaoya Horiguchi 	struct page *page;
648bf50bab2SNaoya Horiguchi 
649c8721bbbSNaoya Horiguchi 	list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
650c8721bbbSNaoya Horiguchi 		if (!is_migrate_isolate_page(page))
651c8721bbbSNaoya Horiguchi 			break;
652c8721bbbSNaoya Horiguchi 	/*
653c8721bbbSNaoya Horiguchi 	 * if a 'non-isolated free hugepage' is not found on the list,
654c8721bbbSNaoya Horiguchi 	 * the allocation fails.
655c8721bbbSNaoya Horiguchi 	 */
656c8721bbbSNaoya Horiguchi 	if (&h->hugepage_freelists[nid] == &page->lru)
657bf50bab2SNaoya Horiguchi 		return NULL;
6580edaecfaSAneesh Kumar K.V 	list_move(&page->lru, &h->hugepage_activelist);
659a9869b83SNaoya Horiguchi 	set_page_refcounted(page);
660bf50bab2SNaoya Horiguchi 	h->free_huge_pages--;
661bf50bab2SNaoya Horiguchi 	h->free_huge_pages_node[nid]--;
662bf50bab2SNaoya Horiguchi 	return page;
663bf50bab2SNaoya Horiguchi }
664bf50bab2SNaoya Horiguchi 
66586cdb465SNaoya Horiguchi /* Movability of hugepages depends on migration support. */
66686cdb465SNaoya Horiguchi static inline gfp_t htlb_alloc_mask(struct hstate *h)
66786cdb465SNaoya Horiguchi {
668100873d7SNaoya Horiguchi 	if (hugepages_treat_as_movable || hugepage_migration_supported(h))
66986cdb465SNaoya Horiguchi 		return GFP_HIGHUSER_MOVABLE;
67086cdb465SNaoya Horiguchi 	else
67186cdb465SNaoya Horiguchi 		return GFP_HIGHUSER;
67286cdb465SNaoya Horiguchi }
67386cdb465SNaoya Horiguchi 
674a5516438SAndi Kleen static struct page *dequeue_huge_page_vma(struct hstate *h,
675a5516438SAndi Kleen 				struct vm_area_struct *vma,
676af0ed73eSJoonsoo Kim 				unsigned long address, int avoid_reserve,
677af0ed73eSJoonsoo Kim 				long chg)
6781da177e4SLinus Torvalds {
679b1c12cbcSKonstantin Khlebnikov 	struct page *page = NULL;
680480eccf9SLee Schermerhorn 	struct mempolicy *mpol;
68119770b32SMel Gorman 	nodemask_t *nodemask;
682c0ff7453SMiao Xie 	struct zonelist *zonelist;
683dd1a239fSMel Gorman 	struct zone *zone;
684dd1a239fSMel Gorman 	struct zoneref *z;
685cc9a6c87SMel Gorman 	unsigned int cpuset_mems_cookie;
6861da177e4SLinus Torvalds 
687a1e78772SMel Gorman 	/*
688a1e78772SMel Gorman 	 * A child process with MAP_PRIVATE mappings created by its parent
689a1e78772SMel Gorman 	 * has no page reserves. This check ensures that reservations are
690a1e78772SMel Gorman 	 * not "stolen". The child may still get SIGKILLed
691a1e78772SMel Gorman 	 */
692af0ed73eSJoonsoo Kim 	if (!vma_has_reserves(vma, chg) &&
693a5516438SAndi Kleen 			h->free_huge_pages - h->resv_huge_pages == 0)
694c0ff7453SMiao Xie 		goto err;
695a1e78772SMel Gorman 
69604f2cbe3SMel Gorman 	/* If reserves cannot be used, ensure enough pages are in the pool */
697a5516438SAndi Kleen 	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
6986eab04a8SJustin P. Mattock 		goto err;
69904f2cbe3SMel Gorman 
7009966c4bbSJoonsoo Kim retry_cpuset:
701d26914d1SMel Gorman 	cpuset_mems_cookie = read_mems_allowed_begin();
7029966c4bbSJoonsoo Kim 	zonelist = huge_zonelist(vma, address,
70386cdb465SNaoya Horiguchi 					htlb_alloc_mask(h), &mpol, &nodemask);
7049966c4bbSJoonsoo Kim 
70519770b32SMel Gorman 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
70619770b32SMel Gorman 						MAX_NR_ZONES - 1, nodemask) {
707344736f2SVladimir Davydov 		if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) {
708bf50bab2SNaoya Horiguchi 			page = dequeue_huge_page_node(h, zone_to_nid(zone));
709bf50bab2SNaoya Horiguchi 			if (page) {
710af0ed73eSJoonsoo Kim 				if (avoid_reserve)
711af0ed73eSJoonsoo Kim 					break;
712af0ed73eSJoonsoo Kim 				if (!vma_has_reserves(vma, chg))
713af0ed73eSJoonsoo Kim 					break;
714af0ed73eSJoonsoo Kim 
71507443a85SJoonsoo Kim 				SetPagePrivate(page);
716a63884e9SJoonsoo Kim 				h->resv_huge_pages--;
7175ab3ee7bSKen Chen 				break;
7181da177e4SLinus Torvalds 			}
7193abf7afdSAndrew Morton 		}
720bf50bab2SNaoya Horiguchi 	}
721cc9a6c87SMel Gorman 
722cc9a6c87SMel Gorman 	mpol_cond_put(mpol);
723d26914d1SMel Gorman 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
724cc9a6c87SMel Gorman 		goto retry_cpuset;
725cc9a6c87SMel Gorman 	return page;
726cc9a6c87SMel Gorman 
727c0ff7453SMiao Xie err:
728cc9a6c87SMel Gorman 	return NULL;
7291da177e4SLinus Torvalds }
7301da177e4SLinus Torvalds 
7311cac6f2cSLuiz Capitulino /*
7321cac6f2cSLuiz Capitulino  * common helper functions for hstate_next_node_to_{alloc|free}.
7331cac6f2cSLuiz Capitulino  * We may have allocated or freed a huge page based on a different
7341cac6f2cSLuiz Capitulino  * nodes_allowed previously, so h->next_node_to_{alloc|free} might
7351cac6f2cSLuiz Capitulino  * be outside of *nodes_allowed.  Ensure that we use an allowed
7361cac6f2cSLuiz Capitulino  * node for alloc or free.
7371cac6f2cSLuiz Capitulino  */
7381cac6f2cSLuiz Capitulino static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
7391cac6f2cSLuiz Capitulino {
7401cac6f2cSLuiz Capitulino 	nid = next_node(nid, *nodes_allowed);
7411cac6f2cSLuiz Capitulino 	if (nid == MAX_NUMNODES)
7421cac6f2cSLuiz Capitulino 		nid = first_node(*nodes_allowed);
7431cac6f2cSLuiz Capitulino 	VM_BUG_ON(nid >= MAX_NUMNODES);
7441cac6f2cSLuiz Capitulino 
7451cac6f2cSLuiz Capitulino 	return nid;
7461cac6f2cSLuiz Capitulino }
7471cac6f2cSLuiz Capitulino 
7481cac6f2cSLuiz Capitulino static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
7491cac6f2cSLuiz Capitulino {
7501cac6f2cSLuiz Capitulino 	if (!node_isset(nid, *nodes_allowed))
7511cac6f2cSLuiz Capitulino 		nid = next_node_allowed(nid, nodes_allowed);
7521cac6f2cSLuiz Capitulino 	return nid;
7531cac6f2cSLuiz Capitulino }
7541cac6f2cSLuiz Capitulino 
7551cac6f2cSLuiz Capitulino /*
7561cac6f2cSLuiz Capitulino  * returns the previously saved node ["this node"] from which to
7571cac6f2cSLuiz Capitulino  * allocate a persistent huge page for the pool and advance the
7581cac6f2cSLuiz Capitulino  * next node from which to allocate, handling wrap at end of node
7591cac6f2cSLuiz Capitulino  * mask.
7601cac6f2cSLuiz Capitulino  */
7611cac6f2cSLuiz Capitulino static int hstate_next_node_to_alloc(struct hstate *h,
7621cac6f2cSLuiz Capitulino 					nodemask_t *nodes_allowed)
7631cac6f2cSLuiz Capitulino {
7641cac6f2cSLuiz Capitulino 	int nid;
7651cac6f2cSLuiz Capitulino 
7661cac6f2cSLuiz Capitulino 	VM_BUG_ON(!nodes_allowed);
7671cac6f2cSLuiz Capitulino 
7681cac6f2cSLuiz Capitulino 	nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
7691cac6f2cSLuiz Capitulino 	h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
7701cac6f2cSLuiz Capitulino 
7711cac6f2cSLuiz Capitulino 	return nid;
7721cac6f2cSLuiz Capitulino }
7731cac6f2cSLuiz Capitulino 
7741cac6f2cSLuiz Capitulino /*
7751cac6f2cSLuiz Capitulino  * helper for free_pool_huge_page() - return the previously saved
7761cac6f2cSLuiz Capitulino  * node ["this node"] from which to free a huge page.  Advance the
7771cac6f2cSLuiz Capitulino  * next node id whether or not we find a free huge page to free so
7781cac6f2cSLuiz Capitulino  * that the next attempt to free addresses the next node.
7791cac6f2cSLuiz Capitulino  */
7801cac6f2cSLuiz Capitulino static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
7811cac6f2cSLuiz Capitulino {
7821cac6f2cSLuiz Capitulino 	int nid;
7831cac6f2cSLuiz Capitulino 
7841cac6f2cSLuiz Capitulino 	VM_BUG_ON(!nodes_allowed);
7851cac6f2cSLuiz Capitulino 
7861cac6f2cSLuiz Capitulino 	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
7871cac6f2cSLuiz Capitulino 	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
7881cac6f2cSLuiz Capitulino 
7891cac6f2cSLuiz Capitulino 	return nid;
7901cac6f2cSLuiz Capitulino }
7911cac6f2cSLuiz Capitulino 
7921cac6f2cSLuiz Capitulino #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)		\
7931cac6f2cSLuiz Capitulino 	for (nr_nodes = nodes_weight(*mask);				\
7941cac6f2cSLuiz Capitulino 		nr_nodes > 0 &&						\
7951cac6f2cSLuiz Capitulino 		((node = hstate_next_node_to_alloc(hs, mask)) || 1);	\
7961cac6f2cSLuiz Capitulino 		nr_nodes--)
7971cac6f2cSLuiz Capitulino 
7981cac6f2cSLuiz Capitulino #define for_each_node_mask_to_free(hs, nr_nodes, node, mask)		\
7991cac6f2cSLuiz Capitulino 	for (nr_nodes = nodes_weight(*mask);				\
8001cac6f2cSLuiz Capitulino 		nr_nodes > 0 &&						\
8011cac6f2cSLuiz Capitulino 		((node = hstate_next_node_to_free(hs, mask)) || 1);	\
8021cac6f2cSLuiz Capitulino 		nr_nodes--)
8031cac6f2cSLuiz Capitulino 
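/*
 * Usage sketch (editor's addition): these iterators spread persistent
 * pool allocations and frees round-robin across the allowed nodes, as in
 * alloc_fresh_gigantic_page() below:
 *
 *	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
 *		page = alloc_fresh_gigantic_page_node(h, node);
 *		if (page)
 *			return 1;
 *	}
 */
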
804944d9fecSLuiz Capitulino #if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
805944d9fecSLuiz Capitulino static void destroy_compound_gigantic_page(struct page *page,
806944d9fecSLuiz Capitulino 					unsigned long order)
807944d9fecSLuiz Capitulino {
808944d9fecSLuiz Capitulino 	int i;
809944d9fecSLuiz Capitulino 	int nr_pages = 1 << order;
810944d9fecSLuiz Capitulino 	struct page *p = page + 1;
811944d9fecSLuiz Capitulino 
812944d9fecSLuiz Capitulino 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
813944d9fecSLuiz Capitulino 		__ClearPageTail(p);
814944d9fecSLuiz Capitulino 		set_page_refcounted(p);
815944d9fecSLuiz Capitulino 		p->first_page = NULL;
816944d9fecSLuiz Capitulino 	}
817944d9fecSLuiz Capitulino 
818944d9fecSLuiz Capitulino 	set_compound_order(page, 0);
819944d9fecSLuiz Capitulino 	__ClearPageHead(page);
820944d9fecSLuiz Capitulino }
821944d9fecSLuiz Capitulino 
822944d9fecSLuiz Capitulino static void free_gigantic_page(struct page *page, unsigned order)
823944d9fecSLuiz Capitulino {
824944d9fecSLuiz Capitulino 	free_contig_range(page_to_pfn(page), 1 << order);
825944d9fecSLuiz Capitulino }
826944d9fecSLuiz Capitulino 
827944d9fecSLuiz Capitulino static int __alloc_gigantic_page(unsigned long start_pfn,
828944d9fecSLuiz Capitulino 				unsigned long nr_pages)
829944d9fecSLuiz Capitulino {
830944d9fecSLuiz Capitulino 	unsigned long end_pfn = start_pfn + nr_pages;
831944d9fecSLuiz Capitulino 	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
832944d9fecSLuiz Capitulino }
833944d9fecSLuiz Capitulino 
834944d9fecSLuiz Capitulino static bool pfn_range_valid_gigantic(unsigned long start_pfn,
835944d9fecSLuiz Capitulino 				unsigned long nr_pages)
836944d9fecSLuiz Capitulino {
837944d9fecSLuiz Capitulino 	unsigned long i, end_pfn = start_pfn + nr_pages;
838944d9fecSLuiz Capitulino 	struct page *page;
839944d9fecSLuiz Capitulino 
840944d9fecSLuiz Capitulino 	for (i = start_pfn; i < end_pfn; i++) {
841944d9fecSLuiz Capitulino 		if (!pfn_valid(i))
842944d9fecSLuiz Capitulino 			return false;
843944d9fecSLuiz Capitulino 
844944d9fecSLuiz Capitulino 		page = pfn_to_page(i);
845944d9fecSLuiz Capitulino 
846944d9fecSLuiz Capitulino 		if (PageReserved(page))
847944d9fecSLuiz Capitulino 			return false;
848944d9fecSLuiz Capitulino 
849944d9fecSLuiz Capitulino 		if (page_count(page) > 0)
850944d9fecSLuiz Capitulino 			return false;
851944d9fecSLuiz Capitulino 
852944d9fecSLuiz Capitulino 		if (PageHuge(page))
853944d9fecSLuiz Capitulino 			return false;
854944d9fecSLuiz Capitulino 	}
855944d9fecSLuiz Capitulino 
856944d9fecSLuiz Capitulino 	return true;
857944d9fecSLuiz Capitulino }
858944d9fecSLuiz Capitulino 
859944d9fecSLuiz Capitulino static bool zone_spans_last_pfn(const struct zone *zone,
860944d9fecSLuiz Capitulino 			unsigned long start_pfn, unsigned long nr_pages)
861944d9fecSLuiz Capitulino {
862944d9fecSLuiz Capitulino 	unsigned long last_pfn = start_pfn + nr_pages - 1;
863944d9fecSLuiz Capitulino 	return zone_spans_pfn(zone, last_pfn);
864944d9fecSLuiz Capitulino }
865944d9fecSLuiz Capitulino 
866944d9fecSLuiz Capitulino static struct page *alloc_gigantic_page(int nid, unsigned order)
867944d9fecSLuiz Capitulino {
868944d9fecSLuiz Capitulino 	unsigned long nr_pages = 1 << order;
869944d9fecSLuiz Capitulino 	unsigned long ret, pfn, flags;
870944d9fecSLuiz Capitulino 	struct zone *z;
871944d9fecSLuiz Capitulino 
872944d9fecSLuiz Capitulino 	z = NODE_DATA(nid)->node_zones;
873944d9fecSLuiz Capitulino 	for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
874944d9fecSLuiz Capitulino 		spin_lock_irqsave(&z->lock, flags);
875944d9fecSLuiz Capitulino 
876944d9fecSLuiz Capitulino 		pfn = ALIGN(z->zone_start_pfn, nr_pages);
877944d9fecSLuiz Capitulino 		while (zone_spans_last_pfn(z, pfn, nr_pages)) {
878944d9fecSLuiz Capitulino 			if (pfn_range_valid_gigantic(pfn, nr_pages)) {
879944d9fecSLuiz Capitulino 				/*
880944d9fecSLuiz Capitulino 				 * We release the zone lock here because
881944d9fecSLuiz Capitulino 				 * alloc_contig_range() will also lock the zone
882944d9fecSLuiz Capitulino 				 * at some point. If there's an allocation
883944d9fecSLuiz Capitulino 				 * spinning on this lock, it may win the race
884944d9fecSLuiz Capitulino 				 * and cause alloc_contig_range() to fail...
885944d9fecSLuiz Capitulino 				 */
886944d9fecSLuiz Capitulino 				spin_unlock_irqrestore(&z->lock, flags);
887944d9fecSLuiz Capitulino 				ret = __alloc_gigantic_page(pfn, nr_pages);
888944d9fecSLuiz Capitulino 				if (!ret)
889944d9fecSLuiz Capitulino 					return pfn_to_page(pfn);
890944d9fecSLuiz Capitulino 				spin_lock_irqsave(&z->lock, flags);
891944d9fecSLuiz Capitulino 			}
892944d9fecSLuiz Capitulino 			pfn += nr_pages;
893944d9fecSLuiz Capitulino 		}
894944d9fecSLuiz Capitulino 
895944d9fecSLuiz Capitulino 		spin_unlock_irqrestore(&z->lock, flags);
896944d9fecSLuiz Capitulino 	}
897944d9fecSLuiz Capitulino 
898944d9fecSLuiz Capitulino 	return NULL;
899944d9fecSLuiz Capitulino }
900944d9fecSLuiz Capitulino 
901944d9fecSLuiz Capitulino static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
902944d9fecSLuiz Capitulino static void prep_compound_gigantic_page(struct page *page, unsigned long order);
903944d9fecSLuiz Capitulino 
904944d9fecSLuiz Capitulino static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
905944d9fecSLuiz Capitulino {
906944d9fecSLuiz Capitulino 	struct page *page;
907944d9fecSLuiz Capitulino 
908944d9fecSLuiz Capitulino 	page = alloc_gigantic_page(nid, huge_page_order(h));
909944d9fecSLuiz Capitulino 	if (page) {
910944d9fecSLuiz Capitulino 		prep_compound_gigantic_page(page, huge_page_order(h));
911944d9fecSLuiz Capitulino 		prep_new_huge_page(h, page, nid);
912944d9fecSLuiz Capitulino 	}
913944d9fecSLuiz Capitulino 
914944d9fecSLuiz Capitulino 	return page;
915944d9fecSLuiz Capitulino }
916944d9fecSLuiz Capitulino 
917944d9fecSLuiz Capitulino static int alloc_fresh_gigantic_page(struct hstate *h,
918944d9fecSLuiz Capitulino 				nodemask_t *nodes_allowed)
919944d9fecSLuiz Capitulino {
920944d9fecSLuiz Capitulino 	struct page *page = NULL;
921944d9fecSLuiz Capitulino 	int nr_nodes, node;
922944d9fecSLuiz Capitulino 
923944d9fecSLuiz Capitulino 	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
924944d9fecSLuiz Capitulino 		page = alloc_fresh_gigantic_page_node(h, node);
925944d9fecSLuiz Capitulino 		if (page)
926944d9fecSLuiz Capitulino 			return 1;
927944d9fecSLuiz Capitulino 	}
928944d9fecSLuiz Capitulino 
929944d9fecSLuiz Capitulino 	return 0;
930944d9fecSLuiz Capitulino }
931944d9fecSLuiz Capitulino 
932944d9fecSLuiz Capitulino static inline bool gigantic_page_supported(void) { return true; }
933944d9fecSLuiz Capitulino #else
934944d9fecSLuiz Capitulino static inline bool gigantic_page_supported(void) { return false; }
935944d9fecSLuiz Capitulino static inline void free_gigantic_page(struct page *page, unsigned order) { }
936944d9fecSLuiz Capitulino static inline void destroy_compound_gigantic_page(struct page *page,
937944d9fecSLuiz Capitulino 						unsigned long order) { }
938944d9fecSLuiz Capitulino static inline int alloc_fresh_gigantic_page(struct hstate *h,
939944d9fecSLuiz Capitulino 					nodemask_t *nodes_allowed) { return 0; }
940944d9fecSLuiz Capitulino #endif
941944d9fecSLuiz Capitulino 
942a5516438SAndi Kleen static void update_and_free_page(struct hstate *h, struct page *page)
9436af2acb6SAdam Litke {
9446af2acb6SAdam Litke 	int i;
945a5516438SAndi Kleen 
946944d9fecSLuiz Capitulino 	if (hstate_is_gigantic(h) && !gigantic_page_supported())
947944d9fecSLuiz Capitulino 		return;
94818229df5SAndy Whitcroft 
949a5516438SAndi Kleen 	h->nr_huge_pages--;
950a5516438SAndi Kleen 	h->nr_huge_pages_node[page_to_nid(page)]--;
951a5516438SAndi Kleen 	for (i = 0; i < pages_per_huge_page(h); i++) {
95232f84528SChris Forbes 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
95332f84528SChris Forbes 				1 << PG_referenced | 1 << PG_dirty |
954a7407a27SLuiz Capitulino 				1 << PG_active | 1 << PG_private |
955a7407a27SLuiz Capitulino 				1 << PG_writeback);
9566af2acb6SAdam Litke 	}
957309381feSSasha Levin 	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
9586af2acb6SAdam Litke 	set_compound_page_dtor(page, NULL);
9596af2acb6SAdam Litke 	set_page_refcounted(page);
960944d9fecSLuiz Capitulino 	if (hstate_is_gigantic(h)) {
961944d9fecSLuiz Capitulino 		destroy_compound_gigantic_page(page, huge_page_order(h));
962944d9fecSLuiz Capitulino 		free_gigantic_page(page, huge_page_order(h));
963944d9fecSLuiz Capitulino 	} else {
9647f2e9525SGerald Schaefer 		arch_release_hugepage(page);
965a5516438SAndi Kleen 		__free_pages(page, huge_page_order(h));
9666af2acb6SAdam Litke 	}
967944d9fecSLuiz Capitulino }
9686af2acb6SAdam Litke 
969e5ff2159SAndi Kleen struct hstate *size_to_hstate(unsigned long size)
970e5ff2159SAndi Kleen {
971e5ff2159SAndi Kleen 	struct hstate *h;
972e5ff2159SAndi Kleen 
973e5ff2159SAndi Kleen 	for_each_hstate(h) {
974e5ff2159SAndi Kleen 		if (huge_page_size(h) == size)
975e5ff2159SAndi Kleen 			return h;
976e5ff2159SAndi Kleen 	}
977e5ff2159SAndi Kleen 	return NULL;
978e5ff2159SAndi Kleen }
979e5ff2159SAndi Kleen 
980bcc54222SNaoya Horiguchi /*
981bcc54222SNaoya Horiguchi  * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
982bcc54222SNaoya Horiguchi  * to hstate->hugepage_activelist).
983bcc54222SNaoya Horiguchi  *
984bcc54222SNaoya Horiguchi  * This function can be called for tail pages, but never returns true for them.
985bcc54222SNaoya Horiguchi  */
986bcc54222SNaoya Horiguchi bool page_huge_active(struct page *page)
987bcc54222SNaoya Horiguchi {
988bcc54222SNaoya Horiguchi 	VM_BUG_ON_PAGE(!PageHuge(page), page);
989bcc54222SNaoya Horiguchi 	return PageHead(page) && PagePrivate(&page[1]);
990bcc54222SNaoya Horiguchi }
991bcc54222SNaoya Horiguchi 
992bcc54222SNaoya Horiguchi /* never called for tail page */
993bcc54222SNaoya Horiguchi static void set_page_huge_active(struct page *page)
994bcc54222SNaoya Horiguchi {
995bcc54222SNaoya Horiguchi 	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
996bcc54222SNaoya Horiguchi 	SetPagePrivate(&page[1]);
997bcc54222SNaoya Horiguchi }
998bcc54222SNaoya Horiguchi 
999bcc54222SNaoya Horiguchi static void clear_page_huge_active(struct page *page)
1000bcc54222SNaoya Horiguchi {
1001bcc54222SNaoya Horiguchi 	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
1002bcc54222SNaoya Horiguchi 	ClearPagePrivate(&page[1]);
1003bcc54222SNaoya Horiguchi }
1004bcc54222SNaoya Horiguchi 
10058f1d26d0SAtsushi Kumagai void free_huge_page(struct page *page)
100627a85ef1SDavid Gibson {
1007a5516438SAndi Kleen 	/*
1008a5516438SAndi Kleen 	 * Can't pass hstate in here because it is called from the
1009a5516438SAndi Kleen 	 * compound page destructor.
1010a5516438SAndi Kleen 	 */
1011e5ff2159SAndi Kleen 	struct hstate *h = page_hstate(page);
10127893d1d5SAdam Litke 	int nid = page_to_nid(page);
101390481622SDavid Gibson 	struct hugepage_subpool *spool =
101490481622SDavid Gibson 		(struct hugepage_subpool *)page_private(page);
101507443a85SJoonsoo Kim 	bool restore_reserve;
101627a85ef1SDavid Gibson 
1017e5df70abSAndy Whitcroft 	set_page_private(page, 0);
101823be7468SMel Gorman 	page->mapping = NULL;
10197893d1d5SAdam Litke 	BUG_ON(page_count(page));
10200fe6e20bSNaoya Horiguchi 	BUG_ON(page_mapcount(page));
102107443a85SJoonsoo Kim 	restore_reserve = PagePrivate(page);
102216c794b4SJoonsoo Kim 	ClearPagePrivate(page);
102327a85ef1SDavid Gibson 
10241c5ecae3SMike Kravetz 	/*
10251c5ecae3SMike Kravetz 	 * A return code of zero implies that the subpool will be under its
10261c5ecae3SMike Kravetz 	 * minimum size if the reservation is not restored after the page is freed.
10271c5ecae3SMike Kravetz 	 * Therefore, force restore_reserve operation.
10281c5ecae3SMike Kravetz 	 */
10291c5ecae3SMike Kravetz 	if (hugepage_subpool_put_pages(spool, 1) == 0)
10301c5ecae3SMike Kravetz 		restore_reserve = true;
10311c5ecae3SMike Kravetz 
103227a85ef1SDavid Gibson 	spin_lock(&hugetlb_lock);
1033bcc54222SNaoya Horiguchi 	clear_page_huge_active(page);
10346d76dcf4SAneesh Kumar K.V 	hugetlb_cgroup_uncharge_page(hstate_index(h),
10356d76dcf4SAneesh Kumar K.V 				     pages_per_huge_page(h), page);
103607443a85SJoonsoo Kim 	if (restore_reserve)
103707443a85SJoonsoo Kim 		h->resv_huge_pages++;
103807443a85SJoonsoo Kim 
1039944d9fecSLuiz Capitulino 	if (h->surplus_huge_pages_node[nid]) {
10400edaecfaSAneesh Kumar K.V 		/* remove the page from active list */
10410edaecfaSAneesh Kumar K.V 		list_del(&page->lru);
1042a5516438SAndi Kleen 		update_and_free_page(h, page);
1043a5516438SAndi Kleen 		h->surplus_huge_pages--;
1044a5516438SAndi Kleen 		h->surplus_huge_pages_node[nid]--;
10457893d1d5SAdam Litke 	} else {
10465d3a551cSWill Deacon 		arch_clear_hugepage_flags(page);
1047a5516438SAndi Kleen 		enqueue_huge_page(h, page);
10487893d1d5SAdam Litke 	}
104927a85ef1SDavid Gibson 	spin_unlock(&hugetlb_lock);
105027a85ef1SDavid Gibson }
105127a85ef1SDavid Gibson 
1052a5516438SAndi Kleen static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
1053b7ba30c6SAndi Kleen {
10540edaecfaSAneesh Kumar K.V 	INIT_LIST_HEAD(&page->lru);
1055b7ba30c6SAndi Kleen 	set_compound_page_dtor(page, free_huge_page);
1056b7ba30c6SAndi Kleen 	spin_lock(&hugetlb_lock);
10579dd540e2SAneesh Kumar K.V 	set_hugetlb_cgroup(page, NULL);
1058a5516438SAndi Kleen 	h->nr_huge_pages++;
1059a5516438SAndi Kleen 	h->nr_huge_pages_node[nid]++;
1060b7ba30c6SAndi Kleen 	spin_unlock(&hugetlb_lock);
1061b7ba30c6SAndi Kleen 	put_page(page); /* free it into the hugepage allocator */
1062b7ba30c6SAndi Kleen }
1063b7ba30c6SAndi Kleen 
10642906dd52SLuiz Capitulino static void prep_compound_gigantic_page(struct page *page, unsigned long order)
106520a0307cSWu Fengguang {
106620a0307cSWu Fengguang 	int i;
106720a0307cSWu Fengguang 	int nr_pages = 1 << order;
106820a0307cSWu Fengguang 	struct page *p = page + 1;
106920a0307cSWu Fengguang 
107020a0307cSWu Fengguang 	/* we rely on prep_new_huge_page to set the destructor */
107120a0307cSWu Fengguang 	set_compound_order(page, order);
107220a0307cSWu Fengguang 	__SetPageHead(page);
1073ef5a22beSAndrea Arcangeli 	__ClearPageReserved(page);
107420a0307cSWu Fengguang 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1075ef5a22beSAndrea Arcangeli 		/*
1076ef5a22beSAndrea Arcangeli 		 * For gigantic hugepages allocated through bootmem at
1077ef5a22beSAndrea Arcangeli 		 * boot, it's safer to be consistent with the not-gigantic
1078ef5a22beSAndrea Arcangeli 		 * hugepages and clear the PG_reserved bit from all tail pages
1079ef5a22beSAndrea Arcangeli 		 * too.  Otherwise drivers using get_user_pages() to access tail
1080ef5a22beSAndrea Arcangeli 		 * pages may get the reference counting wrong if they see
1081ef5a22beSAndrea Arcangeli 		 * PG_reserved set on a tail page (despite the head page not
1082ef5a22beSAndrea Arcangeli 		 * having PG_reserved set).  Enforcing this consistency between
1083ef5a22beSAndrea Arcangeli 		 * head and tail pages allows drivers to optimize away a check
1084ef5a22beSAndrea Arcangeli 		 * on the head page when they need to know if put_page() is needed
1085ef5a22beSAndrea Arcangeli 		 * after get_user_pages().
1086ef5a22beSAndrea Arcangeli 		 */
1087ef5a22beSAndrea Arcangeli 		__ClearPageReserved(p);
108858a84aa9SYouquan Song 		set_page_count(p, 0);
108920a0307cSWu Fengguang 		p->first_page = page;
109044fc8057SDavid Rientjes 		/* Make sure p->first_page is always valid for PageTail() */
109144fc8057SDavid Rientjes 		smp_wmb();
109244fc8057SDavid Rientjes 		__SetPageTail(p);
109320a0307cSWu Fengguang 	}
109420a0307cSWu Fengguang }
109520a0307cSWu Fengguang 
10967795912cSAndrew Morton /*
10977795912cSAndrew Morton  * PageHuge() only returns true for hugetlbfs pages, but not for normal or
10987795912cSAndrew Morton  * transparent huge pages.  See the PageTransHuge() documentation for more
10997795912cSAndrew Morton  * details.
11007795912cSAndrew Morton  */
110120a0307cSWu Fengguang int PageHuge(struct page *page)
110220a0307cSWu Fengguang {
110320a0307cSWu Fengguang 	if (!PageCompound(page))
110420a0307cSWu Fengguang 		return 0;
110520a0307cSWu Fengguang 
110620a0307cSWu Fengguang 	page = compound_head(page);
1107758f66a2SAndrew Morton 	return get_compound_page_dtor(page) == free_huge_page;
110820a0307cSWu Fengguang }
110943131e14SNaoya Horiguchi EXPORT_SYMBOL_GPL(PageHuge);
111043131e14SNaoya Horiguchi 
111127c73ae7SAndrea Arcangeli /*
111227c73ae7SAndrea Arcangeli  * PageHeadHuge() only returns true for hugetlbfs head page, but not for
111327c73ae7SAndrea Arcangeli  * normal or transparent huge pages.
111427c73ae7SAndrea Arcangeli  */
111527c73ae7SAndrea Arcangeli int PageHeadHuge(struct page *page_head)
111627c73ae7SAndrea Arcangeli {
111727c73ae7SAndrea Arcangeli 	if (!PageHead(page_head))
111827c73ae7SAndrea Arcangeli 		return 0;
111927c73ae7SAndrea Arcangeli 
1120758f66a2SAndrew Morton 	return get_compound_page_dtor(page_head) == free_huge_page;
112127c73ae7SAndrea Arcangeli }
112227c73ae7SAndrea Arcangeli 
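/*
 * Illustrative sketch, not part of the original source: for a hugetlbfs
 * page with head page 'head' and one of its tail pages 'tail',
 *
 *	PageHuge(head)     == 1		PageHuge(tail)     == 1
 *	PageHeadHuge(head) == 1		PageHeadHuge(tail) == 0
 *
 * while both helpers return 0 for transparent or other compound pages,
 * whose compound destructor is not free_huge_page.
 */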
112313d60f4bSZhang Yi pgoff_t __basepage_index(struct page *page)
112413d60f4bSZhang Yi {
112513d60f4bSZhang Yi 	struct page *page_head = compound_head(page);
112613d60f4bSZhang Yi 	pgoff_t index = page_index(page_head);
112713d60f4bSZhang Yi 	unsigned long compound_idx;
112813d60f4bSZhang Yi 
112913d60f4bSZhang Yi 	if (!PageHuge(page_head))
113013d60f4bSZhang Yi 		return page_index(page);
113113d60f4bSZhang Yi 
113213d60f4bSZhang Yi 	if (compound_order(page_head) >= MAX_ORDER)
113313d60f4bSZhang Yi 		compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
113413d60f4bSZhang Yi 	else
113513d60f4bSZhang Yi 		compound_idx = page - page_head;
113613d60f4bSZhang Yi 
113713d60f4bSZhang Yi 	return (index << compound_order(page_head)) + compound_idx;
113813d60f4bSZhang Yi }
113913d60f4bSZhang Yi 
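/*
 * Worked example (illustrative, not part of the original source): for a
 * 2MB hugepage on x86_64 (compound_order == 9, i.e. 512 base pages), a
 * tail page sitting 5 base pages into a hugepage whose head has file
 * index 3 gets a base-page index of (3 << 9) + 5 == 1541.
 */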
1140a5516438SAndi Kleen static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
11411da177e4SLinus Torvalds {
11421da177e4SLinus Torvalds 	struct page *page;
1143f96efd58SJoe Jin 
11446484eb3eSMel Gorman 	page = alloc_pages_exact_node(nid,
114586cdb465SNaoya Horiguchi 		htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
1146551883aeSNishanth Aravamudan 						__GFP_REPEAT|__GFP_NOWARN,
1147a5516438SAndi Kleen 		huge_page_order(h));
11481da177e4SLinus Torvalds 	if (page) {
11497f2e9525SGerald Schaefer 		if (arch_prepare_hugepage(page)) {
1150caff3a2cSGerald Schaefer 			__free_pages(page, huge_page_order(h));
11517b8ee84dSHarvey Harrison 			return NULL;
11527f2e9525SGerald Schaefer 		}
1153a5516438SAndi Kleen 		prep_new_huge_page(h, page, nid);
11541da177e4SLinus Torvalds 	}
115563b4613cSNishanth Aravamudan 
115663b4613cSNishanth Aravamudan 	return page;
115763b4613cSNishanth Aravamudan }
115863b4613cSNishanth Aravamudan 
1159b2261026SJoonsoo Kim static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
1160b2261026SJoonsoo Kim {
1161b2261026SJoonsoo Kim 	struct page *page;
1162b2261026SJoonsoo Kim 	int nr_nodes, node;
1163b2261026SJoonsoo Kim 	int ret = 0;
1164b2261026SJoonsoo Kim 
1165b2261026SJoonsoo Kim 	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1166b2261026SJoonsoo Kim 		page = alloc_fresh_huge_page_node(h, node);
1167b2261026SJoonsoo Kim 		if (page) {
1168b2261026SJoonsoo Kim 			ret = 1;
1169b2261026SJoonsoo Kim 			break;
1170b2261026SJoonsoo Kim 		}
1171b2261026SJoonsoo Kim 	}
1172b2261026SJoonsoo Kim 
1173b2261026SJoonsoo Kim 	if (ret)
1174b2261026SJoonsoo Kim 		count_vm_event(HTLB_BUDDY_PGALLOC);
1175b2261026SJoonsoo Kim 	else
1176b2261026SJoonsoo Kim 		count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
1177b2261026SJoonsoo Kim 
1178b2261026SJoonsoo Kim 	return ret;
1179b2261026SJoonsoo Kim }
1180b2261026SJoonsoo Kim 
1181e8c5c824SLee Schermerhorn /*
1182e8c5c824SLee Schermerhorn  * Free one huge page from the pool, taken from the next node to free.
1183e8c5c824SLee Schermerhorn  * Attempt to keep persistent huge pages more or less
1184e8c5c824SLee Schermerhorn  * balanced over allowed nodes.
1185e8c5c824SLee Schermerhorn  * Called with hugetlb_lock locked.
1186e8c5c824SLee Schermerhorn  */
11876ae11b27SLee Schermerhorn static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
11886ae11b27SLee Schermerhorn 							 bool acct_surplus)
1189e8c5c824SLee Schermerhorn {
1190b2261026SJoonsoo Kim 	int nr_nodes, node;
1191e8c5c824SLee Schermerhorn 	int ret = 0;
1192e8c5c824SLee Schermerhorn 
1193b2261026SJoonsoo Kim 	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
1194685f3457SLee Schermerhorn 		/*
1195685f3457SLee Schermerhorn 		 * If we're returning unused surplus pages, only examine
1196685f3457SLee Schermerhorn 		 * nodes with surplus pages.
1197685f3457SLee Schermerhorn 		 */
1198b2261026SJoonsoo Kim 		if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
1199b2261026SJoonsoo Kim 		    !list_empty(&h->hugepage_freelists[node])) {
1200e8c5c824SLee Schermerhorn 			struct page *page =
1201b2261026SJoonsoo Kim 				list_entry(h->hugepage_freelists[node].next,
1202e8c5c824SLee Schermerhorn 					  struct page, lru);
1203e8c5c824SLee Schermerhorn 			list_del(&page->lru);
1204e8c5c824SLee Schermerhorn 			h->free_huge_pages--;
1205b2261026SJoonsoo Kim 			h->free_huge_pages_node[node]--;
1206685f3457SLee Schermerhorn 			if (acct_surplus) {
1207685f3457SLee Schermerhorn 				h->surplus_huge_pages--;
1208b2261026SJoonsoo Kim 				h->surplus_huge_pages_node[node]--;
1209685f3457SLee Schermerhorn 			}
1210e8c5c824SLee Schermerhorn 			update_and_free_page(h, page);
1211e8c5c824SLee Schermerhorn 			ret = 1;
12129a76db09SLee Schermerhorn 			break;
1213e8c5c824SLee Schermerhorn 		}
1214b2261026SJoonsoo Kim 	}
1215e8c5c824SLee Schermerhorn 
1216e8c5c824SLee Schermerhorn 	return ret;
1217e8c5c824SLee Schermerhorn }
1218e8c5c824SLee Schermerhorn 
1219c8721bbbSNaoya Horiguchi /*
1220c8721bbbSNaoya Horiguchi  * Dissolve a given free hugepage into free buddy pages. This function does
1221c8721bbbSNaoya Horiguchi  * nothing for in-use (including surplus) hugepages.
1222c8721bbbSNaoya Horiguchi  */
1223c8721bbbSNaoya Horiguchi static void dissolve_free_huge_page(struct page *page)
1224c8721bbbSNaoya Horiguchi {
1225c8721bbbSNaoya Horiguchi 	spin_lock(&hugetlb_lock);
1226c8721bbbSNaoya Horiguchi 	if (PageHuge(page) && !page_count(page)) {
1227c8721bbbSNaoya Horiguchi 		struct hstate *h = page_hstate(page);
1228c8721bbbSNaoya Horiguchi 		int nid = page_to_nid(page);
1229c8721bbbSNaoya Horiguchi 		list_del(&page->lru);
1230c8721bbbSNaoya Horiguchi 		h->free_huge_pages--;
1231c8721bbbSNaoya Horiguchi 		h->free_huge_pages_node[nid]--;
1232c8721bbbSNaoya Horiguchi 		update_and_free_page(h, page);
1233c8721bbbSNaoya Horiguchi 	}
1234c8721bbbSNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
1235c8721bbbSNaoya Horiguchi }
1236c8721bbbSNaoya Horiguchi 
1237c8721bbbSNaoya Horiguchi /*
1238c8721bbbSNaoya Horiguchi  * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
1239c8721bbbSNaoya Horiguchi  * make specified memory blocks removable from the system.
1240c8721bbbSNaoya Horiguchi  * Note that start_pfn should be aligned with the (minimum) hugepage size.
1241c8721bbbSNaoya Horiguchi  */
1242c8721bbbSNaoya Horiguchi void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
1243c8721bbbSNaoya Horiguchi {
1244c8721bbbSNaoya Horiguchi 	unsigned long pfn;
1245c8721bbbSNaoya Horiguchi 
1246d0177639SLi Zhong 	if (!hugepages_supported())
1247d0177639SLi Zhong 		return;
1248d0177639SLi Zhong 
1249641844f5SNaoya Horiguchi 	VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order));
1250641844f5SNaoya Horiguchi 	for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order)
1251c8721bbbSNaoya Horiguchi 		dissolve_free_huge_page(pfn_to_page(pfn));
1252c8721bbbSNaoya Horiguchi }
1253c8721bbbSNaoya Horiguchi 
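/*
 * Usage sketch (illustrative, not part of the original source): a memory
 * hot-remove path would pass pfns that are already multiples of
 * 1 << minimum_order, e.g. 512 base pages for a 2MB minimum hugepage:
 *
 *	dissolve_free_huge_pages(start_pfn, end_pfn);
 *
 * In-use hugepages inside the range are simply skipped by
 * dissolve_free_huge_page().
 */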
1254bf50bab2SNaoya Horiguchi static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
12557893d1d5SAdam Litke {
12567893d1d5SAdam Litke 	struct page *page;
1257bf50bab2SNaoya Horiguchi 	unsigned int r_nid;
12587893d1d5SAdam Litke 
1259bae7f4aeSLuiz Capitulino 	if (hstate_is_gigantic(h))
1260aa888a74SAndi Kleen 		return NULL;
1261aa888a74SAndi Kleen 
1262d1c3fb1fSNishanth Aravamudan 	/*
1263d1c3fb1fSNishanth Aravamudan 	 * Assume we will successfully allocate the surplus page to
1264d1c3fb1fSNishanth Aravamudan 	 * prevent racing processes from causing the surplus to exceed
1265d1c3fb1fSNishanth Aravamudan 	 * overcommit
1266d1c3fb1fSNishanth Aravamudan 	 *
1267d1c3fb1fSNishanth Aravamudan 	 * This however introduces a different race, where a process B
1268d1c3fb1fSNishanth Aravamudan 	 * tries to grow the static hugepage pool while alloc_pages() is
1269d1c3fb1fSNishanth Aravamudan 	 * called by process A. B will only examine the per-node
1270d1c3fb1fSNishanth Aravamudan 	 * counters in determining if surplus huge pages can be
1271d1c3fb1fSNishanth Aravamudan 	 * converted to normal huge pages in adjust_pool_surplus(). A
1272d1c3fb1fSNishanth Aravamudan 	 * won't be able to increment the per-node counter, until the
1273d1c3fb1fSNishanth Aravamudan 	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
1274d1c3fb1fSNishanth Aravamudan 	 * no more huge pages can be converted from surplus to normal
1275d1c3fb1fSNishanth Aravamudan 	 * state (and doesn't try to convert again). Thus, we have a
1276d1c3fb1fSNishanth Aravamudan 	 * case where a surplus huge page exists, the pool is grown, and
1277d1c3fb1fSNishanth Aravamudan 	 * the surplus huge page still exists after, even though it
1278d1c3fb1fSNishanth Aravamudan 	 * should just have been converted to a normal huge page. This
1279d1c3fb1fSNishanth Aravamudan 	 * does not leak memory, though, as the hugepage will be freed
1280d1c3fb1fSNishanth Aravamudan 	 * once it is out of use. It also does not allow the counters to
1281d1c3fb1fSNishanth Aravamudan 	 * go out of whack in adjust_pool_surplus() as we don't modify
1282d1c3fb1fSNishanth Aravamudan 	 * the node values until we've gotten the hugepage and only the
1283d1c3fb1fSNishanth Aravamudan 	 * per-node value is checked there.
1284d1c3fb1fSNishanth Aravamudan 	 */
1285d1c3fb1fSNishanth Aravamudan 	spin_lock(&hugetlb_lock);
1286a5516438SAndi Kleen 	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
1287d1c3fb1fSNishanth Aravamudan 		spin_unlock(&hugetlb_lock);
1288d1c3fb1fSNishanth Aravamudan 		return NULL;
1289d1c3fb1fSNishanth Aravamudan 	} else {
1290a5516438SAndi Kleen 		h->nr_huge_pages++;
1291a5516438SAndi Kleen 		h->surplus_huge_pages++;
1292d1c3fb1fSNishanth Aravamudan 	}
1293d1c3fb1fSNishanth Aravamudan 	spin_unlock(&hugetlb_lock);
1294d1c3fb1fSNishanth Aravamudan 
1295bf50bab2SNaoya Horiguchi 	if (nid == NUMA_NO_NODE)
129686cdb465SNaoya Horiguchi 		page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
1297551883aeSNishanth Aravamudan 				   __GFP_REPEAT|__GFP_NOWARN,
1298a5516438SAndi Kleen 				   huge_page_order(h));
1299bf50bab2SNaoya Horiguchi 	else
1300bf50bab2SNaoya Horiguchi 		page = alloc_pages_exact_node(nid,
130186cdb465SNaoya Horiguchi 			htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
1302bf50bab2SNaoya Horiguchi 			__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
1303d1c3fb1fSNishanth Aravamudan 
1304caff3a2cSGerald Schaefer 	if (page && arch_prepare_hugepage(page)) {
1305caff3a2cSGerald Schaefer 		__free_pages(page, huge_page_order(h));
1306ea5768c7SHillf Danton 		page = NULL;
1307caff3a2cSGerald Schaefer 	}
1308caff3a2cSGerald Schaefer 
13097893d1d5SAdam Litke 	spin_lock(&hugetlb_lock);
1310d1c3fb1fSNishanth Aravamudan 	if (page) {
13110edaecfaSAneesh Kumar K.V 		INIT_LIST_HEAD(&page->lru);
1312bf50bab2SNaoya Horiguchi 		r_nid = page_to_nid(page);
1313d1c3fb1fSNishanth Aravamudan 		set_compound_page_dtor(page, free_huge_page);
13149dd540e2SAneesh Kumar K.V 		set_hugetlb_cgroup(page, NULL);
1315d1c3fb1fSNishanth Aravamudan 		/*
1316d1c3fb1fSNishanth Aravamudan 		 * We incremented the global counters already
1317d1c3fb1fSNishanth Aravamudan 		 */
1318bf50bab2SNaoya Horiguchi 		h->nr_huge_pages_node[r_nid]++;
1319bf50bab2SNaoya Horiguchi 		h->surplus_huge_pages_node[r_nid]++;
13203b116300SAdam Litke 		__count_vm_event(HTLB_BUDDY_PGALLOC);
1321d1c3fb1fSNishanth Aravamudan 	} else {
1322a5516438SAndi Kleen 		h->nr_huge_pages--;
1323a5516438SAndi Kleen 		h->surplus_huge_pages--;
13243b116300SAdam Litke 		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
13257893d1d5SAdam Litke 	}
1326d1c3fb1fSNishanth Aravamudan 	spin_unlock(&hugetlb_lock);
13277893d1d5SAdam Litke 
13287893d1d5SAdam Litke 	return page;
13297893d1d5SAdam Litke }
13307893d1d5SAdam Litke 
1331e4e574b7SAdam Litke /*
1332bf50bab2SNaoya Horiguchi  * This allocation function is useful in the context where vma is irrelevant.
1333bf50bab2SNaoya Horiguchi  * E.g. soft-offlining uses this function because it only cares about the
1334bf50bab2SNaoya Horiguchi  * physical address of the error page.
1335bf50bab2SNaoya Horiguchi  */
1336bf50bab2SNaoya Horiguchi struct page *alloc_huge_page_node(struct hstate *h, int nid)
1337bf50bab2SNaoya Horiguchi {
13384ef91848SJoonsoo Kim 	struct page *page = NULL;
1339bf50bab2SNaoya Horiguchi 
1340bf50bab2SNaoya Horiguchi 	spin_lock(&hugetlb_lock);
13414ef91848SJoonsoo Kim 	if (h->free_huge_pages - h->resv_huge_pages > 0)
1342bf50bab2SNaoya Horiguchi 		page = dequeue_huge_page_node(h, nid);
1343bf50bab2SNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
1344bf50bab2SNaoya Horiguchi 
134594ae8ba7SAneesh Kumar K.V 	if (!page)
1346bf50bab2SNaoya Horiguchi 		page = alloc_buddy_huge_page(h, nid);
1347bf50bab2SNaoya Horiguchi 
1348bf50bab2SNaoya Horiguchi 	return page;
1349bf50bab2SNaoya Horiguchi }
1350bf50bab2SNaoya Horiguchi 
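/*
 * Usage sketch (illustrative, not part of the original source): a caller
 * that only cares about the node, such as migration of a poisoned
 * hugepage, might allocate a replacement with
 *
 *	new = alloc_huge_page_node(page_hstate(old), page_to_nid(old));
 *
 * where 'old' and 'new' are hypothetical struct page pointers.
 */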
1351bf50bab2SNaoya Horiguchi /*
135225985edcSLucas De Marchi  * Increase the hugetlb pool such that it can accommodate a reservation
1353e4e574b7SAdam Litke  * of size 'delta'.
1354e4e574b7SAdam Litke  */
1355a5516438SAndi Kleen static int gather_surplus_pages(struct hstate *h, int delta)
1356e4e574b7SAdam Litke {
1357e4e574b7SAdam Litke 	struct list_head surplus_list;
1358e4e574b7SAdam Litke 	struct page *page, *tmp;
1359e4e574b7SAdam Litke 	int ret, i;
1360e4e574b7SAdam Litke 	int needed, allocated;
136128073b02SHillf Danton 	bool alloc_ok = true;
1362e4e574b7SAdam Litke 
1363a5516438SAndi Kleen 	needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
1364ac09b3a1SAdam Litke 	if (needed <= 0) {
1365a5516438SAndi Kleen 		h->resv_huge_pages += delta;
1366e4e574b7SAdam Litke 		return 0;
1367ac09b3a1SAdam Litke 	}
1368e4e574b7SAdam Litke 
1369e4e574b7SAdam Litke 	allocated = 0;
1370e4e574b7SAdam Litke 	INIT_LIST_HEAD(&surplus_list);
1371e4e574b7SAdam Litke 
1372e4e574b7SAdam Litke 	ret = -ENOMEM;
1373e4e574b7SAdam Litke retry:
1374e4e574b7SAdam Litke 	spin_unlock(&hugetlb_lock);
1375e4e574b7SAdam Litke 	for (i = 0; i < needed; i++) {
1376bf50bab2SNaoya Horiguchi 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
137728073b02SHillf Danton 		if (!page) {
137828073b02SHillf Danton 			alloc_ok = false;
137928073b02SHillf Danton 			break;
138028073b02SHillf Danton 		}
1381e4e574b7SAdam Litke 		list_add(&page->lru, &surplus_list);
1382e4e574b7SAdam Litke 	}
138328073b02SHillf Danton 	allocated += i;
1384e4e574b7SAdam Litke 
1385e4e574b7SAdam Litke 	/*
1386e4e574b7SAdam Litke 	 * After retaking hugetlb_lock, we need to recalculate 'needed'
1387e4e574b7SAdam Litke 	 * because either resv_huge_pages or free_huge_pages may have changed.
1388e4e574b7SAdam Litke 	 */
1389e4e574b7SAdam Litke 	spin_lock(&hugetlb_lock);
1390a5516438SAndi Kleen 	needed = (h->resv_huge_pages + delta) -
1391a5516438SAndi Kleen 			(h->free_huge_pages + allocated);
139228073b02SHillf Danton 	if (needed > 0) {
139328073b02SHillf Danton 		if (alloc_ok)
1394e4e574b7SAdam Litke 			goto retry;
139528073b02SHillf Danton 		/*
139628073b02SHillf Danton 		 * We were not able to allocate enough pages to
139728073b02SHillf Danton 		 * satisfy the entire reservation so we free what
139828073b02SHillf Danton 		 * we've allocated so far.
139928073b02SHillf Danton 		 */
140028073b02SHillf Danton 		goto free;
140128073b02SHillf Danton 	}
1402e4e574b7SAdam Litke 	/*
1403e4e574b7SAdam Litke 	 * The surplus_list now contains _at_least_ the number of extra pages
140425985edcSLucas De Marchi 	 * needed to accommodate the reservation.  Add the appropriate number
1405e4e574b7SAdam Litke 	 * of pages to the hugetlb pool and free the extras back to the buddy
1406ac09b3a1SAdam Litke 	 * allocator.  Commit the entire reservation here to prevent another
1407ac09b3a1SAdam Litke 	 * process from stealing the pages as they are added to the pool but
1408ac09b3a1SAdam Litke 	 * before they are reserved.
1409e4e574b7SAdam Litke 	 */
1410e4e574b7SAdam Litke 	needed += allocated;
1411a5516438SAndi Kleen 	h->resv_huge_pages += delta;
1412e4e574b7SAdam Litke 	ret = 0;
1413a9869b83SNaoya Horiguchi 
141419fc3f0aSAdam Litke 	/* Free the needed pages to the hugetlb pool */
141519fc3f0aSAdam Litke 	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
141619fc3f0aSAdam Litke 		if ((--needed) < 0)
141719fc3f0aSAdam Litke 			break;
1418a9869b83SNaoya Horiguchi 		/*
1419a9869b83SNaoya Horiguchi 		 * This page is now managed by the hugetlb allocator and has
1420a9869b83SNaoya Horiguchi 		 * no users -- drop the buddy allocator's reference.
1421a9869b83SNaoya Horiguchi 		 */
1422a9869b83SNaoya Horiguchi 		put_page_testzero(page);
1423309381feSSasha Levin 		VM_BUG_ON_PAGE(page_count(page), page);
1424a5516438SAndi Kleen 		enqueue_huge_page(h, page);
142519fc3f0aSAdam Litke 	}
142628073b02SHillf Danton free:
1427b0365c8dSHillf Danton 	spin_unlock(&hugetlb_lock);
142819fc3f0aSAdam Litke 
142919fc3f0aSAdam Litke 	/* Free unnecessary surplus pages to the buddy allocator */
1430c0d934baSJoonsoo Kim 	list_for_each_entry_safe(page, tmp, &surplus_list, lru)
1431a9869b83SNaoya Horiguchi 		put_page(page);
143219fc3f0aSAdam Litke 	spin_lock(&hugetlb_lock);
1433e4e574b7SAdam Litke 
1434e4e574b7SAdam Litke 	return ret;
1435e4e574b7SAdam Litke }
1436e4e574b7SAdam Litke 
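/*
 * Worked example (illustrative, not part of the original source): with
 * free_huge_pages == 2 and resv_huge_pages == 1, a new reservation of
 * delta == 4 gives needed == (1 + 4) - 2 == 3, so gather_surplus_pages()
 * tries to pull three surplus pages from the buddy allocator before
 * committing resv_huge_pages += 4.
 */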
1437e4e574b7SAdam Litke /*
1438e4e574b7SAdam Litke  * When releasing a hugetlb pool reservation, any surplus pages that were
1439e4e574b7SAdam Litke  * allocated to satisfy the reservation must be explicitly freed if they were
1440e4e574b7SAdam Litke  * never used.
1441685f3457SLee Schermerhorn  * Called with hugetlb_lock held.
1442e4e574b7SAdam Litke  */
1443a5516438SAndi Kleen static void return_unused_surplus_pages(struct hstate *h,
1444a5516438SAndi Kleen 					unsigned long unused_resv_pages)
1445e4e574b7SAdam Litke {
1446e4e574b7SAdam Litke 	unsigned long nr_pages;
1447e4e574b7SAdam Litke 
1448ac09b3a1SAdam Litke 	/* Uncommit the reservation */
1449a5516438SAndi Kleen 	h->resv_huge_pages -= unused_resv_pages;
1450ac09b3a1SAdam Litke 
1451aa888a74SAndi Kleen 	/* Cannot return gigantic pages currently */
1452bae7f4aeSLuiz Capitulino 	if (hstate_is_gigantic(h))
1453aa888a74SAndi Kleen 		return;
1454aa888a74SAndi Kleen 
1455a5516438SAndi Kleen 	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
1456e4e574b7SAdam Litke 
1457685f3457SLee Schermerhorn 	/*
1458685f3457SLee Schermerhorn 	 * We want to release as many surplus pages as possible, spread
14599b5e5d0fSLee Schermerhorn 	 * evenly across all nodes with memory. Iterate across these nodes
14609b5e5d0fSLee Schermerhorn 	 * until we can no longer free unreserved surplus pages. This occurs
14619b5e5d0fSLee Schermerhorn 	 * when the nodes with surplus pages have no free pages.
14629b5e5d0fSLee Schermerhorn 	 * free_pool_huge_page() will balance the freed pages across the
14639b5e5d0fSLee Schermerhorn 	 * on-line nodes with memory and will handle the hstate accounting.
1464685f3457SLee Schermerhorn 	 */
1465685f3457SLee Schermerhorn 	while (nr_pages--) {
14668cebfcd0SLai Jiangshan 		if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
1467685f3457SLee Schermerhorn 			break;
14687848a4bfSMizuma, Masayoshi 		cond_resched_lock(&hugetlb_lock);
1469e4e574b7SAdam Litke 	}
1470e4e574b7SAdam Litke }
1471e4e574b7SAdam Litke 
1472c37f9fb1SAndy Whitcroft /*
1473c37f9fb1SAndy Whitcroft  * Determine if the huge page at addr within the vma has an associated
1474c37f9fb1SAndy Whitcroft  * reservation.  Where it does not we will need to logically increase
147590481622SDavid Gibson  * reservation and actually increase subpool usage before an allocation
147690481622SDavid Gibson  * can occur.  Where any new reservation would be required the
147790481622SDavid Gibson  * reservation change is prepared, but not committed.  Once the page
147890481622SDavid Gibson  * has been allocated from the subpool and instantiated the change should
147990481622SDavid Gibson  * be committed via vma_commit_reservation.  No action is required on
148090481622SDavid Gibson  * failure.
1481c37f9fb1SAndy Whitcroft  */
1482e2f17d94SRoel Kluin static long vma_needs_reservation(struct hstate *h,
1483a5516438SAndi Kleen 			struct vm_area_struct *vma, unsigned long addr)
1484c37f9fb1SAndy Whitcroft {
14854e35f483SJoonsoo Kim 	struct resv_map *resv;
14864e35f483SJoonsoo Kim 	pgoff_t idx;
14874e35f483SJoonsoo Kim 	long chg;
1488c37f9fb1SAndy Whitcroft 
14894e35f483SJoonsoo Kim 	resv = vma_resv_map(vma);
14904e35f483SJoonsoo Kim 	if (!resv)
1491c37f9fb1SAndy Whitcroft 		return 1;
1492c37f9fb1SAndy Whitcroft 
14934e35f483SJoonsoo Kim 	idx = vma_hugecache_offset(h, vma, addr);
14944e35f483SJoonsoo Kim 	chg = region_chg(resv, idx, idx + 1);
149584afd99bSAndy Whitcroft 
14964e35f483SJoonsoo Kim 	if (vma->vm_flags & VM_MAYSHARE)
14974e35f483SJoonsoo Kim 		return chg;
14984e35f483SJoonsoo Kim 	else
14994e35f483SJoonsoo Kim 		return chg < 0 ? chg : 0;
150084afd99bSAndy Whitcroft }
1501a5516438SAndi Kleen static void vma_commit_reservation(struct hstate *h,
1502a5516438SAndi Kleen 			struct vm_area_struct *vma, unsigned long addr)
1503c37f9fb1SAndy Whitcroft {
15044e35f483SJoonsoo Kim 	struct resv_map *resv;
15054e35f483SJoonsoo Kim 	pgoff_t idx;
1506c37f9fb1SAndy Whitcroft 
15074e35f483SJoonsoo Kim 	resv = vma_resv_map(vma);
15084e35f483SJoonsoo Kim 	if (!resv)
15094e35f483SJoonsoo Kim 		return;
15109119a41eSJoonsoo Kim 
15114e35f483SJoonsoo Kim 	idx = vma_hugecache_offset(h, vma, addr);
15121406ec9bSJoonsoo Kim 	region_add(resv, idx, idx + 1);
1513c37f9fb1SAndy Whitcroft }
1514c37f9fb1SAndy Whitcroft 
1515348ea204SAdam Litke static struct page *alloc_huge_page(struct vm_area_struct *vma,
151604f2cbe3SMel Gorman 				    unsigned long addr, int avoid_reserve)
1517348ea204SAdam Litke {
151890481622SDavid Gibson 	struct hugepage_subpool *spool = subpool_vma(vma);
1519a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
1520348ea204SAdam Litke 	struct page *page;
1521e2f17d94SRoel Kluin 	long chg;
15226d76dcf4SAneesh Kumar K.V 	int ret, idx;
15236d76dcf4SAneesh Kumar K.V 	struct hugetlb_cgroup *h_cg;
15242fc39cecSAdam Litke 
15256d76dcf4SAneesh Kumar K.V 	idx = hstate_index(h);
1526a1e78772SMel Gorman 	/*
152790481622SDavid Gibson 	 * Processes that did not create the mapping will have no
152890481622SDavid Gibson 	 * reserves and will not have accounted against subpool
152990481622SDavid Gibson 	 * limit. Check that the subpool limit can be made before
153090481622SDavid Gibson 	 * satisfying the allocation.  MAP_NORESERVE mappings may also
153190481622SDavid Gibson 	 * need pages and subpool limit allocated if no reserve
153290481622SDavid Gibson 	 * mapping overlaps.
1533a1e78772SMel Gorman 	 */
1534a5516438SAndi Kleen 	chg = vma_needs_reservation(h, vma, addr);
1535c37f9fb1SAndy Whitcroft 	if (chg < 0)
153676dcee75SAneesh Kumar K.V 		return ERR_PTR(-ENOMEM);
15378bb3f12eSJoonsoo Kim 	if (chg || avoid_reserve)
15381c5ecae3SMike Kravetz 		if (hugepage_subpool_get_pages(spool, 1) < 0)
153976dcee75SAneesh Kumar K.V 			return ERR_PTR(-ENOSPC);
154090d8b7e6SAdam Litke 
15416d76dcf4SAneesh Kumar K.V 	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
15428f34af6fSJianyu Zhan 	if (ret)
15438f34af6fSJianyu Zhan 		goto out_subpool_put;
15448f34af6fSJianyu Zhan 
1545a1e78772SMel Gorman 	spin_lock(&hugetlb_lock);
1546af0ed73eSJoonsoo Kim 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
154781a6fcaeSJoonsoo Kim 	if (!page) {
154894ae8ba7SAneesh Kumar K.V 		spin_unlock(&hugetlb_lock);
1549bf50bab2SNaoya Horiguchi 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
15508f34af6fSJianyu Zhan 		if (!page)
15518f34af6fSJianyu Zhan 			goto out_uncharge_cgroup;
15528f34af6fSJianyu Zhan 
155379dbb236SAneesh Kumar K.V 		spin_lock(&hugetlb_lock);
155479dbb236SAneesh Kumar K.V 		list_move(&page->lru, &h->hugepage_activelist);
155581a6fcaeSJoonsoo Kim 		/* Fall through */
1556a1e78772SMel Gorman 	}
155781a6fcaeSJoonsoo Kim 	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
155881a6fcaeSJoonsoo Kim 	spin_unlock(&hugetlb_lock);
1559a1e78772SMel Gorman 
156090481622SDavid Gibson 	set_page_private(page, (unsigned long)spool);
1561a1e78772SMel Gorman 
1562a5516438SAndi Kleen 	vma_commit_reservation(h, vma, addr);
15637893d1d5SAdam Litke 	return page;
15648f34af6fSJianyu Zhan 
15658f34af6fSJianyu Zhan out_uncharge_cgroup:
15668f34af6fSJianyu Zhan 	hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
15678f34af6fSJianyu Zhan out_subpool_put:
15688f34af6fSJianyu Zhan 	if (chg || avoid_reserve)
15698f34af6fSJianyu Zhan 		hugepage_subpool_put_pages(spool, 1);
15708f34af6fSJianyu Zhan 	return ERR_PTR(-ENOSPC);
1571b45b5bd6SDavid Gibson }
1572b45b5bd6SDavid Gibson 
157374060e4dSNaoya Horiguchi /*
157474060e4dSNaoya Horiguchi  * alloc_huge_page()'s wrapper which simply returns the page if allocation
157574060e4dSNaoya Horiguchi  * succeeds, otherwise NULL. This function is called from new_vma_page(),
157674060e4dSNaoya Horiguchi  * where no ERR_VALUE is expected to be returned.
157774060e4dSNaoya Horiguchi  */
157874060e4dSNaoya Horiguchi struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
157974060e4dSNaoya Horiguchi 				unsigned long addr, int avoid_reserve)
158074060e4dSNaoya Horiguchi {
158174060e4dSNaoya Horiguchi 	struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
158274060e4dSNaoya Horiguchi 	if (IS_ERR(page))
158374060e4dSNaoya Horiguchi 		page = NULL;
158474060e4dSNaoya Horiguchi 	return page;
158574060e4dSNaoya Horiguchi }
158674060e4dSNaoya Horiguchi 
158791f47662SCyrill Gorcunov int __weak alloc_bootmem_huge_page(struct hstate *h)
1588aa888a74SAndi Kleen {
1589aa888a74SAndi Kleen 	struct huge_bootmem_page *m;
1590b2261026SJoonsoo Kim 	int nr_nodes, node;
1591aa888a74SAndi Kleen 
1592b2261026SJoonsoo Kim 	for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
1593aa888a74SAndi Kleen 		void *addr;
1594aa888a74SAndi Kleen 
15958b89a116SGrygorii Strashko 		addr = memblock_virt_alloc_try_nid_nopanic(
15968b89a116SGrygorii Strashko 				huge_page_size(h), huge_page_size(h),
15978b89a116SGrygorii Strashko 				0, BOOTMEM_ALLOC_ACCESSIBLE, node);
1598aa888a74SAndi Kleen 		if (addr) {
1599aa888a74SAndi Kleen 			/*
1600aa888a74SAndi Kleen 			 * Use the beginning of the huge page to store the
1601aa888a74SAndi Kleen 			 * huge_bootmem_page struct (until gather_bootmem
1602aa888a74SAndi Kleen 			 * puts them into the mem_map).
1603aa888a74SAndi Kleen 			 */
1604aa888a74SAndi Kleen 			m = addr;
1605aa888a74SAndi Kleen 			goto found;
1606aa888a74SAndi Kleen 		}
1607aa888a74SAndi Kleen 	}
1608aa888a74SAndi Kleen 	return 0;
1609aa888a74SAndi Kleen 
1610aa888a74SAndi Kleen found:
1611df994eadSLuiz Capitulino 	BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
1612aa888a74SAndi Kleen 	/* Put them into a private list first because mem_map is not up yet */
1613aa888a74SAndi Kleen 	list_add(&m->list, &huge_boot_pages);
1614aa888a74SAndi Kleen 	m->hstate = h;
1615aa888a74SAndi Kleen 	return 1;
1616aa888a74SAndi Kleen }
1617aa888a74SAndi Kleen 
1618f412c97aSDavid Rientjes static void __init prep_compound_huge_page(struct page *page, int order)
161918229df5SAndy Whitcroft {
162018229df5SAndy Whitcroft 	if (unlikely(order > (MAX_ORDER - 1)))
162118229df5SAndy Whitcroft 		prep_compound_gigantic_page(page, order);
162218229df5SAndy Whitcroft 	else
162318229df5SAndy Whitcroft 		prep_compound_page(page, order);
162418229df5SAndy Whitcroft }
162518229df5SAndy Whitcroft 
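/*
 * Illustrative note (not part of the original source): on x86_64 with the
 * default MAX_ORDER of 11, a 1GB hugepage (order 18) takes the
 * prep_compound_gigantic_page() path above, while a 2MB hugepage
 * (order 9) uses the regular prep_compound_page().
 */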
1626aa888a74SAndi Kleen /* Put bootmem huge pages into the standard lists after mem_map is up */
1627aa888a74SAndi Kleen static void __init gather_bootmem_prealloc(void)
1628aa888a74SAndi Kleen {
1629aa888a74SAndi Kleen 	struct huge_bootmem_page *m;
1630aa888a74SAndi Kleen 
1631aa888a74SAndi Kleen 	list_for_each_entry(m, &huge_boot_pages, list) {
1632aa888a74SAndi Kleen 		struct hstate *h = m->hstate;
1633ee8f248dSBecky Bruce 		struct page *page;
1634ee8f248dSBecky Bruce 
1635ee8f248dSBecky Bruce #ifdef CONFIG_HIGHMEM
1636ee8f248dSBecky Bruce 		page = pfn_to_page(m->phys >> PAGE_SHIFT);
16378b89a116SGrygorii Strashko 		memblock_free_late(__pa(m),
1638ee8f248dSBecky Bruce 				   sizeof(struct huge_bootmem_page));
1639ee8f248dSBecky Bruce #else
1640ee8f248dSBecky Bruce 		page = virt_to_page(m);
1641ee8f248dSBecky Bruce #endif
1642aa888a74SAndi Kleen 		WARN_ON(page_count(page) != 1);
164318229df5SAndy Whitcroft 		prep_compound_huge_page(page, h->order);
1644ef5a22beSAndrea Arcangeli 		WARN_ON(PageReserved(page));
1645aa888a74SAndi Kleen 		prep_new_huge_page(h, page, page_to_nid(page));
1646b0320c7bSRafael Aquini 		/*
1647b0320c7bSRafael Aquini 		 * If we had gigantic hugepages allocated at boot time, we need
1648b0320c7bSRafael Aquini 		 * to restore the 'stolen' pages to totalram_pages in order to
1649b0320c7bSRafael Aquini 		 * fix confusing memory reports from free(1) and other
1650b0320c7bSRafael Aquini 		 * side-effects, like CommitLimit going negative.
1651b0320c7bSRafael Aquini 		 */
1652bae7f4aeSLuiz Capitulino 		if (hstate_is_gigantic(h))
16533dcc0571SJiang Liu 			adjust_managed_page_count(page, 1 << h->order);
1654aa888a74SAndi Kleen 	}
1655aa888a74SAndi Kleen }
1656aa888a74SAndi Kleen 
16578faa8b07SAndi Kleen static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
16581da177e4SLinus Torvalds {
16591da177e4SLinus Torvalds 	unsigned long i;
16601da177e4SLinus Torvalds 
1661e5ff2159SAndi Kleen 	for (i = 0; i < h->max_huge_pages; ++i) {
1662bae7f4aeSLuiz Capitulino 		if (hstate_is_gigantic(h)) {
1663aa888a74SAndi Kleen 			if (!alloc_bootmem_huge_page(h))
1664aa888a74SAndi Kleen 				break;
16659b5e5d0fSLee Schermerhorn 		} else if (!alloc_fresh_huge_page(h,
16668cebfcd0SLai Jiangshan 					 &node_states[N_MEMORY]))
16671da177e4SLinus Torvalds 			break;
16681da177e4SLinus Torvalds 	}
16698faa8b07SAndi Kleen 	h->max_huge_pages = i;
1670e5ff2159SAndi Kleen }
1671e5ff2159SAndi Kleen 
1672e5ff2159SAndi Kleen static void __init hugetlb_init_hstates(void)
1673e5ff2159SAndi Kleen {
1674e5ff2159SAndi Kleen 	struct hstate *h;
1675e5ff2159SAndi Kleen 
1676e5ff2159SAndi Kleen 	for_each_hstate(h) {
1677641844f5SNaoya Horiguchi 		if (minimum_order > huge_page_order(h))
1678641844f5SNaoya Horiguchi 			minimum_order = huge_page_order(h);
1679641844f5SNaoya Horiguchi 
16808faa8b07SAndi Kleen 		/* oversize hugepages were init'ed in early boot */
1681bae7f4aeSLuiz Capitulino 		if (!hstate_is_gigantic(h))
16828faa8b07SAndi Kleen 			hugetlb_hstate_alloc_pages(h);
1683e5ff2159SAndi Kleen 	}
1684641844f5SNaoya Horiguchi 	VM_BUG_ON(minimum_order == UINT_MAX);
1685e5ff2159SAndi Kleen }
1686e5ff2159SAndi Kleen 
16874abd32dbSAndi Kleen static char * __init memfmt(char *buf, unsigned long n)
16884abd32dbSAndi Kleen {
16894abd32dbSAndi Kleen 	if (n >= (1UL << 30))
16904abd32dbSAndi Kleen 		sprintf(buf, "%lu GB", n >> 30);
16914abd32dbSAndi Kleen 	else if (n >= (1UL << 20))
16924abd32dbSAndi Kleen 		sprintf(buf, "%lu MB", n >> 20);
16934abd32dbSAndi Kleen 	else
16944abd32dbSAndi Kleen 		sprintf(buf, "%lu KB", n >> 10);
16954abd32dbSAndi Kleen 	return buf;
16964abd32dbSAndi Kleen }
16974abd32dbSAndi Kleen 
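/*
 * Example output (illustrative, not part of the original source):
 * memfmt(buf, 2UL << 20) yields "2 MB" and memfmt(buf, 1UL << 30)
 * yields "1 GB"; sizes below 1MB fall through to the KB case.
 */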
1698e5ff2159SAndi Kleen static void __init report_hugepages(void)
1699e5ff2159SAndi Kleen {
1700e5ff2159SAndi Kleen 	struct hstate *h;
1701e5ff2159SAndi Kleen 
1702e5ff2159SAndi Kleen 	for_each_hstate(h) {
17034abd32dbSAndi Kleen 		char buf[32];
1704ffb22af5SAndrew Morton 		pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
17054abd32dbSAndi Kleen 			memfmt(buf, huge_page_size(h)),
17064abd32dbSAndi Kleen 			h->free_huge_pages);
1707e5ff2159SAndi Kleen 	}
1708e5ff2159SAndi Kleen }
1709e5ff2159SAndi Kleen 
17101da177e4SLinus Torvalds #ifdef CONFIG_HIGHMEM
17116ae11b27SLee Schermerhorn static void try_to_free_low(struct hstate *h, unsigned long count,
17126ae11b27SLee Schermerhorn 						nodemask_t *nodes_allowed)
17131da177e4SLinus Torvalds {
17144415cc8dSChristoph Lameter 	int i;
17154415cc8dSChristoph Lameter 
1716bae7f4aeSLuiz Capitulino 	if (hstate_is_gigantic(h))
1717aa888a74SAndi Kleen 		return;
1718aa888a74SAndi Kleen 
17196ae11b27SLee Schermerhorn 	for_each_node_mask(i, *nodes_allowed) {
17201da177e4SLinus Torvalds 		struct page *page, *next;
1721a5516438SAndi Kleen 		struct list_head *freel = &h->hugepage_freelists[i];
1722a5516438SAndi Kleen 		list_for_each_entry_safe(page, next, freel, lru) {
1723a5516438SAndi Kleen 			if (count >= h->nr_huge_pages)
17246b0c880dSAdam Litke 				return;
17251da177e4SLinus Torvalds 			if (PageHighMem(page))
17261da177e4SLinus Torvalds 				continue;
17271da177e4SLinus Torvalds 			list_del(&page->lru);
1728e5ff2159SAndi Kleen 			update_and_free_page(h, page);
1729a5516438SAndi Kleen 			h->free_huge_pages--;
1730a5516438SAndi Kleen 			h->free_huge_pages_node[page_to_nid(page)]--;
17311da177e4SLinus Torvalds 		}
17321da177e4SLinus Torvalds 	}
17331da177e4SLinus Torvalds }
17341da177e4SLinus Torvalds #else
17356ae11b27SLee Schermerhorn static inline void try_to_free_low(struct hstate *h, unsigned long count,
17366ae11b27SLee Schermerhorn 						nodemask_t *nodes_allowed)
17371da177e4SLinus Torvalds {
17381da177e4SLinus Torvalds }
17391da177e4SLinus Torvalds #endif
17401da177e4SLinus Torvalds 
174120a0307cSWu Fengguang /*
174220a0307cSWu Fengguang  * Increment or decrement surplus_huge_pages.  Keep node-specific counters
174320a0307cSWu Fengguang  * balanced by operating on them in a round-robin fashion.
174420a0307cSWu Fengguang  * Returns 1 if an adjustment was made.
174520a0307cSWu Fengguang  */
17466ae11b27SLee Schermerhorn static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
17476ae11b27SLee Schermerhorn 				int delta)
174820a0307cSWu Fengguang {
1749b2261026SJoonsoo Kim 	int nr_nodes, node;
175020a0307cSWu Fengguang 
175120a0307cSWu Fengguang 	VM_BUG_ON(delta != -1 && delta != 1);
175220a0307cSWu Fengguang 
1753e8c5c824SLee Schermerhorn 	if (delta < 0) {
1754b2261026SJoonsoo Kim 		for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1755b2261026SJoonsoo Kim 			if (h->surplus_huge_pages_node[node])
1756b2261026SJoonsoo Kim 				goto found;
1757b2261026SJoonsoo Kim 		}
1758b2261026SJoonsoo Kim 	} else {
1759b2261026SJoonsoo Kim 		for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
1760b2261026SJoonsoo Kim 			if (h->surplus_huge_pages_node[node] <
1761b2261026SJoonsoo Kim 					h->nr_huge_pages_node[node])
1762b2261026SJoonsoo Kim 				goto found;
1763e8c5c824SLee Schermerhorn 		}
17649a76db09SLee Schermerhorn 	}
1765b2261026SJoonsoo Kim 	return 0;
176620a0307cSWu Fengguang 
1767b2261026SJoonsoo Kim found:
176820a0307cSWu Fengguang 	h->surplus_huge_pages += delta;
1769b2261026SJoonsoo Kim 	h->surplus_huge_pages_node[node] += delta;
1770b2261026SJoonsoo Kim 	return 1;
177120a0307cSWu Fengguang }
177220a0307cSWu Fengguang 
1773a5516438SAndi Kleen #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
17746ae11b27SLee Schermerhorn static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
17756ae11b27SLee Schermerhorn 						nodemask_t *nodes_allowed)
17761da177e4SLinus Torvalds {
17777893d1d5SAdam Litke 	unsigned long min_count, ret;
17781da177e4SLinus Torvalds 
1779944d9fecSLuiz Capitulino 	if (hstate_is_gigantic(h) && !gigantic_page_supported())
1780aa888a74SAndi Kleen 		return h->max_huge_pages;
1781aa888a74SAndi Kleen 
17827893d1d5SAdam Litke 	/*
17837893d1d5SAdam Litke 	 * Increase the pool size
17847893d1d5SAdam Litke 	 * First take pages out of surplus state.  Then make up the
17857893d1d5SAdam Litke 	 * remaining difference by allocating fresh huge pages.
1786d1c3fb1fSNishanth Aravamudan 	 *
1787d1c3fb1fSNishanth Aravamudan 	 * We might race with alloc_buddy_huge_page() here and be unable
1788d1c3fb1fSNishanth Aravamudan 	 * to convert a surplus huge page to a normal huge page. That is
1789d1c3fb1fSNishanth Aravamudan 	 * not critical, though, it just means the overall size of the
1790d1c3fb1fSNishanth Aravamudan 	 * pool might be one hugepage larger than it needs to be, but
1791d1c3fb1fSNishanth Aravamudan 	 * within all the constraints specified by the sysctls.
17927893d1d5SAdam Litke 	 */
17931da177e4SLinus Torvalds 	spin_lock(&hugetlb_lock);
1794a5516438SAndi Kleen 	while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
17956ae11b27SLee Schermerhorn 		if (!adjust_pool_surplus(h, nodes_allowed, -1))
17967893d1d5SAdam Litke 			break;
17977893d1d5SAdam Litke 	}
17987893d1d5SAdam Litke 
1799a5516438SAndi Kleen 	while (count > persistent_huge_pages(h)) {
18007893d1d5SAdam Litke 		/*
18017893d1d5SAdam Litke 		 * If this allocation races such that we no longer need the
18027893d1d5SAdam Litke 		 * page, free_huge_page will handle it by freeing the page
18037893d1d5SAdam Litke 		 * and reducing the surplus.
18047893d1d5SAdam Litke 		 */
18057893d1d5SAdam Litke 		spin_unlock(&hugetlb_lock);
1806944d9fecSLuiz Capitulino 		if (hstate_is_gigantic(h))
1807944d9fecSLuiz Capitulino 			ret = alloc_fresh_gigantic_page(h, nodes_allowed);
1808944d9fecSLuiz Capitulino 		else
18096ae11b27SLee Schermerhorn 			ret = alloc_fresh_huge_page(h, nodes_allowed);
18107893d1d5SAdam Litke 		spin_lock(&hugetlb_lock);
18117893d1d5SAdam Litke 		if (!ret)
18127893d1d5SAdam Litke 			goto out;
18137893d1d5SAdam Litke 
1814536240f2SMel Gorman 		/* Bail for signals. Probably ctrl-c from user */
1815536240f2SMel Gorman 		if (signal_pending(current))
1816536240f2SMel Gorman 			goto out;
18177893d1d5SAdam Litke 	}
18187893d1d5SAdam Litke 
18197893d1d5SAdam Litke 	/*
18207893d1d5SAdam Litke 	 * Decrease the pool size
18217893d1d5SAdam Litke 	 * First return free pages to the buddy allocator (being careful
18227893d1d5SAdam Litke 	 * to keep enough around to satisfy reservations).  Then place
18237893d1d5SAdam Litke 	 * pages into surplus state as needed so the pool will shrink
18247893d1d5SAdam Litke 	 * to the desired size as pages become free.
1825d1c3fb1fSNishanth Aravamudan 	 *
1826d1c3fb1fSNishanth Aravamudan 	 * By placing pages into the surplus state independent of the
1827d1c3fb1fSNishanth Aravamudan 	 * overcommit value, we are allowing the surplus pool size to
1828d1c3fb1fSNishanth Aravamudan 	 * exceed overcommit. There are few sane options here. Since
1829d1c3fb1fSNishanth Aravamudan 	 * alloc_buddy_huge_page() is checking the global counter,
1830d1c3fb1fSNishanth Aravamudan 	 * though, we'll note that we're not allowed to exceed surplus
1831d1c3fb1fSNishanth Aravamudan 	 * and won't grow the pool anywhere else. Not until one of the
1832d1c3fb1fSNishanth Aravamudan 	 * sysctls are changed, or the surplus pages go out of use.
18337893d1d5SAdam Litke 	 */
1834a5516438SAndi Kleen 	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
18356b0c880dSAdam Litke 	min_count = max(count, min_count);
18366ae11b27SLee Schermerhorn 	try_to_free_low(h, min_count, nodes_allowed);
1837a5516438SAndi Kleen 	while (min_count < persistent_huge_pages(h)) {
18386ae11b27SLee Schermerhorn 		if (!free_pool_huge_page(h, nodes_allowed, 0))
18391da177e4SLinus Torvalds 			break;
184055f67141SMizuma, Masayoshi 		cond_resched_lock(&hugetlb_lock);
18411da177e4SLinus Torvalds 	}
1842a5516438SAndi Kleen 	while (count < persistent_huge_pages(h)) {
18436ae11b27SLee Schermerhorn 		if (!adjust_pool_surplus(h, nodes_allowed, 1))
18447893d1d5SAdam Litke 			break;
18457893d1d5SAdam Litke 	}
18467893d1d5SAdam Litke out:
1847a5516438SAndi Kleen 	ret = persistent_huge_pages(h);
18481da177e4SLinus Torvalds 	spin_unlock(&hugetlb_lock);
18497893d1d5SAdam Litke 	return ret;
18501da177e4SLinus Torvalds }
18511da177e4SLinus Torvalds 
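/*
 * Worked example (illustrative, not part of the original source): when
 * shrinking with nr_huge_pages == 10, free_huge_pages == 4 and
 * resv_huge_pages == 2, min_count == 2 + 10 - 4 == 8, so a request for
 * count == 3 only shrinks the pool to 8 pages; the remainder is pushed
 * into surplus state and freed later as the in-use pages are released.
 */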
1852a3437870SNishanth Aravamudan #define HSTATE_ATTR_RO(_name) \
1853a3437870SNishanth Aravamudan 	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
1854a3437870SNishanth Aravamudan 
1855a3437870SNishanth Aravamudan #define HSTATE_ATTR(_name) \
1856a3437870SNishanth Aravamudan 	static struct kobj_attribute _name##_attr = \
1857a3437870SNishanth Aravamudan 		__ATTR(_name, 0644, _name##_show, _name##_store)
1858a3437870SNishanth Aravamudan 
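/*
 * Illustrative expansion (not part of the original source):
 * HSTATE_ATTR(nr_hugepages) declares
 *
 *	static struct kobj_attribute nr_hugepages_attr =
 *		__ATTR(nr_hugepages, 0644, nr_hugepages_show, nr_hugepages_store);
 *
 * i.e. a sysfs attribute wired to the _show/_store pair defined below.
 */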
1859a3437870SNishanth Aravamudan static struct kobject *hugepages_kobj;
1860a3437870SNishanth Aravamudan static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1861a3437870SNishanth Aravamudan 
18629a305230SLee Schermerhorn static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
18639a305230SLee Schermerhorn 
18649a305230SLee Schermerhorn static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
1865a3437870SNishanth Aravamudan {
1866a3437870SNishanth Aravamudan 	int i;
18679a305230SLee Schermerhorn 
1868a3437870SNishanth Aravamudan 	for (i = 0; i < HUGE_MAX_HSTATE; i++)
18699a305230SLee Schermerhorn 		if (hstate_kobjs[i] == kobj) {
18709a305230SLee Schermerhorn 			if (nidp)
18719a305230SLee Schermerhorn 				*nidp = NUMA_NO_NODE;
1872a3437870SNishanth Aravamudan 			return &hstates[i];
18739a305230SLee Schermerhorn 		}
18749a305230SLee Schermerhorn 
18759a305230SLee Schermerhorn 	return kobj_to_node_hstate(kobj, nidp);
1876a3437870SNishanth Aravamudan }
1877a3437870SNishanth Aravamudan 
187806808b08SLee Schermerhorn static ssize_t nr_hugepages_show_common(struct kobject *kobj,
1879a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
1880a3437870SNishanth Aravamudan {
18819a305230SLee Schermerhorn 	struct hstate *h;
18829a305230SLee Schermerhorn 	unsigned long nr_huge_pages;
18839a305230SLee Schermerhorn 	int nid;
18849a305230SLee Schermerhorn 
18859a305230SLee Schermerhorn 	h = kobj_to_hstate(kobj, &nid);
18869a305230SLee Schermerhorn 	if (nid == NUMA_NO_NODE)
18879a305230SLee Schermerhorn 		nr_huge_pages = h->nr_huge_pages;
18889a305230SLee Schermerhorn 	else
18899a305230SLee Schermerhorn 		nr_huge_pages = h->nr_huge_pages_node[nid];
18909a305230SLee Schermerhorn 
18919a305230SLee Schermerhorn 	return sprintf(buf, "%lu\n", nr_huge_pages);
1892a3437870SNishanth Aravamudan }
1893adbe8726SEric B Munson 
1894238d3c13SDavid Rientjes static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
1895238d3c13SDavid Rientjes 					   struct hstate *h, int nid,
1896238d3c13SDavid Rientjes 					   unsigned long count, size_t len)
1897a3437870SNishanth Aravamudan {
1898a3437870SNishanth Aravamudan 	int err;
1899bad44b5bSDavid Rientjes 	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
1900a3437870SNishanth Aravamudan 
1901944d9fecSLuiz Capitulino 	if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
1902adbe8726SEric B Munson 		err = -EINVAL;
1903adbe8726SEric B Munson 		goto out;
1904adbe8726SEric B Munson 	}
1905adbe8726SEric B Munson 
19069a305230SLee Schermerhorn 	if (nid == NUMA_NO_NODE) {
19079a305230SLee Schermerhorn 		/*
19089a305230SLee Schermerhorn 		 * global hstate attribute
19099a305230SLee Schermerhorn 		 */
19109a305230SLee Schermerhorn 		if (!(obey_mempolicy &&
19119a305230SLee Schermerhorn 				init_nodemask_of_mempolicy(nodes_allowed))) {
191206808b08SLee Schermerhorn 			NODEMASK_FREE(nodes_allowed);
19138cebfcd0SLai Jiangshan 			nodes_allowed = &node_states[N_MEMORY];
191406808b08SLee Schermerhorn 		}
19159a305230SLee Schermerhorn 	} else if (nodes_allowed) {
19169a305230SLee Schermerhorn 		/*
19179a305230SLee Schermerhorn 		 * per node hstate attribute: adjust count to global,
19189a305230SLee Schermerhorn 		 * but restrict alloc/free to the specified node.
19199a305230SLee Schermerhorn 		 */
19209a305230SLee Schermerhorn 		count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
19219a305230SLee Schermerhorn 		init_nodemask_of_node(nodes_allowed, nid);
19229a305230SLee Schermerhorn 	} else
19238cebfcd0SLai Jiangshan 		nodes_allowed = &node_states[N_MEMORY];
19249a305230SLee Schermerhorn 
192506808b08SLee Schermerhorn 	h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
1926a3437870SNishanth Aravamudan 
19278cebfcd0SLai Jiangshan 	if (nodes_allowed != &node_states[N_MEMORY])
192806808b08SLee Schermerhorn 		NODEMASK_FREE(nodes_allowed);
192906808b08SLee Schermerhorn 
193006808b08SLee Schermerhorn 	return len;
1931adbe8726SEric B Munson out:
1932adbe8726SEric B Munson 	NODEMASK_FREE(nodes_allowed);
1933adbe8726SEric B Munson 	return err;
193406808b08SLee Schermerhorn }
193506808b08SLee Schermerhorn 
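/*
 * Worked example (illustrative, not part of the original source): writing
 * 5 to a per-node nr_hugepages file when the global pool holds 8 pages,
 * 2 of them on that node, turns into a global target of 5 + (8 - 2) == 11
 * with nodes_allowed restricted to that node, so the node ends up with 5
 * pages and the other nodes are left untouched.
 */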
1936238d3c13SDavid Rientjes static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1937238d3c13SDavid Rientjes 					 struct kobject *kobj, const char *buf,
1938238d3c13SDavid Rientjes 					 size_t len)
1939238d3c13SDavid Rientjes {
1940238d3c13SDavid Rientjes 	struct hstate *h;
1941238d3c13SDavid Rientjes 	unsigned long count;
1942238d3c13SDavid Rientjes 	int nid;
1943238d3c13SDavid Rientjes 	int err;
1944238d3c13SDavid Rientjes 
1945238d3c13SDavid Rientjes 	err = kstrtoul(buf, 10, &count);
1946238d3c13SDavid Rientjes 	if (err)
1947238d3c13SDavid Rientjes 		return err;
1948238d3c13SDavid Rientjes 
1949238d3c13SDavid Rientjes 	h = kobj_to_hstate(kobj, &nid);
1950238d3c13SDavid Rientjes 	return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
1951238d3c13SDavid Rientjes }
1952238d3c13SDavid Rientjes 
195306808b08SLee Schermerhorn static ssize_t nr_hugepages_show(struct kobject *kobj,
195406808b08SLee Schermerhorn 				       struct kobj_attribute *attr, char *buf)
195506808b08SLee Schermerhorn {
195606808b08SLee Schermerhorn 	return nr_hugepages_show_common(kobj, attr, buf);
195706808b08SLee Schermerhorn }
195806808b08SLee Schermerhorn 
195906808b08SLee Schermerhorn static ssize_t nr_hugepages_store(struct kobject *kobj,
196006808b08SLee Schermerhorn 	       struct kobj_attribute *attr, const char *buf, size_t len)
196106808b08SLee Schermerhorn {
1962238d3c13SDavid Rientjes 	return nr_hugepages_store_common(false, kobj, buf, len);
1963a3437870SNishanth Aravamudan }
1964a3437870SNishanth Aravamudan HSTATE_ATTR(nr_hugepages);
1965a3437870SNishanth Aravamudan 
196606808b08SLee Schermerhorn #ifdef CONFIG_NUMA
196706808b08SLee Schermerhorn 
196806808b08SLee Schermerhorn /*
196906808b08SLee Schermerhorn  * hstate attribute for optionally mempolicy-based constraint on persistent
197006808b08SLee Schermerhorn  * huge page alloc/free.
197106808b08SLee Schermerhorn  */
197206808b08SLee Schermerhorn static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
197306808b08SLee Schermerhorn 				       struct kobj_attribute *attr, char *buf)
197406808b08SLee Schermerhorn {
197506808b08SLee Schermerhorn 	return nr_hugepages_show_common(kobj, attr, buf);
197606808b08SLee Schermerhorn }
197706808b08SLee Schermerhorn 
197806808b08SLee Schermerhorn static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
197906808b08SLee Schermerhorn 	       struct kobj_attribute *attr, const char *buf, size_t len)
198006808b08SLee Schermerhorn {
1981238d3c13SDavid Rientjes 	return nr_hugepages_store_common(true, kobj, buf, len);
198206808b08SLee Schermerhorn }
198306808b08SLee Schermerhorn HSTATE_ATTR(nr_hugepages_mempolicy);
198406808b08SLee Schermerhorn #endif
198506808b08SLee Schermerhorn 
198606808b08SLee Schermerhorn 
1987a3437870SNishanth Aravamudan static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1988a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
1989a3437870SNishanth Aravamudan {
19909a305230SLee Schermerhorn 	struct hstate *h = kobj_to_hstate(kobj, NULL);
1991a3437870SNishanth Aravamudan 	return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1992a3437870SNishanth Aravamudan }
1993adbe8726SEric B Munson 
1994a3437870SNishanth Aravamudan static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1995a3437870SNishanth Aravamudan 		struct kobj_attribute *attr, const char *buf, size_t count)
1996a3437870SNishanth Aravamudan {
1997a3437870SNishanth Aravamudan 	int err;
1998a3437870SNishanth Aravamudan 	unsigned long input;
19999a305230SLee Schermerhorn 	struct hstate *h = kobj_to_hstate(kobj, NULL);
2000a3437870SNishanth Aravamudan 
2001bae7f4aeSLuiz Capitulino 	if (hstate_is_gigantic(h))
2002adbe8726SEric B Munson 		return -EINVAL;
2003adbe8726SEric B Munson 
20043dbb95f7SJingoo Han 	err = kstrtoul(buf, 10, &input);
2005a3437870SNishanth Aravamudan 	if (err)
200673ae31e5SEric B Munson 		return err;
2007a3437870SNishanth Aravamudan 
2008a3437870SNishanth Aravamudan 	spin_lock(&hugetlb_lock);
2009a3437870SNishanth Aravamudan 	h->nr_overcommit_huge_pages = input;
2010a3437870SNishanth Aravamudan 	spin_unlock(&hugetlb_lock);
2011a3437870SNishanth Aravamudan 
2012a3437870SNishanth Aravamudan 	return count;
2013a3437870SNishanth Aravamudan }
2014a3437870SNishanth Aravamudan HSTATE_ATTR(nr_overcommit_hugepages);
2015a3437870SNishanth Aravamudan 
2016a3437870SNishanth Aravamudan static ssize_t free_hugepages_show(struct kobject *kobj,
2017a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
2018a3437870SNishanth Aravamudan {
20199a305230SLee Schermerhorn 	struct hstate *h;
20209a305230SLee Schermerhorn 	unsigned long free_huge_pages;
20219a305230SLee Schermerhorn 	int nid;
20229a305230SLee Schermerhorn 
20239a305230SLee Schermerhorn 	h = kobj_to_hstate(kobj, &nid);
20249a305230SLee Schermerhorn 	if (nid == NUMA_NO_NODE)
20259a305230SLee Schermerhorn 		free_huge_pages = h->free_huge_pages;
20269a305230SLee Schermerhorn 	else
20279a305230SLee Schermerhorn 		free_huge_pages = h->free_huge_pages_node[nid];
20289a305230SLee Schermerhorn 
20299a305230SLee Schermerhorn 	return sprintf(buf, "%lu\n", free_huge_pages);
2030a3437870SNishanth Aravamudan }
2031a3437870SNishanth Aravamudan HSTATE_ATTR_RO(free_hugepages);
2032a3437870SNishanth Aravamudan 
2033a3437870SNishanth Aravamudan static ssize_t resv_hugepages_show(struct kobject *kobj,
2034a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
2035a3437870SNishanth Aravamudan {
20369a305230SLee Schermerhorn 	struct hstate *h = kobj_to_hstate(kobj, NULL);
2037a3437870SNishanth Aravamudan 	return sprintf(buf, "%lu\n", h->resv_huge_pages);
2038a3437870SNishanth Aravamudan }
2039a3437870SNishanth Aravamudan HSTATE_ATTR_RO(resv_hugepages);
2040a3437870SNishanth Aravamudan 
2041a3437870SNishanth Aravamudan static ssize_t surplus_hugepages_show(struct kobject *kobj,
2042a3437870SNishanth Aravamudan 					struct kobj_attribute *attr, char *buf)
2043a3437870SNishanth Aravamudan {
20449a305230SLee Schermerhorn 	struct hstate *h;
20459a305230SLee Schermerhorn 	unsigned long surplus_huge_pages;
20469a305230SLee Schermerhorn 	int nid;
20479a305230SLee Schermerhorn 
20489a305230SLee Schermerhorn 	h = kobj_to_hstate(kobj, &nid);
20499a305230SLee Schermerhorn 	if (nid == NUMA_NO_NODE)
20509a305230SLee Schermerhorn 		surplus_huge_pages = h->surplus_huge_pages;
20519a305230SLee Schermerhorn 	else
20529a305230SLee Schermerhorn 		surplus_huge_pages = h->surplus_huge_pages_node[nid];
20539a305230SLee Schermerhorn 
20549a305230SLee Schermerhorn 	return sprintf(buf, "%lu\n", surplus_huge_pages);
2055a3437870SNishanth Aravamudan }
2056a3437870SNishanth Aravamudan HSTATE_ATTR_RO(surplus_hugepages);
2057a3437870SNishanth Aravamudan 
2058a3437870SNishanth Aravamudan static struct attribute *hstate_attrs[] = {
2059a3437870SNishanth Aravamudan 	&nr_hugepages_attr.attr,
2060a3437870SNishanth Aravamudan 	&nr_overcommit_hugepages_attr.attr,
2061a3437870SNishanth Aravamudan 	&free_hugepages_attr.attr,
2062a3437870SNishanth Aravamudan 	&resv_hugepages_attr.attr,
2063a3437870SNishanth Aravamudan 	&surplus_hugepages_attr.attr,
206406808b08SLee Schermerhorn #ifdef CONFIG_NUMA
206506808b08SLee Schermerhorn 	&nr_hugepages_mempolicy_attr.attr,
206606808b08SLee Schermerhorn #endif
2067a3437870SNishanth Aravamudan 	NULL,
2068a3437870SNishanth Aravamudan };
2069a3437870SNishanth Aravamudan 
2070a3437870SNishanth Aravamudan static struct attribute_group hstate_attr_group = {
2071a3437870SNishanth Aravamudan 	.attrs = hstate_attrs,
2072a3437870SNishanth Aravamudan };
2073a3437870SNishanth Aravamudan 
2074094e9539SJeff Mahoney static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
20759a305230SLee Schermerhorn 				    struct kobject **hstate_kobjs,
20769a305230SLee Schermerhorn 				    struct attribute_group *hstate_attr_group)
2077a3437870SNishanth Aravamudan {
2078a3437870SNishanth Aravamudan 	int retval;
2079972dc4deSAneesh Kumar K.V 	int hi = hstate_index(h);
2080a3437870SNishanth Aravamudan 
20819a305230SLee Schermerhorn 	hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
20829a305230SLee Schermerhorn 	if (!hstate_kobjs[hi])
2083a3437870SNishanth Aravamudan 		return -ENOMEM;
2084a3437870SNishanth Aravamudan 
20859a305230SLee Schermerhorn 	retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
2086a3437870SNishanth Aravamudan 	if (retval)
20879a305230SLee Schermerhorn 		kobject_put(hstate_kobjs[hi]);
2088a3437870SNishanth Aravamudan 
2089a3437870SNishanth Aravamudan 	return retval;
2090a3437870SNishanth Aravamudan }
2091a3437870SNishanth Aravamudan 
2092a3437870SNishanth Aravamudan static void __init hugetlb_sysfs_init(void)
2093a3437870SNishanth Aravamudan {
2094a3437870SNishanth Aravamudan 	struct hstate *h;
2095a3437870SNishanth Aravamudan 	int err;
2096a3437870SNishanth Aravamudan 
2097a3437870SNishanth Aravamudan 	hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
2098a3437870SNishanth Aravamudan 	if (!hugepages_kobj)
2099a3437870SNishanth Aravamudan 		return;
2100a3437870SNishanth Aravamudan 
2101a3437870SNishanth Aravamudan 	for_each_hstate(h) {
21029a305230SLee Schermerhorn 		err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
21039a305230SLee Schermerhorn 					 hstate_kobjs, &hstate_attr_group);
2104a3437870SNishanth Aravamudan 		if (err)
2105ffb22af5SAndrew Morton 			pr_err("Hugetlb: Unable to add hstate %s", h->name);
2106a3437870SNishanth Aravamudan 	}
2107a3437870SNishanth Aravamudan }
2108a3437870SNishanth Aravamudan 
21099a305230SLee Schermerhorn #ifdef CONFIG_NUMA
21109a305230SLee Schermerhorn 
21119a305230SLee Schermerhorn /*
21129a305230SLee Schermerhorn  * node_hstate/s - associate per node hstate attributes, via their kobjects,
211310fbcf4cSKay Sievers  * with node devices in node_devices[] using a parallel array.  The array
211410fbcf4cSKay Sievers  * index of a node device or node_hstate equals the node id.
211510fbcf4cSKay Sievers  * This is here to avoid any static dependency of the node device driver, in
21169a305230SLee Schermerhorn  * the base kernel, on the hugetlb module.
21179a305230SLee Schermerhorn  */
21189a305230SLee Schermerhorn struct node_hstate {
21199a305230SLee Schermerhorn 	struct kobject		*hugepages_kobj;
21209a305230SLee Schermerhorn 	struct kobject		*hstate_kobjs[HUGE_MAX_HSTATE];
21219a305230SLee Schermerhorn };
21229a305230SLee Schermerhorn struct node_hstate node_hstates[MAX_NUMNODES];
21239a305230SLee Schermerhorn 
21249a305230SLee Schermerhorn /*
212510fbcf4cSKay Sievers  * A subset of global hstate attributes for node devices
21269a305230SLee Schermerhorn  */
21279a305230SLee Schermerhorn static struct attribute *per_node_hstate_attrs[] = {
21289a305230SLee Schermerhorn 	&nr_hugepages_attr.attr,
21299a305230SLee Schermerhorn 	&free_hugepages_attr.attr,
21309a305230SLee Schermerhorn 	&surplus_hugepages_attr.attr,
21319a305230SLee Schermerhorn 	NULL,
21329a305230SLee Schermerhorn };
21339a305230SLee Schermerhorn 
21349a305230SLee Schermerhorn static struct attribute_group per_node_hstate_attr_group = {
21359a305230SLee Schermerhorn 	.attrs = per_node_hstate_attrs,
21369a305230SLee Schermerhorn };
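
/*
 * Editor's note (illustrative example, not part of this source file): with
 * the parallel-array scheme above, node_hstates[nid].hstate_kobjs[i] is the
 * kobject behind the per-node sysfs directory, typically
 * /sys/devices/system/node/node<nid>/hugepages/<hstates[i].name>/, which
 * exposes only the per-node subset of attributes (nr_hugepages,
 * free_hugepages, surplus_hugepages).  For a 2 MB hstate on node 0, for
 * example, resizing that node's pool is a write to
 * /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages.
 */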
21379a305230SLee Schermerhorn 
21389a305230SLee Schermerhorn /*
213910fbcf4cSKay Sievers  * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
21409a305230SLee Schermerhorn  * Returns node id via non-NULL nidp.
21419a305230SLee Schermerhorn  */
21429a305230SLee Schermerhorn static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
21439a305230SLee Schermerhorn {
21449a305230SLee Schermerhorn 	int nid;
21459a305230SLee Schermerhorn 
21469a305230SLee Schermerhorn 	for (nid = 0; nid < nr_node_ids; nid++) {
21479a305230SLee Schermerhorn 		struct node_hstate *nhs = &node_hstates[nid];
21489a305230SLee Schermerhorn 		int i;
21499a305230SLee Schermerhorn 		for (i = 0; i < HUGE_MAX_HSTATE; i++)
21509a305230SLee Schermerhorn 			if (nhs->hstate_kobjs[i] == kobj) {
21519a305230SLee Schermerhorn 				if (nidp)
21529a305230SLee Schermerhorn 					*nidp = nid;
21539a305230SLee Schermerhorn 				return &hstates[i];
21549a305230SLee Schermerhorn 			}
21559a305230SLee Schermerhorn 	}
21569a305230SLee Schermerhorn 
21579a305230SLee Schermerhorn 	BUG();
21589a305230SLee Schermerhorn 	return NULL;
21599a305230SLee Schermerhorn }
21609a305230SLee Schermerhorn 
21619a305230SLee Schermerhorn /*
216210fbcf4cSKay Sievers  * Unregister hstate attributes from a single node device.
21639a305230SLee Schermerhorn  * No-op if no hstate attributes attached.
21649a305230SLee Schermerhorn  */
21653cd8b44fSClaudiu Ghioc static void hugetlb_unregister_node(struct node *node)
21669a305230SLee Schermerhorn {
21679a305230SLee Schermerhorn 	struct hstate *h;
216810fbcf4cSKay Sievers 	struct node_hstate *nhs = &node_hstates[node->dev.id];
21699a305230SLee Schermerhorn 
21709a305230SLee Schermerhorn 	if (!nhs->hugepages_kobj)
21719b5e5d0fSLee Schermerhorn 		return;		/* no hstate attributes */
21729a305230SLee Schermerhorn 
2173972dc4deSAneesh Kumar K.V 	for_each_hstate(h) {
2174972dc4deSAneesh Kumar K.V 		int idx = hstate_index(h);
2175972dc4deSAneesh Kumar K.V 		if (nhs->hstate_kobjs[idx]) {
2176972dc4deSAneesh Kumar K.V 			kobject_put(nhs->hstate_kobjs[idx]);
2177972dc4deSAneesh Kumar K.V 			nhs->hstate_kobjs[idx] = NULL;
2178972dc4deSAneesh Kumar K.V 		}
21799a305230SLee Schermerhorn 	}
21809a305230SLee Schermerhorn 
21819a305230SLee Schermerhorn 	kobject_put(nhs->hugepages_kobj);
21829a305230SLee Schermerhorn 	nhs->hugepages_kobj = NULL;
21839a305230SLee Schermerhorn }
21849a305230SLee Schermerhorn 
21859a305230SLee Schermerhorn /*
218610fbcf4cSKay Sievers  * hugetlb module exit:  unregister hstate attributes from node devices
21879a305230SLee Schermerhorn  * that have them.
21889a305230SLee Schermerhorn  */
21899a305230SLee Schermerhorn static void hugetlb_unregister_all_nodes(void)
21909a305230SLee Schermerhorn {
21919a305230SLee Schermerhorn 	int nid;
21929a305230SLee Schermerhorn 
21939a305230SLee Schermerhorn 	/*
219410fbcf4cSKay Sievers 	 * disable node device registrations.
21959a305230SLee Schermerhorn 	 */
21969a305230SLee Schermerhorn 	register_hugetlbfs_with_node(NULL, NULL);
21979a305230SLee Schermerhorn 
21989a305230SLee Schermerhorn 	/*
21999a305230SLee Schermerhorn 	 * remove hstate attributes from any nodes that have them.
22009a305230SLee Schermerhorn 	 */
22019a305230SLee Schermerhorn 	for (nid = 0; nid < nr_node_ids; nid++)
22028732794bSWen Congyang 		hugetlb_unregister_node(node_devices[nid]);
22039a305230SLee Schermerhorn }
22049a305230SLee Schermerhorn 
22059a305230SLee Schermerhorn /*
220610fbcf4cSKay Sievers  * Register hstate attributes for a single node device.
22079a305230SLee Schermerhorn  * No-op if attributes already registered.
22089a305230SLee Schermerhorn  */
22093cd8b44fSClaudiu Ghioc static void hugetlb_register_node(struct node *node)
22109a305230SLee Schermerhorn {
22119a305230SLee Schermerhorn 	struct hstate *h;
221210fbcf4cSKay Sievers 	struct node_hstate *nhs = &node_hstates[node->dev.id];
22139a305230SLee Schermerhorn 	int err;
22149a305230SLee Schermerhorn 
22159a305230SLee Schermerhorn 	if (nhs->hugepages_kobj)
22169a305230SLee Schermerhorn 		return;		/* already allocated */
22179a305230SLee Schermerhorn 
22189a305230SLee Schermerhorn 	nhs->hugepages_kobj = kobject_create_and_add("hugepages",
221910fbcf4cSKay Sievers 							&node->dev.kobj);
22209a305230SLee Schermerhorn 	if (!nhs->hugepages_kobj)
22219a305230SLee Schermerhorn 		return;
22229a305230SLee Schermerhorn 
22239a305230SLee Schermerhorn 	for_each_hstate(h) {
22249a305230SLee Schermerhorn 		err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
22259a305230SLee Schermerhorn 						nhs->hstate_kobjs,
22269a305230SLee Schermerhorn 						&per_node_hstate_attr_group);
22279a305230SLee Schermerhorn 		if (err) {
2228ffb22af5SAndrew Morton 			pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
222910fbcf4cSKay Sievers 				h->name, node->dev.id);
22309a305230SLee Schermerhorn 			hugetlb_unregister_node(node);
22319a305230SLee Schermerhorn 			break;
22329a305230SLee Schermerhorn 		}
22339a305230SLee Schermerhorn 	}
22349a305230SLee Schermerhorn }
22359a305230SLee Schermerhorn 
22369a305230SLee Schermerhorn /*
22379b5e5d0fSLee Schermerhorn  * hugetlb init time:  register hstate attributes for all registered node
223810fbcf4cSKay Sievers  * devices of nodes that have memory.  All on-line nodes should have
223910fbcf4cSKay Sievers  * registered their associated device by this time.
22409a305230SLee Schermerhorn  */
22417d9ca000SLuiz Capitulino static void __init hugetlb_register_all_nodes(void)
22429a305230SLee Schermerhorn {
22439a305230SLee Schermerhorn 	int nid;
22449a305230SLee Schermerhorn 
22458cebfcd0SLai Jiangshan 	for_each_node_state(nid, N_MEMORY) {
22468732794bSWen Congyang 		struct node *node = node_devices[nid];
224710fbcf4cSKay Sievers 		if (node->dev.id == nid)
22489a305230SLee Schermerhorn 			hugetlb_register_node(node);
22499a305230SLee Schermerhorn 	}
22509a305230SLee Schermerhorn 
22519a305230SLee Schermerhorn 	/*
225210fbcf4cSKay Sievers 	 * Let the node device driver know we're here so it can
22539a305230SLee Schermerhorn 	 * [un]register hstate attributes on node hotplug.
22549a305230SLee Schermerhorn 	 */
22559a305230SLee Schermerhorn 	register_hugetlbfs_with_node(hugetlb_register_node,
22569a305230SLee Schermerhorn 				     hugetlb_unregister_node);
22579a305230SLee Schermerhorn }
22589a305230SLee Schermerhorn #else	/* !CONFIG_NUMA */
22599a305230SLee Schermerhorn 
22609a305230SLee Schermerhorn static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
22619a305230SLee Schermerhorn {
22629a305230SLee Schermerhorn 	BUG();
22639a305230SLee Schermerhorn 	if (nidp)
22649a305230SLee Schermerhorn 		*nidp = -1;
22659a305230SLee Schermerhorn 	return NULL;
22669a305230SLee Schermerhorn }
22679a305230SLee Schermerhorn 
22689a305230SLee Schermerhorn static void hugetlb_unregister_all_nodes(void) { }
22699a305230SLee Schermerhorn 
22709a305230SLee Schermerhorn static void hugetlb_register_all_nodes(void) { }
22719a305230SLee Schermerhorn 
22729a305230SLee Schermerhorn #endif
22739a305230SLee Schermerhorn 
2274a3437870SNishanth Aravamudan static void __exit hugetlb_exit(void)
2275a3437870SNishanth Aravamudan {
2276a3437870SNishanth Aravamudan 	struct hstate *h;
2277a3437870SNishanth Aravamudan 
22789a305230SLee Schermerhorn 	hugetlb_unregister_all_nodes();
22799a305230SLee Schermerhorn 
2280a3437870SNishanth Aravamudan 	for_each_hstate(h) {
2281972dc4deSAneesh Kumar K.V 		kobject_put(hstate_kobjs[hstate_index(h)]);
2282a3437870SNishanth Aravamudan 	}
2283a3437870SNishanth Aravamudan 
2284a3437870SNishanth Aravamudan 	kobject_put(hugepages_kobj);
22858382d914SDavidlohr Bueso 	kfree(htlb_fault_mutex_table);
2286a3437870SNishanth Aravamudan }
2287a3437870SNishanth Aravamudan module_exit(hugetlb_exit);
2288a3437870SNishanth Aravamudan 
2289a3437870SNishanth Aravamudan static int __init hugetlb_init(void)
2290a3437870SNishanth Aravamudan {
22918382d914SDavidlohr Bueso 	int i;
22928382d914SDavidlohr Bueso 
2293457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
22940ef89d25SBenjamin Herrenschmidt 		return 0;
2295a3437870SNishanth Aravamudan 
2296e11bfbfcSNick Piggin 	if (!size_to_hstate(default_hstate_size)) {
2297e11bfbfcSNick Piggin 		default_hstate_size = HPAGE_SIZE;
2298e11bfbfcSNick Piggin 		if (!size_to_hstate(default_hstate_size))
2299a3437870SNishanth Aravamudan 			hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
2300a3437870SNishanth Aravamudan 	}
2301972dc4deSAneesh Kumar K.V 	default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
2302e11bfbfcSNick Piggin 	if (default_hstate_max_huge_pages)
2303e11bfbfcSNick Piggin 		default_hstate.max_huge_pages = default_hstate_max_huge_pages;
2304a3437870SNishanth Aravamudan 
2305a3437870SNishanth Aravamudan 	hugetlb_init_hstates();
2306aa888a74SAndi Kleen 	gather_bootmem_prealloc();
2307a3437870SNishanth Aravamudan 	report_hugepages();
2308a3437870SNishanth Aravamudan 
2309a3437870SNishanth Aravamudan 	hugetlb_sysfs_init();
23109a305230SLee Schermerhorn 	hugetlb_register_all_nodes();
23117179e7bfSJianguo Wu 	hugetlb_cgroup_file_init();
23129a305230SLee Schermerhorn 
23138382d914SDavidlohr Bueso #ifdef CONFIG_SMP
23148382d914SDavidlohr Bueso 	num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
23158382d914SDavidlohr Bueso #else
23168382d914SDavidlohr Bueso 	num_fault_mutexes = 1;
23178382d914SDavidlohr Bueso #endif
23188382d914SDavidlohr Bueso 	htlb_fault_mutex_table =
23198382d914SDavidlohr Bueso 		kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
23208382d914SDavidlohr Bueso 	BUG_ON(!htlb_fault_mutex_table);
23218382d914SDavidlohr Bueso 
23228382d914SDavidlohr Bueso 	for (i = 0; i < num_fault_mutexes; i++)
23238382d914SDavidlohr Bueso 		mutex_init(&htlb_fault_mutex_table[i]);
2324a3437870SNishanth Aravamudan 	return 0;
2325a3437870SNishanth Aravamudan }
2326a3437870SNishanth Aravamudan module_init(hugetlb_init);
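
/*
 * Editor's note (worked example, not part of this source file): on SMP the
 * fault mutex table allocated in hugetlb_init() above holds
 * roundup_pow_of_two(8 * num_possible_cpus()) mutexes.  With, say, 6
 * possible CPUs that is roundup_pow_of_two(48) = 64 entries, i.e. a
 * kmalloc() of 64 * sizeof(struct mutex) bytes; on !SMP the table collapses
 * to a single mutex.
 */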
2327a3437870SNishanth Aravamudan 
2328a3437870SNishanth Aravamudan /* Should be called on processing a hugepagesz=... option */
2329a3437870SNishanth Aravamudan void __init hugetlb_add_hstate(unsigned order)
2330a3437870SNishanth Aravamudan {
2331a3437870SNishanth Aravamudan 	struct hstate *h;
23328faa8b07SAndi Kleen 	unsigned long i;
23338faa8b07SAndi Kleen 
2334a3437870SNishanth Aravamudan 	if (size_to_hstate(PAGE_SIZE << order)) {
2335ffb22af5SAndrew Morton 		pr_warning("hugepagesz= specified twice, ignoring\n");
2336a3437870SNishanth Aravamudan 		return;
2337a3437870SNishanth Aravamudan 	}
233847d38344SAneesh Kumar K.V 	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
2339a3437870SNishanth Aravamudan 	BUG_ON(order == 0);
234047d38344SAneesh Kumar K.V 	h = &hstates[hugetlb_max_hstate++];
2341a3437870SNishanth Aravamudan 	h->order = order;
2342a3437870SNishanth Aravamudan 	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
23438faa8b07SAndi Kleen 	h->nr_huge_pages = 0;
23448faa8b07SAndi Kleen 	h->free_huge_pages = 0;
23458faa8b07SAndi Kleen 	for (i = 0; i < MAX_NUMNODES; ++i)
23468faa8b07SAndi Kleen 		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
23470edaecfaSAneesh Kumar K.V 	INIT_LIST_HEAD(&h->hugepage_activelist);
23488cebfcd0SLai Jiangshan 	h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
23498cebfcd0SLai Jiangshan 	h->next_nid_to_free = first_node(node_states[N_MEMORY]);
2350a3437870SNishanth Aravamudan 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
2351a3437870SNishanth Aravamudan 					huge_page_size(h)/1024);
23528faa8b07SAndi Kleen 
2353a3437870SNishanth Aravamudan 	parsed_hstate = h;
2354a3437870SNishanth Aravamudan }
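
/*
 * Editor's note (worked example, not part of this source file): assuming a
 * 4 KB base page (PAGE_SHIFT == 12), hugetlb_add_hstate(9) above describes
 * a 2 MB huge page:
 *
 *	huge page size = PAGE_SIZE << 9            = 2 MB
 *	h->mask        = ~((1ULL << (9 + 12)) - 1) = 0xffffffffffe00000
 *	h->name        = "hugepages-2048kB"        (2097152 / 1024 = 2048)
 *
 * The order is usually handed in by architecture code parsing hugepagesz=,
 * or it is HUGETLB_PAGE_ORDER for the default hstate added from
 * hugetlb_init(); hugetlb_add_hstate() itself only records it.
 */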
2355a3437870SNishanth Aravamudan 
2356e11bfbfcSNick Piggin static int __init hugetlb_nrpages_setup(char *s)
2357a3437870SNishanth Aravamudan {
2358a3437870SNishanth Aravamudan 	unsigned long *mhp;
23598faa8b07SAndi Kleen 	static unsigned long *last_mhp;
2360a3437870SNishanth Aravamudan 
2361a3437870SNishanth Aravamudan 	/*
236247d38344SAneesh Kumar K.V 	 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
2363a3437870SNishanth Aravamudan 	 * so this hugepages= parameter goes to the "default hstate".
2364a3437870SNishanth Aravamudan 	 */
236547d38344SAneesh Kumar K.V 	if (!hugetlb_max_hstate)
2366a3437870SNishanth Aravamudan 		mhp = &default_hstate_max_huge_pages;
2367a3437870SNishanth Aravamudan 	else
2368a3437870SNishanth Aravamudan 		mhp = &parsed_hstate->max_huge_pages;
2369a3437870SNishanth Aravamudan 
23708faa8b07SAndi Kleen 	if (mhp == last_mhp) {
2371ffb22af5SAndrew Morton 		pr_warning("hugepages= specified twice without "
23728faa8b07SAndi Kleen 			   "interleaving hugepagesz=, ignoring\n");
23738faa8b07SAndi Kleen 		return 1;
23748faa8b07SAndi Kleen 	}
23758faa8b07SAndi Kleen 
2376a3437870SNishanth Aravamudan 	if (sscanf(s, "%lu", mhp) <= 0)
2377a3437870SNishanth Aravamudan 		*mhp = 0;
2378a3437870SNishanth Aravamudan 
23798faa8b07SAndi Kleen 	/*
23808faa8b07SAndi Kleen 	 * Global state is always initialized later in hugetlb_init.
23818faa8b07SAndi Kleen 	 * But we need to allocate pages for hstates of order >= MAX_ORDER
23828faa8b07SAndi Kleen 	 * here, early enough to still use the bootmem allocator.
23838faa8b07SAndi Kleen 	 */
238447d38344SAneesh Kumar K.V 	if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
23858faa8b07SAndi Kleen 		hugetlb_hstate_alloc_pages(parsed_hstate);
23868faa8b07SAndi Kleen 
23878faa8b07SAndi Kleen 	last_mhp = mhp;
23888faa8b07SAndi Kleen 
2389a3437870SNishanth Aravamudan 	return 1;
2390a3437870SNishanth Aravamudan }
2391e11bfbfcSNick Piggin __setup("hugepages=", hugetlb_nrpages_setup);
2392e11bfbfcSNick Piggin 
2393e11bfbfcSNick Piggin static int __init hugetlb_default_setup(char *s)
2394e11bfbfcSNick Piggin {
2395e11bfbfcSNick Piggin 	default_hstate_size = memparse(s, &s);
2396e11bfbfcSNick Piggin 	return 1;
2397e11bfbfcSNick Piggin }
2398e11bfbfcSNick Piggin __setup("default_hugepagesz=", hugetlb_default_setup);
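
/*
 * Editor's note (illustrative example, not part of this source file): the
 * early parameters handled here combine on the kernel command line roughly
 * like this, assuming an architecture that registers both 1 GB and 2 MB
 * hstates:
 *
 *	default_hugepagesz=1G hugepagesz=1G hugepages=4 hugepagesz=2M hugepages=512
 *
 * A hugepages= seen before any hugepagesz= is credited to the default
 * hstate via default_hstate_max_huge_pages; every later hugepages= applies
 * to the most recent parsed_hstate.  hugepagesz= itself is handled by
 * architecture code, which calls hugetlb_add_hstate() with the matching
 * order, and for gigantic sizes (order >= MAX_ORDER) the pages are taken
 * from bootmem at parse time, as noted in hugetlb_nrpages_setup() above.
 */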
2399a3437870SNishanth Aravamudan 
24008a213460SNishanth Aravamudan static unsigned int cpuset_mems_nr(unsigned int *array)
24018a213460SNishanth Aravamudan {
24028a213460SNishanth Aravamudan 	int node;
24038a213460SNishanth Aravamudan 	unsigned int nr = 0;
24048a213460SNishanth Aravamudan 
24058a213460SNishanth Aravamudan 	for_each_node_mask(node, cpuset_current_mems_allowed)
24068a213460SNishanth Aravamudan 		nr += array[node];
24078a213460SNishanth Aravamudan 
24088a213460SNishanth Aravamudan 	return nr;
24098a213460SNishanth Aravamudan }
24108a213460SNishanth Aravamudan 
24118a213460SNishanth Aravamudan #ifdef CONFIG_SYSCTL
241206808b08SLee Schermerhorn static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
241306808b08SLee Schermerhorn 			 struct ctl_table *table, int write,
241406808b08SLee Schermerhorn 			 void __user *buffer, size_t *length, loff_t *ppos)
24151da177e4SLinus Torvalds {
2416e5ff2159SAndi Kleen 	struct hstate *h = &default_hstate;
2417238d3c13SDavid Rientjes 	unsigned long tmp = h->max_huge_pages;
241808d4a246SMichal Hocko 	int ret;
2419e5ff2159SAndi Kleen 
2420457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
2421457c1b27SNishanth Aravamudan 		return -ENOTSUPP;
2422457c1b27SNishanth Aravamudan 
2423e5ff2159SAndi Kleen 	table->data = &tmp;
2424e5ff2159SAndi Kleen 	table->maxlen = sizeof(unsigned long);
242508d4a246SMichal Hocko 	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
242608d4a246SMichal Hocko 	if (ret)
242708d4a246SMichal Hocko 		goto out;
2428e5ff2159SAndi Kleen 
2429238d3c13SDavid Rientjes 	if (write)
2430238d3c13SDavid Rientjes 		ret = __nr_hugepages_store_common(obey_mempolicy, h,
2431238d3c13SDavid Rientjes 						  NUMA_NO_NODE, tmp, *length);
243208d4a246SMichal Hocko out:
243308d4a246SMichal Hocko 	return ret;
24341da177e4SLinus Torvalds }
2435396faf03SMel Gorman 
243606808b08SLee Schermerhorn int hugetlb_sysctl_handler(struct ctl_table *table, int write,
243706808b08SLee Schermerhorn 			  void __user *buffer, size_t *length, loff_t *ppos)
243806808b08SLee Schermerhorn {
243906808b08SLee Schermerhorn 
244006808b08SLee Schermerhorn 	return hugetlb_sysctl_handler_common(false, table, write,
244106808b08SLee Schermerhorn 							buffer, length, ppos);
244206808b08SLee Schermerhorn }
244306808b08SLee Schermerhorn 
244406808b08SLee Schermerhorn #ifdef CONFIG_NUMA
244506808b08SLee Schermerhorn int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
244606808b08SLee Schermerhorn 			  void __user *buffer, size_t *length, loff_t *ppos)
244706808b08SLee Schermerhorn {
244806808b08SLee Schermerhorn 	return hugetlb_sysctl_handler_common(true, table, write,
244906808b08SLee Schermerhorn 							buffer, length, ppos);
245006808b08SLee Schermerhorn }
245106808b08SLee Schermerhorn #endif /* CONFIG_NUMA */
245206808b08SLee Schermerhorn 
2453a3d0c6aaSNishanth Aravamudan int hugetlb_overcommit_handler(struct ctl_table *table, int write,
24548d65af78SAlexey Dobriyan 			void __user *buffer,
2455a3d0c6aaSNishanth Aravamudan 			size_t *length, loff_t *ppos)
2456a3d0c6aaSNishanth Aravamudan {
2457a5516438SAndi Kleen 	struct hstate *h = &default_hstate;
2458e5ff2159SAndi Kleen 	unsigned long tmp;
245908d4a246SMichal Hocko 	int ret;
2460e5ff2159SAndi Kleen 
2461457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
2462457c1b27SNishanth Aravamudan 		return -ENOTSUPP;
2463457c1b27SNishanth Aravamudan 
2464e5ff2159SAndi Kleen 	tmp = h->nr_overcommit_huge_pages;
2465e5ff2159SAndi Kleen 
2466bae7f4aeSLuiz Capitulino 	if (write && hstate_is_gigantic(h))
2467adbe8726SEric B Munson 		return -EINVAL;
2468adbe8726SEric B Munson 
2469e5ff2159SAndi Kleen 	table->data = &tmp;
2470e5ff2159SAndi Kleen 	table->maxlen = sizeof(unsigned long);
247108d4a246SMichal Hocko 	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
247208d4a246SMichal Hocko 	if (ret)
247308d4a246SMichal Hocko 		goto out;
2474e5ff2159SAndi Kleen 
2475e5ff2159SAndi Kleen 	if (write) {
2476064d9efeSNishanth Aravamudan 		spin_lock(&hugetlb_lock);
2477e5ff2159SAndi Kleen 		h->nr_overcommit_huge_pages = tmp;
2478a3d0c6aaSNishanth Aravamudan 		spin_unlock(&hugetlb_lock);
2479e5ff2159SAndi Kleen 	}
248008d4a246SMichal Hocko out:
248108d4a246SMichal Hocko 	return ret;
2482a3d0c6aaSNishanth Aravamudan }
2483a3d0c6aaSNishanth Aravamudan 
24841da177e4SLinus Torvalds #endif /* CONFIG_SYSCTL */
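
/*
 * Editor's note (illustrative sketch, not part of this source file): the
 * handlers above sit behind procfs sysctl files wired up in the kernel's
 * vm sysctl table: vm.nr_hugepages, vm.nr_hugepages_mempolicy (CONFIG_NUMA
 * only) and vm.nr_overcommit_hugepages.  Resizing the default hstate pool
 * from userspace, assuming the usual /proc mount and sufficient privilege,
 * is just:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		FILE *f = fopen("/proc/sys/vm/nr_hugepages", "w");
 *
 *		if (!f)
 *			return 1;
 *		fprintf(f, "128\n");	/* reaches hugetlb_sysctl_handler() */
 *		fclose(f);
 *		return 0;
 *	}
 */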
24851da177e4SLinus Torvalds 
2486e1759c21SAlexey Dobriyan void hugetlb_report_meminfo(struct seq_file *m)
24871da177e4SLinus Torvalds {
2488a5516438SAndi Kleen 	struct hstate *h = &default_hstate;
2489457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
2490457c1b27SNishanth Aravamudan 		return;
2491e1759c21SAlexey Dobriyan 	seq_printf(m,
24921da177e4SLinus Torvalds 			"HugePages_Total:   %5lu\n"
24931da177e4SLinus Torvalds 			"HugePages_Free:    %5lu\n"
2494b45b5bd6SDavid Gibson 			"HugePages_Rsvd:    %5lu\n"
24957893d1d5SAdam Litke 			"HugePages_Surp:    %5lu\n"
24964f98a2feSRik van Riel 			"Hugepagesize:   %8lu kB\n",
2497a5516438SAndi Kleen 			h->nr_huge_pages,
2498a5516438SAndi Kleen 			h->free_huge_pages,
2499a5516438SAndi Kleen 			h->resv_huge_pages,
2500a5516438SAndi Kleen 			h->surplus_huge_pages,
2501a5516438SAndi Kleen 			1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
25021da177e4SLinus Torvalds }
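
/*
 * Editor's note (example output, not part of this source file): for a
 * configuration with 512 free 2 MB huge pages and no reserved or surplus
 * pages, the seq_printf() above contributes these lines to /proc/meminfo:
 *
 *	HugePages_Total:     512
 *	HugePages_Free:      512
 *	HugePages_Rsvd:        0
 *	HugePages_Surp:        0
 *	Hugepagesize:       2048 kB
 */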
25031da177e4SLinus Torvalds 
25041da177e4SLinus Torvalds int hugetlb_report_node_meminfo(int nid, char *buf)
25051da177e4SLinus Torvalds {
2506a5516438SAndi Kleen 	struct hstate *h = &default_hstate;
2507457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
2508457c1b27SNishanth Aravamudan 		return 0;
25091da177e4SLinus Torvalds 	return sprintf(buf,
25101da177e4SLinus Torvalds 		"Node %d HugePages_Total: %5u\n"
2511a1de0919SNishanth Aravamudan 		"Node %d HugePages_Free:  %5u\n"
2512a1de0919SNishanth Aravamudan 		"Node %d HugePages_Surp:  %5u\n",
2513a5516438SAndi Kleen 		nid, h->nr_huge_pages_node[nid],
2514a5516438SAndi Kleen 		nid, h->free_huge_pages_node[nid],
2515a5516438SAndi Kleen 		nid, h->surplus_huge_pages_node[nid]);
25161da177e4SLinus Torvalds }
25171da177e4SLinus Torvalds 
2518949f7ec5SDavid Rientjes void hugetlb_show_meminfo(void)
2519949f7ec5SDavid Rientjes {
2520949f7ec5SDavid Rientjes 	struct hstate *h;
2521949f7ec5SDavid Rientjes 	int nid;
2522949f7ec5SDavid Rientjes 
2523457c1b27SNishanth Aravamudan 	if (!hugepages_supported())
2524457c1b27SNishanth Aravamudan 		return;
2525457c1b27SNishanth Aravamudan 
2526949f7ec5SDavid Rientjes 	for_each_node_state(nid, N_MEMORY)
2527949f7ec5SDavid Rientjes 		for_each_hstate(h)
2528949f7ec5SDavid Rientjes 			pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
2529949f7ec5SDavid Rientjes 				nid,
2530949f7ec5SDavid Rientjes 				h->nr_huge_pages_node[nid],
2531949f7ec5SDavid Rientjes 				h->free_huge_pages_node[nid],
2532949f7ec5SDavid Rientjes 				h->surplus_huge_pages_node[nid],
2533949f7ec5SDavid Rientjes 				1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
2534949f7ec5SDavid Rientjes }
2535949f7ec5SDavid Rientjes 
25361da177e4SLinus Torvalds /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
25371da177e4SLinus Torvalds unsigned long hugetlb_total_pages(void)
25381da177e4SLinus Torvalds {
2539d0028588SWanpeng Li 	struct hstate *h;
2540d0028588SWanpeng Li 	unsigned long nr_total_pages = 0;
2541d0028588SWanpeng Li 
2542d0028588SWanpeng Li 	for_each_hstate(h)
2543d0028588SWanpeng Li 		nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
2544d0028588SWanpeng Li 	return nr_total_pages;
25451da177e4SLinus Torvalds }
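
/*
 * Editor's note (worked example, not part of this source file): with 4 KB
 * base pages and a single 2 MB hstate holding 512 huge pages,
 * pages_per_huge_page() is 512, so hugetlb_total_pages() returns
 * 512 * 512 = 262144 PAGE_SIZE pages, i.e. 1 GB of physical memory.
 */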
25461da177e4SLinus Torvalds 
2547a5516438SAndi Kleen static int hugetlb_acct_memory(struct hstate *h, long delta)
2548fc1b8a73SMel Gorman {
2549fc1b8a73SMel Gorman 	int ret = -ENOMEM;
2550fc1b8a73SMel Gorman 
2551fc1b8a73SMel Gorman 	spin_lock(&hugetlb_lock);
2552fc1b8a73SMel Gorman 	/*
2553fc1b8a73SMel Gorman 	 * When cpuset is configured, it breaks the strict hugetlb page
2554fc1b8a73SMel Gorman 	 * reservation as the accounting is done on a global variable. Such
2555fc1b8a73SMel Gorman 	 * a reservation is completely rubbish in the presence of cpusets because
2556fc1b8a73SMel Gorman 	 * the reservation is not checked against page availability for the
2557fc1b8a73SMel Gorman 	 * current cpuset. An application can still be OOM'ed by the kernel
2558fc1b8a73SMel Gorman 	 * when there are no free hugetlb pages in the cpuset the task is in.
2559fc1b8a73SMel Gorman 	 * Enforcing strict accounting with cpusets is almost impossible (or
2560fc1b8a73SMel Gorman 	 * too ugly) because cpusets are so fluid that tasks or memory nodes
2561fc1b8a73SMel Gorman 	 * can be dynamically moved between them.
2562fc1b8a73SMel Gorman 	 *
2563fc1b8a73SMel Gorman 	 * The change of semantics for shared hugetlb mappings with cpusets is
2564fc1b8a73SMel Gorman 	 * undesirable. However, in order to preserve some of the semantics, we
2565fc1b8a73SMel Gorman 	 * fall back to checking against current free page availability as a
2566fc1b8a73SMel Gorman 	 * best effort, hopefully minimizing the impact of the semantic change
2567fc1b8a73SMel Gorman 	 * that cpusets introduce.
2568fc1b8a73SMel Gorman 	 */
2569fc1b8a73SMel Gorman 	if (delta > 0) {
2570a5516438SAndi Kleen 		if (gather_surplus_pages(h, delta) < 0)
2571fc1b8a73SMel Gorman 			goto out;
2572fc1b8a73SMel Gorman 
2573a5516438SAndi Kleen 		if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
2574a5516438SAndi Kleen 			return_unused_surplus_pages(h, delta);
2575fc1b8a73SMel Gorman 			goto out;
2576fc1b8a73SMel Gorman 		}
2577fc1b8a73SMel Gorman 	}
2578fc1b8a73SMel Gorman 
2579fc1b8a73SMel Gorman 	ret = 0;
2580fc1b8a73SMel Gorman 	if (delta < 0)
2581a5516438SAndi Kleen 		return_unused_surplus_pages(h, (unsigned long) -delta);
2582fc1b8a73SMel Gorman 
2583fc1b8a73SMel Gorman out:
2584fc1b8a73SMel Gorman 	spin_unlock(&hugetlb_lock);
2585fc1b8a73SMel Gorman 	return ret;
2586fc1b8a73SMel Gorman }
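
/*
 * Editor's note (worked example, not part of this source file): for
 * hugetlb_acct_memory() above, a reservation request of delta = 10 first
 * tries to top the pool up with surplus pages; if the task's cpuset then
 * has only, say, 8 free huge pages spread across its allowed nodes,
 * cpuset_mems_nr() makes the best-effort check fail, the surplus pages are
 * handed back and the caller sees -ENOMEM rather than a reservation that
 * could never be satisfied from within the cpuset.
 */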
2587fc1b8a73SMel Gorman 
258884afd99bSAndy Whitcroft static void hugetlb_vm_op_open(struct vm_area_struct *vma)
258984afd99bSAndy Whitcroft {
2590f522c3acSJoonsoo Kim 	struct resv_map *resv = vma_resv_map(vma);
259184afd99bSAndy Whitcroft 
259284afd99bSAndy Whitcroft 	/*
259384afd99bSAndy Whitcroft 	 * This new VMA should share its siblings reservation map if present.
259484afd99bSAndy Whitcroft 	 * This new VMA should share its sibling's reservation map if present.
259584afd99bSAndy Whitcroft 	 * it is being copied for another still existing VMA.  As that VMA
259625985edcSLucas De Marchi 	 * has a reference to the reservation map it cannot disappear until
259784afd99bSAndy Whitcroft 	 * after this open call completes.  It is therefore safe to take a
259884afd99bSAndy Whitcroft 	 * new reference here without additional locking.
259984afd99bSAndy Whitcroft 	 */
26004e35f483SJoonsoo Kim 	if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
2601f522c3acSJoonsoo Kim 		kref_get(&resv->refs);
260284afd99bSAndy Whitcroft }
260384afd99bSAndy Whitcroft 
2604a1e78772SMel Gorman static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2605a1e78772SMel Gorman {
2606a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
2607f522c3acSJoonsoo Kim 	struct resv_map *resv = vma_resv_map(vma);
260890481622SDavid Gibson 	struct hugepage_subpool *spool = subpool_vma(vma);
26094e35f483SJoonsoo Kim 	unsigned long reserve, start, end;
26101c5ecae3SMike Kravetz 	long gbl_reserve;
261184afd99bSAndy Whitcroft 
26124e35f483SJoonsoo Kim 	if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
26134e35f483SJoonsoo Kim 		return;
26144e35f483SJoonsoo Kim 
2615a5516438SAndi Kleen 	start = vma_hugecache_offset(h, vma, vma->vm_start);
2616a5516438SAndi Kleen 	end = vma_hugecache_offset(h, vma, vma->vm_end);
261784afd99bSAndy Whitcroft 
26184e35f483SJoonsoo Kim 	reserve = (end - start) - region_count(resv, start, end);
261984afd99bSAndy Whitcroft 
2620f031dd27SJoonsoo Kim 	kref_put(&resv->refs, resv_map_release);
262184afd99bSAndy Whitcroft 
26227251ff78SAdam Litke 	if (reserve) {
26231c5ecae3SMike Kravetz 		/*
26241c5ecae3SMike Kravetz 		 * Decrement reserve counts.  The global reserve count may be
26251c5ecae3SMike Kravetz 		 * adjusted if the subpool has a minimum size.
26261c5ecae3SMike Kravetz 		 */
26271c5ecae3SMike Kravetz 		gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
26281c5ecae3SMike Kravetz 		hugetlb_acct_memory(h, -gbl_reserve);
26297251ff78SAdam Litke 	}
2630a1e78772SMel Gorman }
2631a1e78772SMel Gorman 
26321da177e4SLinus Torvalds /*
26331da177e4SLinus Torvalds  * We cannot handle pagefaults against hugetlb pages at all.  They cause
26341da177e4SLinus Torvalds  * handle_mm_fault() to try to instantiate regular-sized pages in the
26351da177e4SLinus Torvalds  * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
26361da177e4SLinus Torvalds  * this far.
26371da177e4SLinus Torvalds  */
2638d0217ac0SNick Piggin static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
26391da177e4SLinus Torvalds {
26401da177e4SLinus Torvalds 	BUG();
2641d0217ac0SNick Piggin 	return 0;
26421da177e4SLinus Torvalds }
26431da177e4SLinus Torvalds 
2644f0f37e2fSAlexey Dobriyan const struct vm_operations_struct hugetlb_vm_ops = {
2645d0217ac0SNick Piggin 	.fault = hugetlb_vm_op_fault,
264684afd99bSAndy Whitcroft 	.open = hugetlb_vm_op_open,
2647a1e78772SMel Gorman 	.close = hugetlb_vm_op_close,
26481da177e4SLinus Torvalds };
26491da177e4SLinus Torvalds 
26501e8f889bSDavid Gibson static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
26511e8f889bSDavid Gibson 				int writable)
265263551ae0SDavid Gibson {
265363551ae0SDavid Gibson 	pte_t entry;
265463551ae0SDavid Gibson 
26551e8f889bSDavid Gibson 	if (writable) {
2656106c992aSGerald Schaefer 		entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
2657106c992aSGerald Schaefer 					 vma->vm_page_prot)));
265863551ae0SDavid Gibson 	} else {
2659106c992aSGerald Schaefer 		entry = huge_pte_wrprotect(mk_huge_pte(page,
2660106c992aSGerald Schaefer 					   vma->vm_page_prot));
266163551ae0SDavid Gibson 	}
266263551ae0SDavid Gibson 	entry = pte_mkyoung(entry);
266363551ae0SDavid Gibson 	entry = pte_mkhuge(entry);
2664d9ed9faaSChris Metcalf 	entry = arch_make_huge_pte(entry, vma, page, writable);
266563551ae0SDavid Gibson 
266663551ae0SDavid Gibson 	return entry;
266763551ae0SDavid Gibson }
266863551ae0SDavid Gibson 
26691e8f889bSDavid Gibson static void set_huge_ptep_writable(struct vm_area_struct *vma,
26701e8f889bSDavid Gibson 				   unsigned long address, pte_t *ptep)
26711e8f889bSDavid Gibson {
26721e8f889bSDavid Gibson 	pte_t entry;
26731e8f889bSDavid Gibson 
2674106c992aSGerald Schaefer 	entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
267532f84528SChris Forbes 	if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
26764b3073e1SRussell King 		update_mmu_cache(vma, address, ptep);
26771e8f889bSDavid Gibson }
26781e8f889bSDavid Gibson 
26794a705fefSNaoya Horiguchi static int is_hugetlb_entry_migration(pte_t pte)
26804a705fefSNaoya Horiguchi {
26814a705fefSNaoya Horiguchi 	swp_entry_t swp;
26824a705fefSNaoya Horiguchi 
26834a705fefSNaoya Horiguchi 	if (huge_pte_none(pte) || pte_present(pte))
26844a705fefSNaoya Horiguchi 		return 0;
26854a705fefSNaoya Horiguchi 	swp = pte_to_swp_entry(pte);
26864a705fefSNaoya Horiguchi 	if (non_swap_entry(swp) && is_migration_entry(swp))
26874a705fefSNaoya Horiguchi 		return 1;
26884a705fefSNaoya Horiguchi 	else
26894a705fefSNaoya Horiguchi 		return 0;
26904a705fefSNaoya Horiguchi }
26914a705fefSNaoya Horiguchi 
26924a705fefSNaoya Horiguchi static int is_hugetlb_entry_hwpoisoned(pte_t pte)
26934a705fefSNaoya Horiguchi {
26944a705fefSNaoya Horiguchi 	swp_entry_t swp;
26954a705fefSNaoya Horiguchi 
26964a705fefSNaoya Horiguchi 	if (huge_pte_none(pte) || pte_present(pte))
26974a705fefSNaoya Horiguchi 		return 0;
26984a705fefSNaoya Horiguchi 	swp = pte_to_swp_entry(pte);
26994a705fefSNaoya Horiguchi 	if (non_swap_entry(swp) && is_hwpoison_entry(swp))
27004a705fefSNaoya Horiguchi 		return 1;
27014a705fefSNaoya Horiguchi 	else
27024a705fefSNaoya Horiguchi 		return 0;
27034a705fefSNaoya Horiguchi }
27041e8f889bSDavid Gibson 
270563551ae0SDavid Gibson int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
270663551ae0SDavid Gibson 			    struct vm_area_struct *vma)
270763551ae0SDavid Gibson {
270863551ae0SDavid Gibson 	pte_t *src_pte, *dst_pte, entry;
270963551ae0SDavid Gibson 	struct page *ptepage;
27101c59827dSHugh Dickins 	unsigned long addr;
27111e8f889bSDavid Gibson 	int cow;
2712a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
2713a5516438SAndi Kleen 	unsigned long sz = huge_page_size(h);
2714e8569dd2SAndreas Sandberg 	unsigned long mmun_start;	/* For mmu_notifiers */
2715e8569dd2SAndreas Sandberg 	unsigned long mmun_end;		/* For mmu_notifiers */
2716e8569dd2SAndreas Sandberg 	int ret = 0;
27171e8f889bSDavid Gibson 
27181e8f889bSDavid Gibson 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
271963551ae0SDavid Gibson 
2720e8569dd2SAndreas Sandberg 	mmun_start = vma->vm_start;
2721e8569dd2SAndreas Sandberg 	mmun_end = vma->vm_end;
2722e8569dd2SAndreas Sandberg 	if (cow)
2723e8569dd2SAndreas Sandberg 		mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
2724e8569dd2SAndreas Sandberg 
2725a5516438SAndi Kleen 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
2726cb900f41SKirill A. Shutemov 		spinlock_t *src_ptl, *dst_ptl;
2727c74df32cSHugh Dickins 		src_pte = huge_pte_offset(src, addr);
2728c74df32cSHugh Dickins 		if (!src_pte)
2729c74df32cSHugh Dickins 			continue;
2730a5516438SAndi Kleen 		dst_pte = huge_pte_alloc(dst, addr, sz);
2731e8569dd2SAndreas Sandberg 		if (!dst_pte) {
2732e8569dd2SAndreas Sandberg 			ret = -ENOMEM;
2733e8569dd2SAndreas Sandberg 			break;
2734e8569dd2SAndreas Sandberg 		}
2735c5c99429SLarry Woodman 
2736c5c99429SLarry Woodman 		/* If the pagetables are shared don't copy or take references */
2737c5c99429SLarry Woodman 		if (dst_pte == src_pte)
2738c5c99429SLarry Woodman 			continue;
2739c5c99429SLarry Woodman 
2740cb900f41SKirill A. Shutemov 		dst_ptl = huge_pte_lock(h, dst, dst_pte);
2741cb900f41SKirill A. Shutemov 		src_ptl = huge_pte_lockptr(h, src, src_pte);
2742cb900f41SKirill A. Shutemov 		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
27434a705fefSNaoya Horiguchi 		entry = huge_ptep_get(src_pte);
27444a705fefSNaoya Horiguchi 		if (huge_pte_none(entry)) { /* skip none entry */
27454a705fefSNaoya Horiguchi 			;
27464a705fefSNaoya Horiguchi 		} else if (unlikely(is_hugetlb_entry_migration(entry) ||
27474a705fefSNaoya Horiguchi 				    is_hugetlb_entry_hwpoisoned(entry))) {
27484a705fefSNaoya Horiguchi 			swp_entry_t swp_entry = pte_to_swp_entry(entry);
27494a705fefSNaoya Horiguchi 
27504a705fefSNaoya Horiguchi 			if (is_write_migration_entry(swp_entry) && cow) {
27514a705fefSNaoya Horiguchi 				/*
27524a705fefSNaoya Horiguchi 				 * COW mappings require pages in both
27534a705fefSNaoya Horiguchi 				 * parent and child to be set to read.
27544a705fefSNaoya Horiguchi 				 */
27554a705fefSNaoya Horiguchi 				make_migration_entry_read(&swp_entry);
27564a705fefSNaoya Horiguchi 				entry = swp_entry_to_pte(swp_entry);
27574a705fefSNaoya Horiguchi 				set_huge_pte_at(src, addr, src_pte, entry);
27584a705fefSNaoya Horiguchi 			}
27594a705fefSNaoya Horiguchi 			set_huge_pte_at(dst, addr, dst_pte, entry);
27604a705fefSNaoya Horiguchi 		} else {
276134ee645eSJoerg Roedel 			if (cow) {
27627f2e9525SGerald Schaefer 				huge_ptep_set_wrprotect(src, addr, src_pte);
276334ee645eSJoerg Roedel 				mmu_notifier_invalidate_range(src, mmun_start,
276434ee645eSJoerg Roedel 								   mmun_end);
276534ee645eSJoerg Roedel 			}
27660253d634SNaoya Horiguchi 			entry = huge_ptep_get(src_pte);
276763551ae0SDavid Gibson 			ptepage = pte_page(entry);
276863551ae0SDavid Gibson 			get_page(ptepage);
27690fe6e20bSNaoya Horiguchi 			page_dup_rmap(ptepage);
277063551ae0SDavid Gibson 			set_huge_pte_at(dst, addr, dst_pte, entry);
27711c59827dSHugh Dickins 		}
2772cb900f41SKirill A. Shutemov 		spin_unlock(src_ptl);
2773cb900f41SKirill A. Shutemov 		spin_unlock(dst_ptl);
277463551ae0SDavid Gibson 	}
277563551ae0SDavid Gibson 
2776e8569dd2SAndreas Sandberg 	if (cow)
2777e8569dd2SAndreas Sandberg 		mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
2778e8569dd2SAndreas Sandberg 
2779e8569dd2SAndreas Sandberg 	return ret;
278063551ae0SDavid Gibson }
278163551ae0SDavid Gibson 
278224669e58SAneesh Kumar K.V void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
278324669e58SAneesh Kumar K.V 			    unsigned long start, unsigned long end,
278424669e58SAneesh Kumar K.V 			    struct page *ref_page)
278563551ae0SDavid Gibson {
278624669e58SAneesh Kumar K.V 	int force_flush = 0;
278763551ae0SDavid Gibson 	struct mm_struct *mm = vma->vm_mm;
278863551ae0SDavid Gibson 	unsigned long address;
2789c7546f8fSDavid Gibson 	pte_t *ptep;
279063551ae0SDavid Gibson 	pte_t pte;
2791cb900f41SKirill A. Shutemov 	spinlock_t *ptl;
279263551ae0SDavid Gibson 	struct page *page;
2793a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
2794a5516438SAndi Kleen 	unsigned long sz = huge_page_size(h);
27952ec74c3eSSagi Grimberg 	const unsigned long mmun_start = start;	/* For mmu_notifiers */
27962ec74c3eSSagi Grimberg 	const unsigned long mmun_end   = end;	/* For mmu_notifiers */
2797a5516438SAndi Kleen 
279863551ae0SDavid Gibson 	WARN_ON(!is_vm_hugetlb_page(vma));
2799a5516438SAndi Kleen 	BUG_ON(start & ~huge_page_mask(h));
2800a5516438SAndi Kleen 	BUG_ON(end & ~huge_page_mask(h));
280163551ae0SDavid Gibson 
280224669e58SAneesh Kumar K.V 	tlb_start_vma(tlb, vma);
28032ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2804569f48b8SHillf Danton 	address = start;
280524669e58SAneesh Kumar K.V again:
2806569f48b8SHillf Danton 	for (; address < end; address += sz) {
2807c7546f8fSDavid Gibson 		ptep = huge_pte_offset(mm, address);
2808c7546f8fSDavid Gibson 		if (!ptep)
2809c7546f8fSDavid Gibson 			continue;
2810c7546f8fSDavid Gibson 
2811cb900f41SKirill A. Shutemov 		ptl = huge_pte_lock(h, mm, ptep);
281239dde65cSChen, Kenneth W 		if (huge_pmd_unshare(mm, &address, ptep))
2813cb900f41SKirill A. Shutemov 			goto unlock;
281439dde65cSChen, Kenneth W 
28156629326bSHillf Danton 		pte = huge_ptep_get(ptep);
28166629326bSHillf Danton 		if (huge_pte_none(pte))
2817cb900f41SKirill A. Shutemov 			goto unlock;
28186629326bSHillf Danton 
28196629326bSHillf Danton 		/*
28209fbc1f63SNaoya Horiguchi 		 * A migrating or HWPoisoned hugepage is already
28219fbc1f63SNaoya Horiguchi 		 * unmapped and its refcount dropped, so just clear the pte here.
28226629326bSHillf Danton 		 */
28239fbc1f63SNaoya Horiguchi 		if (unlikely(!pte_present(pte))) {
2824106c992aSGerald Schaefer 			huge_pte_clear(mm, address, ptep);
2825cb900f41SKirill A. Shutemov 			goto unlock;
28268c4894c6SNaoya Horiguchi 		}
28276629326bSHillf Danton 
28286629326bSHillf Danton 		page = pte_page(pte);
282904f2cbe3SMel Gorman 		/*
283004f2cbe3SMel Gorman 		 * If a reference page is supplied, it is because a specific
283104f2cbe3SMel Gorman 		 * page is being unmapped, not a range. Ensure the page we
283204f2cbe3SMel Gorman 		 * are about to unmap is the actual page of interest.
283304f2cbe3SMel Gorman 		 */
283404f2cbe3SMel Gorman 		if (ref_page) {
283504f2cbe3SMel Gorman 			if (page != ref_page)
2836cb900f41SKirill A. Shutemov 				goto unlock;
283704f2cbe3SMel Gorman 
283804f2cbe3SMel Gorman 			/*
283904f2cbe3SMel Gorman 			 * Mark the VMA as having unmapped its page so that
284004f2cbe3SMel Gorman 			 * future faults in this VMA will fail rather than
284104f2cbe3SMel Gorman 			 * looking like data was lost
284204f2cbe3SMel Gorman 			 */
284304f2cbe3SMel Gorman 			set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
284404f2cbe3SMel Gorman 		}
284504f2cbe3SMel Gorman 
2846c7546f8fSDavid Gibson 		pte = huge_ptep_get_and_clear(mm, address, ptep);
284724669e58SAneesh Kumar K.V 		tlb_remove_tlb_entry(tlb, ptep, address);
2848106c992aSGerald Schaefer 		if (huge_pte_dirty(pte))
28496649a386SKen Chen 			set_page_dirty(page);
28509e81130bSHillf Danton 
285124669e58SAneesh Kumar K.V 		page_remove_rmap(page);
285224669e58SAneesh Kumar K.V 		force_flush = !__tlb_remove_page(tlb, page);
2853cb900f41SKirill A. Shutemov 		if (force_flush) {
2854569f48b8SHillf Danton 			address += sz;
2855cb900f41SKirill A. Shutemov 			spin_unlock(ptl);
28569e81130bSHillf Danton 			break;
285763551ae0SDavid Gibson 		}
2858cb900f41SKirill A. Shutemov 		/* Bail out after unmapping reference page if supplied */
2859cb900f41SKirill A. Shutemov 		if (ref_page) {
2860cb900f41SKirill A. Shutemov 			spin_unlock(ptl);
2861cb900f41SKirill A. Shutemov 			break;
2862cb900f41SKirill A. Shutemov 		}
2863cb900f41SKirill A. Shutemov unlock:
2864cb900f41SKirill A. Shutemov 		spin_unlock(ptl);
2865cb900f41SKirill A. Shutemov 	}
286624669e58SAneesh Kumar K.V 	/*
286724669e58SAneesh Kumar K.V 	 * mmu_gather ran out of room to batch pages, so we break out of
286824669e58SAneesh Kumar K.V 	 * the PTE lock to avoid doing the potentially expensive TLB invalidate
286924669e58SAneesh Kumar K.V 	 * and page-free while holding it.
287024669e58SAneesh Kumar K.V 	 */
287124669e58SAneesh Kumar K.V 	if (force_flush) {
287224669e58SAneesh Kumar K.V 		force_flush = 0;
287324669e58SAneesh Kumar K.V 		tlb_flush_mmu(tlb);
287424669e58SAneesh Kumar K.V 		if (address < end && !ref_page)
287524669e58SAneesh Kumar K.V 			goto again;
2876fe1668aeSChen, Kenneth W 	}
28772ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
287824669e58SAneesh Kumar K.V 	tlb_end_vma(tlb, vma);
28791da177e4SLinus Torvalds }
288063551ae0SDavid Gibson 
2881d833352aSMel Gorman void __unmap_hugepage_range_final(struct mmu_gather *tlb,
2882d833352aSMel Gorman 			  struct vm_area_struct *vma, unsigned long start,
2883d833352aSMel Gorman 			  unsigned long end, struct page *ref_page)
2884d833352aSMel Gorman {
2885d833352aSMel Gorman 	__unmap_hugepage_range(tlb, vma, start, end, ref_page);
2886d833352aSMel Gorman 
2887d833352aSMel Gorman 	/*
2888d833352aSMel Gorman 	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
2889d833352aSMel Gorman 	 * test will fail on a vma being torn down, and not grab a page table
2890d833352aSMel Gorman 	 * on its way out.  We're lucky that the flag has such an appropriate
2891d833352aSMel Gorman 	 * name, and can in fact be safely cleared here. We could clear it
2892d833352aSMel Gorman 	 * before the __unmap_hugepage_range above, but all that's necessary
2893c8c06efaSDavidlohr Bueso 	 * is to clear it before releasing the i_mmap_rwsem. This works
2894d833352aSMel Gorman 	 * because in the context this is called, the VMA is about to be
2895c8c06efaSDavidlohr Bueso 	 * destroyed and the i_mmap_rwsem is held.
2896d833352aSMel Gorman 	 */
2897d833352aSMel Gorman 	vma->vm_flags &= ~VM_MAYSHARE;
2898d833352aSMel Gorman }
2899d833352aSMel Gorman 
2900502717f4SChen, Kenneth W void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
290104f2cbe3SMel Gorman 			  unsigned long end, struct page *ref_page)
2902502717f4SChen, Kenneth W {
290324669e58SAneesh Kumar K.V 	struct mm_struct *mm;
290424669e58SAneesh Kumar K.V 	struct mmu_gather tlb;
290524669e58SAneesh Kumar K.V 
290624669e58SAneesh Kumar K.V 	mm = vma->vm_mm;
290724669e58SAneesh Kumar K.V 
29082b047252SLinus Torvalds 	tlb_gather_mmu(&tlb, mm, start, end);
290924669e58SAneesh Kumar K.V 	__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
291024669e58SAneesh Kumar K.V 	tlb_finish_mmu(&tlb, start, end);
2911502717f4SChen, Kenneth W }
2912502717f4SChen, Kenneth W 
291304f2cbe3SMel Gorman /*
291404f2cbe3SMel Gorman  * This is called when the original mapper is failing to COW a MAP_PRIVATE
291504f2cbe3SMel Gorman  * mapping it owns the reserve page for. The intention is to unmap the page
291604f2cbe3SMel Gorman  * from other VMAs and let the children be SIGKILLed if they are faulting the
291704f2cbe3SMel Gorman  * same region.
291804f2cbe3SMel Gorman  */
29192f4612afSDavidlohr Bueso static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
29202a4b3dedSHarvey Harrison 			      struct page *page, unsigned long address)
292104f2cbe3SMel Gorman {
29227526674dSAdam Litke 	struct hstate *h = hstate_vma(vma);
292304f2cbe3SMel Gorman 	struct vm_area_struct *iter_vma;
292404f2cbe3SMel Gorman 	struct address_space *mapping;
292504f2cbe3SMel Gorman 	pgoff_t pgoff;
292604f2cbe3SMel Gorman 
292704f2cbe3SMel Gorman 	/*
292704f2cbe3SMel Gorman 	 * vm_pgoff is in PAGE_SIZE units, hence the different calculation from
292804f2cbe3SMel Gorman 	 * the page cache lookup, which is in HPAGE_SIZE units (worked example below).
293004f2cbe3SMel Gorman 	 */
29317526674dSAdam Litke 	address = address & huge_page_mask(h);
293236e4f20aSMichal Hocko 	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
293336e4f20aSMichal Hocko 			vma->vm_pgoff;
2934496ad9aaSAl Viro 	mapping = file_inode(vma->vm_file)->i_mapping;
293504f2cbe3SMel Gorman 
29364eb2b1dcSMel Gorman 	/*
29374eb2b1dcSMel Gorman 	 * Take the mapping lock for the duration of the table walk. As
29384eb2b1dcSMel Gorman 	 * this mapping should be shared between all the VMAs,
29384eb2b1dcSMel Gorman 	 * __unmap_hugepage_range() is called, as the lock is already held.
29404eb2b1dcSMel Gorman 	 */
294183cde9e8SDavidlohr Bueso 	i_mmap_lock_write(mapping);
29426b2dbba8SMichel Lespinasse 	vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
294304f2cbe3SMel Gorman 		/* Do not unmap the current VMA */
294404f2cbe3SMel Gorman 		if (iter_vma == vma)
294504f2cbe3SMel Gorman 			continue;
294604f2cbe3SMel Gorman 
294704f2cbe3SMel Gorman 		/*
294804f2cbe3SMel Gorman 		 * Unmap the page from other VMAs without their own reserves.
294904f2cbe3SMel Gorman 		 * They get marked to be SIGKILLed if they fault in these
295004f2cbe3SMel Gorman 		 * areas. This is because a future no-page fault on this VMA
295104f2cbe3SMel Gorman 		 * could insert a zeroed page instead of the data existing
295204f2cbe3SMel Gorman 		 * from the time of fork. This would look like data corruption
295304f2cbe3SMel Gorman 		 */
295404f2cbe3SMel Gorman 		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
295524669e58SAneesh Kumar K.V 			unmap_hugepage_range(iter_vma, address,
295624669e58SAneesh Kumar K.V 					     address + huge_page_size(h), page);
295704f2cbe3SMel Gorman 	}
295883cde9e8SDavidlohr Bueso 	i_mmap_unlock_write(mapping);
295904f2cbe3SMel Gorman }
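
/*
 * Editor's note (worked example, not part of this source file): for the
 * pgoff calculation in unmap_ref_private() above, take a 2 MB hstate, 4 KB
 * base pages, vma->vm_pgoff == 0 and a faulting address 4 MB into the VMA.
 * The interval-tree index used above is (4 MB >> PAGE_SHIFT) = 1024, in
 * PAGE_SIZE units, whereas the same page's index in the hugetlbfs page
 * cache, as used by hugetlbfs_pagecache_page() below, is 4 MB >> 21 = 2,
 * in huge-page units.  That unit mismatch is why the two lookups cannot
 * share an index.
 */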
296004f2cbe3SMel Gorman 
29610fe6e20bSNaoya Horiguchi /*
29620fe6e20bSNaoya Horiguchi  * Hugetlb_cow() should be called with the page lock of the original hugepage held.
2963ef009b25SMichal Hocko  * Called with hugetlb_instantiation_mutex held and pte_page locked so we
2964ef009b25SMichal Hocko  * cannot race with other handlers or page migration.
2965ef009b25SMichal Hocko  * Keep the pte_same checks anyway to make the transition from the mutex easier.
29660fe6e20bSNaoya Horiguchi  */
29671e8f889bSDavid Gibson static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
296804f2cbe3SMel Gorman 			unsigned long address, pte_t *ptep, pte_t pte,
2969cb900f41SKirill A. Shutemov 			struct page *pagecache_page, spinlock_t *ptl)
29701e8f889bSDavid Gibson {
2971a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
29721e8f889bSDavid Gibson 	struct page *old_page, *new_page;
2973ad4404a2SDavidlohr Bueso 	int ret = 0, outside_reserve = 0;
29742ec74c3eSSagi Grimberg 	unsigned long mmun_start;	/* For mmu_notifiers */
29752ec74c3eSSagi Grimberg 	unsigned long mmun_end;		/* For mmu_notifiers */
29761e8f889bSDavid Gibson 
29771e8f889bSDavid Gibson 	old_page = pte_page(pte);
29781e8f889bSDavid Gibson 
297904f2cbe3SMel Gorman retry_avoidcopy:
29801e8f889bSDavid Gibson 	/* If no-one else is actually using this page, avoid the copy
29811e8f889bSDavid Gibson 	 * and just make the page writable */
298237a2140dSJoonsoo Kim 	if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
29830fe6e20bSNaoya Horiguchi 		page_move_anon_rmap(old_page, vma, address);
29841e8f889bSDavid Gibson 		set_huge_ptep_writable(vma, address, ptep);
298583c54070SNick Piggin 		return 0;
29861e8f889bSDavid Gibson 	}
29871e8f889bSDavid Gibson 
298804f2cbe3SMel Gorman 	/*
298904f2cbe3SMel Gorman 	 * If the process that created a MAP_PRIVATE mapping is about to
299004f2cbe3SMel Gorman 	 * perform a COW due to a shared page count, attempt to satisfy
299104f2cbe3SMel Gorman 	 * the allocation without using the existing reserves. The pagecache
299204f2cbe3SMel Gorman 	 * page is used to determine if the reserve at this address was
299304f2cbe3SMel Gorman 	 * consumed or not. If reserves were used, a partial faulted mapping
299404f2cbe3SMel Gorman 	 * at the time of fork() could consume its reserves on COW instead
299504f2cbe3SMel Gorman 	 * of the full address range.
299604f2cbe3SMel Gorman 	 */
29975944d011SJoonsoo Kim 	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
299804f2cbe3SMel Gorman 			old_page != pagecache_page)
299904f2cbe3SMel Gorman 		outside_reserve = 1;
300004f2cbe3SMel Gorman 
30011e8f889bSDavid Gibson 	page_cache_get(old_page);
3002b76c8cfbSLarry Woodman 
3003ad4404a2SDavidlohr Bueso 	/*
3004ad4404a2SDavidlohr Bueso 	 * Drop page table lock as buddy allocator may be called. It will
3005ad4404a2SDavidlohr Bueso 	 * be acquired again before returning to the caller, as expected.
3006ad4404a2SDavidlohr Bueso 	 */
3007cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
300804f2cbe3SMel Gorman 	new_page = alloc_huge_page(vma, address, outside_reserve);
30091e8f889bSDavid Gibson 
30102fc39cecSAdam Litke 	if (IS_ERR(new_page)) {
301104f2cbe3SMel Gorman 		/*
301204f2cbe3SMel Gorman 		 * If a process owning a MAP_PRIVATE mapping fails to COW,
301304f2cbe3SMel Gorman 		 * it is due to references held by a child and an insufficient
301404f2cbe3SMel Gorman 		 * huge page pool. To guarantee the original mapper's
301504f2cbe3SMel Gorman 		 * reliability, unmap the page from child processes. The child
301604f2cbe3SMel Gorman 		 * may get SIGKILLed if it later faults.
301704f2cbe3SMel Gorman 		 */
301804f2cbe3SMel Gorman 		if (outside_reserve) {
3019ad4404a2SDavidlohr Bueso 			page_cache_release(old_page);
302004f2cbe3SMel Gorman 			BUG_ON(huge_pte_none(pte));
30212f4612afSDavidlohr Bueso 			unmap_ref_private(mm, vma, old_page, address);
302204f2cbe3SMel Gorman 			BUG_ON(huge_pte_none(pte));
3023cb900f41SKirill A. Shutemov 			spin_lock(ptl);
3024a734bcc8SHillf Danton 			ptep = huge_pte_offset(mm, address & huge_page_mask(h));
3025a9af0c5dSNaoya Horiguchi 			if (likely(ptep &&
3026a9af0c5dSNaoya Horiguchi 				   pte_same(huge_ptep_get(ptep), pte)))
302704f2cbe3SMel Gorman 				goto retry_avoidcopy;
3028a734bcc8SHillf Danton 			/*
3029cb900f41SKirill A. Shutemov 			 * A race occurred while re-acquiring the page table
3030cb900f41SKirill A. Shutemov 			 * lock, and our job is done.
3031a734bcc8SHillf Danton 			 */
3032a734bcc8SHillf Danton 			return 0;
303304f2cbe3SMel Gorman 		}
303404f2cbe3SMel Gorman 
3035ad4404a2SDavidlohr Bueso 		ret = (PTR_ERR(new_page) == -ENOMEM) ?
3036ad4404a2SDavidlohr Bueso 			VM_FAULT_OOM : VM_FAULT_SIGBUS;
3037ad4404a2SDavidlohr Bueso 		goto out_release_old;
30381e8f889bSDavid Gibson 	}
30391e8f889bSDavid Gibson 
30400fe6e20bSNaoya Horiguchi 	/*
30410fe6e20bSNaoya Horiguchi 	 * When the original hugepage is a shared one, it does not have
30420fe6e20bSNaoya Horiguchi 	 * an anon_vma prepared.
30430fe6e20bSNaoya Horiguchi 	 */
304444e2aa93SDean Nelson 	if (unlikely(anon_vma_prepare(vma))) {
3045ad4404a2SDavidlohr Bueso 		ret = VM_FAULT_OOM;
3046ad4404a2SDavidlohr Bueso 		goto out_release_all;
304744e2aa93SDean Nelson 	}
30480fe6e20bSNaoya Horiguchi 
304947ad8475SAndrea Arcangeli 	copy_user_huge_page(new_page, old_page, address, vma,
305047ad8475SAndrea Arcangeli 			    pages_per_huge_page(h));
30510ed361deSNick Piggin 	__SetPageUptodate(new_page);
3052bcc54222SNaoya Horiguchi 	set_page_huge_active(new_page);
30531e8f889bSDavid Gibson 
30542ec74c3eSSagi Grimberg 	mmun_start = address & huge_page_mask(h);
30552ec74c3eSSagi Grimberg 	mmun_end = mmun_start + huge_page_size(h);
30562ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
3057ad4404a2SDavidlohr Bueso 
3058b76c8cfbSLarry Woodman 	/*
3059cb900f41SKirill A. Shutemov 	 * Retake the page table lock to check for racing updates
3060b76c8cfbSLarry Woodman 	 * before the page tables are altered
3061b76c8cfbSLarry Woodman 	 */
3062cb900f41SKirill A. Shutemov 	spin_lock(ptl);
3063a5516438SAndi Kleen 	ptep = huge_pte_offset(mm, address & huge_page_mask(h));
3064a9af0c5dSNaoya Horiguchi 	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
306507443a85SJoonsoo Kim 		ClearPagePrivate(new_page);
306607443a85SJoonsoo Kim 
30671e8f889bSDavid Gibson 		/* Break COW */
30688fe627ecSGerald Schaefer 		huge_ptep_clear_flush(vma, address, ptep);
306934ee645eSJoerg Roedel 		mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
30701e8f889bSDavid Gibson 		set_huge_pte_at(mm, address, ptep,
30711e8f889bSDavid Gibson 				make_huge_pte(vma, new_page, 1));
30720fe6e20bSNaoya Horiguchi 		page_remove_rmap(old_page);
3073cd67f0d2SNaoya Horiguchi 		hugepage_add_new_anon_rmap(new_page, vma, address);
30741e8f889bSDavid Gibson 		/* Make the old page be freed below */
30751e8f889bSDavid Gibson 		new_page = old_page;
30761e8f889bSDavid Gibson 	}
3077cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
30782ec74c3eSSagi Grimberg 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
3079ad4404a2SDavidlohr Bueso out_release_all:
30801e8f889bSDavid Gibson 	page_cache_release(new_page);
3081ad4404a2SDavidlohr Bueso out_release_old:
30821e8f889bSDavid Gibson 	page_cache_release(old_page);
30838312034fSJoonsoo Kim 
3084ad4404a2SDavidlohr Bueso 	spin_lock(ptl); /* Caller expects lock to be held */
3085ad4404a2SDavidlohr Bueso 	return ret;
30861e8f889bSDavid Gibson }
30871e8f889bSDavid Gibson 
308804f2cbe3SMel Gorman /* Return the pagecache page at a given address within a VMA */
3089a5516438SAndi Kleen static struct page *hugetlbfs_pagecache_page(struct hstate *h,
3090a5516438SAndi Kleen 			struct vm_area_struct *vma, unsigned long address)
309104f2cbe3SMel Gorman {
309204f2cbe3SMel Gorman 	struct address_space *mapping;
3093e7c4b0bfSAndy Whitcroft 	pgoff_t idx;
309404f2cbe3SMel Gorman 
309504f2cbe3SMel Gorman 	mapping = vma->vm_file->f_mapping;
3096a5516438SAndi Kleen 	idx = vma_hugecache_offset(h, vma, address);
309704f2cbe3SMel Gorman 
309804f2cbe3SMel Gorman 	return find_lock_page(mapping, idx);
309904f2cbe3SMel Gorman }
310004f2cbe3SMel Gorman 
31013ae77f43SHugh Dickins /*
31023ae77f43SHugh Dickins  * Return whether there is a pagecache page to back given address within VMA.
31033ae77f43SHugh Dickins  * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
31043ae77f43SHugh Dickins  */
31053ae77f43SHugh Dickins static bool hugetlbfs_pagecache_present(struct hstate *h,
31062a15efc9SHugh Dickins 			struct vm_area_struct *vma, unsigned long address)
31072a15efc9SHugh Dickins {
31082a15efc9SHugh Dickins 	struct address_space *mapping;
31092a15efc9SHugh Dickins 	pgoff_t idx;
31102a15efc9SHugh Dickins 	struct page *page;
31112a15efc9SHugh Dickins 
31122a15efc9SHugh Dickins 	mapping = vma->vm_file->f_mapping;
31132a15efc9SHugh Dickins 	idx = vma_hugecache_offset(h, vma, address);
31142a15efc9SHugh Dickins 
31152a15efc9SHugh Dickins 	page = find_get_page(mapping, idx);
31162a15efc9SHugh Dickins 	if (page)
31172a15efc9SHugh Dickins 		put_page(page);
31182a15efc9SHugh Dickins 	return page != NULL;
31192a15efc9SHugh Dickins }
31202a15efc9SHugh Dickins 
3121a1ed3ddaSRobert P. J. Day static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
31228382d914SDavidlohr Bueso 			   struct address_space *mapping, pgoff_t idx,
3123788c7df4SHugh Dickins 			   unsigned long address, pte_t *ptep, unsigned int flags)
3124ac9b9c66SHugh Dickins {
3125a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
3126ac9b9c66SHugh Dickins 	int ret = VM_FAULT_SIGBUS;
3127409eb8c2SHillf Danton 	int anon_rmap = 0;
31284c887265SAdam Litke 	unsigned long size;
31294c887265SAdam Litke 	struct page *page;
31301e8f889bSDavid Gibson 	pte_t new_pte;
3131cb900f41SKirill A. Shutemov 	spinlock_t *ptl;
31324c887265SAdam Litke 
313304f2cbe3SMel Gorman 	/*
313404f2cbe3SMel Gorman 	 * Currently, we are forced to kill the process in the event the
313504f2cbe3SMel Gorman 	 * original mapper has unmapped pages from the child due to a failed
313625985edcSLucas De Marchi 	 * COW. Warn that such a situation has occurred, as it may not be obvious.
313704f2cbe3SMel Gorman 	 */
313804f2cbe3SMel Gorman 	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
3139ffb22af5SAndrew Morton 		pr_warning("PID %d killed due to inadequate hugepage pool\n",
314004f2cbe3SMel Gorman 			   current->pid);
314104f2cbe3SMel Gorman 		return ret;
314204f2cbe3SMel Gorman 	}
314304f2cbe3SMel Gorman 
31444c887265SAdam Litke 	/*
31454c887265SAdam Litke 	 * Use page lock to guard against racing truncation
31464c887265SAdam Litke 	 * before we get page_table_lock.
31474c887265SAdam Litke 	 */
31486bda666aSChristoph Lameter retry:
31496bda666aSChristoph Lameter 	page = find_lock_page(mapping, idx);
31506bda666aSChristoph Lameter 	if (!page) {
3151a5516438SAndi Kleen 		size = i_size_read(mapping->host) >> huge_page_shift(h);
3152ebed4bfcSHugh Dickins 		if (idx >= size)
3153ebed4bfcSHugh Dickins 			goto out;
315404f2cbe3SMel Gorman 		page = alloc_huge_page(vma, address, 0);
31552fc39cecSAdam Litke 		if (IS_ERR(page)) {
315676dcee75SAneesh Kumar K.V 			ret = PTR_ERR(page);
315776dcee75SAneesh Kumar K.V 			if (ret == -ENOMEM)
315876dcee75SAneesh Kumar K.V 				ret = VM_FAULT_OOM;
315976dcee75SAneesh Kumar K.V 			else
316076dcee75SAneesh Kumar K.V 				ret = VM_FAULT_SIGBUS;
31616bda666aSChristoph Lameter 			goto out;
31626bda666aSChristoph Lameter 		}
316347ad8475SAndrea Arcangeli 		clear_huge_page(page, address, pages_per_huge_page(h));
31640ed361deSNick Piggin 		__SetPageUptodate(page);
3165bcc54222SNaoya Horiguchi 		set_page_huge_active(page);
3166ac9b9c66SHugh Dickins 
3167f83a275dSMel Gorman 		if (vma->vm_flags & VM_MAYSHARE) {
31686bda666aSChristoph Lameter 			int err;
316945c682a6SKen Chen 			struct inode *inode = mapping->host;
31706bda666aSChristoph Lameter 
31716bda666aSChristoph Lameter 			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
31726bda666aSChristoph Lameter 			if (err) {
31736bda666aSChristoph Lameter 				put_page(page);
31746bda666aSChristoph Lameter 				if (err == -EEXIST)
31756bda666aSChristoph Lameter 					goto retry;
31766bda666aSChristoph Lameter 				goto out;
31776bda666aSChristoph Lameter 			}
317807443a85SJoonsoo Kim 			ClearPagePrivate(page);
317945c682a6SKen Chen 
318045c682a6SKen Chen 			spin_lock(&inode->i_lock);
3181a5516438SAndi Kleen 			inode->i_blocks += blocks_per_huge_page(h);
318245c682a6SKen Chen 			spin_unlock(&inode->i_lock);
318323be7468SMel Gorman 		} else {
31846bda666aSChristoph Lameter 			lock_page(page);
31850fe6e20bSNaoya Horiguchi 			if (unlikely(anon_vma_prepare(vma))) {
31860fe6e20bSNaoya Horiguchi 				ret = VM_FAULT_OOM;
31870fe6e20bSNaoya Horiguchi 				goto backout_unlocked;
318823be7468SMel Gorman 			}
3189409eb8c2SHillf Danton 			anon_rmap = 1;
31900fe6e20bSNaoya Horiguchi 		}
31910fe6e20bSNaoya Horiguchi 	} else {
319257303d80SAndy Whitcroft 		/*
3193998b4382SNaoya Horiguchi 		 * If a memory error occurs between mmap() and fault, some processes
3194998b4382SNaoya Horiguchi 		 * don't have a hwpoisoned swap entry for the errored virtual address.
3195998b4382SNaoya Horiguchi 		 * So we need to block the hugepage fault with a PG_hwpoison bit check.
3196fd6a03edSNaoya Horiguchi 		 */
3197fd6a03edSNaoya Horiguchi 		if (unlikely(PageHWPoison(page))) {
3198aa50d3a7SAndi Kleen 			ret = VM_FAULT_HWPOISON |
3199972dc4deSAneesh Kumar K.V 				VM_FAULT_SET_HINDEX(hstate_index(h));
3200fd6a03edSNaoya Horiguchi 			goto backout_unlocked;
32016bda666aSChristoph Lameter 		}
3202998b4382SNaoya Horiguchi 	}
32031e8f889bSDavid Gibson 
320457303d80SAndy Whitcroft 	/*
320557303d80SAndy Whitcroft 	 * If we are going to COW a private mapping later, we examine the
320657303d80SAndy Whitcroft 	 * pending reservations for this page now. This will ensure that
320757303d80SAndy Whitcroft 	 * any allocations necessary to record that reservation occur outside
320857303d80SAndy Whitcroft 	 * the spinlock.
320957303d80SAndy Whitcroft 	 */
3210788c7df4SHugh Dickins 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
32112b26736cSAndy Whitcroft 		if (vma_needs_reservation(h, vma, address) < 0) {
32122b26736cSAndy Whitcroft 			ret = VM_FAULT_OOM;
32132b26736cSAndy Whitcroft 			goto backout_unlocked;
32142b26736cSAndy Whitcroft 		}
321557303d80SAndy Whitcroft 
3216cb900f41SKirill A. Shutemov 	ptl = huge_pte_lockptr(h, mm, ptep);
3217cb900f41SKirill A. Shutemov 	spin_lock(ptl);
3218a5516438SAndi Kleen 	size = i_size_read(mapping->host) >> huge_page_shift(h);
32194c887265SAdam Litke 	if (idx >= size)
32204c887265SAdam Litke 		goto backout;
32214c887265SAdam Litke 
322283c54070SNick Piggin 	ret = 0;
32237f2e9525SGerald Schaefer 	if (!huge_pte_none(huge_ptep_get(ptep)))
32244c887265SAdam Litke 		goto backout;
32254c887265SAdam Litke 
322607443a85SJoonsoo Kim 	if (anon_rmap) {
322707443a85SJoonsoo Kim 		ClearPagePrivate(page);
3228409eb8c2SHillf Danton 		hugepage_add_new_anon_rmap(page, vma, address);
3229ac714904SChoi Gi-yong 	} else
3230409eb8c2SHillf Danton 		page_dup_rmap(page);
32311e8f889bSDavid Gibson 	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
32321e8f889bSDavid Gibson 				&& (vma->vm_flags & VM_SHARED)));
32331e8f889bSDavid Gibson 	set_huge_pte_at(mm, address, ptep, new_pte);
32341e8f889bSDavid Gibson 
3235788c7df4SHugh Dickins 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
32361e8f889bSDavid Gibson 		/* Optimization, do the COW without a second fault */
3237cb900f41SKirill A. Shutemov 		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
32381e8f889bSDavid Gibson 	}
32391e8f889bSDavid Gibson 
3240cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
32414c887265SAdam Litke 	unlock_page(page);
32424c887265SAdam Litke out:
3243ac9b9c66SHugh Dickins 	return ret;
32444c887265SAdam Litke 
32454c887265SAdam Litke backout:
3246cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
32472b26736cSAndy Whitcroft backout_unlocked:
32484c887265SAdam Litke 	unlock_page(page);
32494c887265SAdam Litke 	put_page(page);
32504c887265SAdam Litke 	goto out;
3251ac9b9c66SHugh Dickins }
3252ac9b9c66SHugh Dickins 
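/*
 * Error handling in hugetlb_no_page() above is layered: "backout" drops the
 * page table lock first and then falls through to "backout_unlocked", which
 * unlocks and releases the newly allocated (or looked-up) page before
 * returning through "out".
 */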
32538382d914SDavidlohr Bueso #ifdef CONFIG_SMP
32548382d914SDavidlohr Bueso static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
32558382d914SDavidlohr Bueso 			    struct vm_area_struct *vma,
32568382d914SDavidlohr Bueso 			    struct address_space *mapping,
32578382d914SDavidlohr Bueso 			    pgoff_t idx, unsigned long address)
32588382d914SDavidlohr Bueso {
32598382d914SDavidlohr Bueso 	unsigned long key[2];
32608382d914SDavidlohr Bueso 	u32 hash;
32618382d914SDavidlohr Bueso 
32628382d914SDavidlohr Bueso 	if (vma->vm_flags & VM_SHARED) {
32638382d914SDavidlohr Bueso 		key[0] = (unsigned long) mapping;
32648382d914SDavidlohr Bueso 		key[1] = idx;
32658382d914SDavidlohr Bueso 	} else {
32668382d914SDavidlohr Bueso 		key[0] = (unsigned long) mm;
32678382d914SDavidlohr Bueso 		key[1] = address >> huge_page_shift(h);
32688382d914SDavidlohr Bueso 	}
32698382d914SDavidlohr Bueso 
32708382d914SDavidlohr Bueso 	hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
32718382d914SDavidlohr Bueso 
32728382d914SDavidlohr Bueso 	return hash & (num_fault_mutexes - 1);
32738382d914SDavidlohr Bueso }
32748382d914SDavidlohr Bueso #else
32758382d914SDavidlohr Bueso /*
32768382d914SDavidlohr Bueso  * For uniprocessor systems we always use a single mutex, so just
32778382d914SDavidlohr Bueso  * return 0 and avoid the hashing overhead.
32788382d914SDavidlohr Bueso  */
32798382d914SDavidlohr Bueso static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
32808382d914SDavidlohr Bueso 			    struct vm_area_struct *vma,
32818382d914SDavidlohr Bueso 			    struct address_space *mapping,
32828382d914SDavidlohr Bueso 			    pgoff_t idx, unsigned long address)
32838382d914SDavidlohr Bueso {
32848382d914SDavidlohr Bueso 	return 0;
32858382d914SDavidlohr Bueso }
32868382d914SDavidlohr Bueso #endif
32878382d914SDavidlohr Bueso 
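/*
 * A minimal sketch of the sizing this hash relies on (the actual setup lives
 * in the init code earlier in this file): num_fault_mutexes is assumed to be
 * a power of two, so "hash & (num_fault_mutexes - 1)" picks a mutex without
 * a modulo, e.g.:
 *
 *	num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
 *	mutex_lock(&htlb_fault_mutex_table[hash & (num_fault_mutexes - 1)]);
 */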
328886e5216fSAdam Litke int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3289788c7df4SHugh Dickins 			unsigned long address, unsigned int flags)
329086e5216fSAdam Litke {
32918382d914SDavidlohr Bueso 	pte_t *ptep, entry;
3292cb900f41SKirill A. Shutemov 	spinlock_t *ptl;
32931e8f889bSDavid Gibson 	int ret;
32948382d914SDavidlohr Bueso 	u32 hash;
32958382d914SDavidlohr Bueso 	pgoff_t idx;
32960fe6e20bSNaoya Horiguchi 	struct page *page = NULL;
329757303d80SAndy Whitcroft 	struct page *pagecache_page = NULL;
3298a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
32998382d914SDavidlohr Bueso 	struct address_space *mapping;
33000f792cf9SNaoya Horiguchi 	int need_wait_lock = 0;
330186e5216fSAdam Litke 
33021e16a539SKAMEZAWA Hiroyuki 	address &= huge_page_mask(h);
33031e16a539SKAMEZAWA Hiroyuki 
3304fd6a03edSNaoya Horiguchi 	ptep = huge_pte_offset(mm, address);
3305fd6a03edSNaoya Horiguchi 	if (ptep) {
3306fd6a03edSNaoya Horiguchi 		entry = huge_ptep_get(ptep);
3307290408d4SNaoya Horiguchi 		if (unlikely(is_hugetlb_entry_migration(entry))) {
3308cb900f41SKirill A. Shutemov 			migration_entry_wait_huge(vma, mm, ptep);
3309290408d4SNaoya Horiguchi 			return 0;
3310290408d4SNaoya Horiguchi 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
3311aa50d3a7SAndi Kleen 			return VM_FAULT_HWPOISON_LARGE |
3312972dc4deSAneesh Kumar K.V 				VM_FAULT_SET_HINDEX(hstate_index(h));
3313fd6a03edSNaoya Horiguchi 	}
3314fd6a03edSNaoya Horiguchi 
3315a5516438SAndi Kleen 	ptep = huge_pte_alloc(mm, address, huge_page_size(h));
331686e5216fSAdam Litke 	if (!ptep)
331786e5216fSAdam Litke 		return VM_FAULT_OOM;
331886e5216fSAdam Litke 
33198382d914SDavidlohr Bueso 	mapping = vma->vm_file->f_mapping;
33208382d914SDavidlohr Bueso 	idx = vma_hugecache_offset(h, vma, address);
33218382d914SDavidlohr Bueso 
33223935baa9SDavid Gibson 	/*
33233935baa9SDavid Gibson 	 * Serialize hugepage allocation and instantiation, so that we don't
33243935baa9SDavid Gibson 	 * get spurious allocation failures if two CPUs race to instantiate
33253935baa9SDavid Gibson 	 * the same page in the page cache.
33263935baa9SDavid Gibson 	 */
33278382d914SDavidlohr Bueso 	hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
33288382d914SDavidlohr Bueso 	mutex_lock(&htlb_fault_mutex_table[hash]);
33298382d914SDavidlohr Bueso 
33307f2e9525SGerald Schaefer 	entry = huge_ptep_get(ptep);
33317f2e9525SGerald Schaefer 	if (huge_pte_none(entry)) {
33328382d914SDavidlohr Bueso 		ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
3333b4d1d99fSDavid Gibson 		goto out_mutex;
33343935baa9SDavid Gibson 	}
333586e5216fSAdam Litke 
333683c54070SNick Piggin 	ret = 0;
33371e8f889bSDavid Gibson 
333857303d80SAndy Whitcroft 	/*
33390f792cf9SNaoya Horiguchi 	 * entry could be a migration/hwpoison entry at this point, so this
33400f792cf9SNaoya Horiguchi 	 * check prevents the kernel from going below on the assumption that we
33410f792cf9SNaoya Horiguchi 	 * have an active hugepage in the pagecache. This goto expects a second
33420f792cf9SNaoya Horiguchi 	 * page fault, where the is_hugetlb_entry_(migration|hwpoisoned) check
33430f792cf9SNaoya Horiguchi 	 * will properly handle it.
33440f792cf9SNaoya Horiguchi 	 */
33450f792cf9SNaoya Horiguchi 	if (!pte_present(entry))
33460f792cf9SNaoya Horiguchi 		goto out_mutex;
33470f792cf9SNaoya Horiguchi 
33480f792cf9SNaoya Horiguchi 	/*
334957303d80SAndy Whitcroft 	 * If we are going to COW the mapping later, we examine the pending
335057303d80SAndy Whitcroft 	 * reservations for this page now. This will ensure that any
335157303d80SAndy Whitcroft 	 * allocations necessary to record that reservation occur outside the
335257303d80SAndy Whitcroft 	 * spinlock. For private mappings, we also lookup the pagecache
335357303d80SAndy Whitcroft 	 * page now as it is used to determine if a reservation has been
335457303d80SAndy Whitcroft 	 * consumed.
335557303d80SAndy Whitcroft 	 */
3356106c992aSGerald Schaefer 	if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
33572b26736cSAndy Whitcroft 		if (vma_needs_reservation(h, vma, address) < 0) {
33582b26736cSAndy Whitcroft 			ret = VM_FAULT_OOM;
3359b4d1d99fSDavid Gibson 			goto out_mutex;
33602b26736cSAndy Whitcroft 		}
336157303d80SAndy Whitcroft 
3362f83a275dSMel Gorman 		if (!(vma->vm_flags & VM_MAYSHARE))
336357303d80SAndy Whitcroft 			pagecache_page = hugetlbfs_pagecache_page(h,
336457303d80SAndy Whitcroft 								vma, address);
336557303d80SAndy Whitcroft 	}
336657303d80SAndy Whitcroft 
33670f792cf9SNaoya Horiguchi 	ptl = huge_pte_lock(h, mm, ptep);
33680fe6e20bSNaoya Horiguchi 
33691e8f889bSDavid Gibson 	/* Check for a racing update before calling hugetlb_cow */
3370b4d1d99fSDavid Gibson 	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
3371cb900f41SKirill A. Shutemov 		goto out_ptl;
3372b4d1d99fSDavid Gibson 
33730f792cf9SNaoya Horiguchi 	/*
33740f792cf9SNaoya Horiguchi 	 * hugetlb_cow() requires page locks of pte_page(entry) and
33750f792cf9SNaoya Horiguchi 	 * pagecache_page, so here we need to take the former one
33760f792cf9SNaoya Horiguchi 	 * when page != pagecache_page or !pagecache_page.
33770f792cf9SNaoya Horiguchi 	 */
33780f792cf9SNaoya Horiguchi 	page = pte_page(entry);
33790f792cf9SNaoya Horiguchi 	if (page != pagecache_page)
33800f792cf9SNaoya Horiguchi 		if (!trylock_page(page)) {
33810f792cf9SNaoya Horiguchi 			need_wait_lock = 1;
33820f792cf9SNaoya Horiguchi 			goto out_ptl;
33830f792cf9SNaoya Horiguchi 		}
33840f792cf9SNaoya Horiguchi 
33850f792cf9SNaoya Horiguchi 	get_page(page);
3386b4d1d99fSDavid Gibson 
3387788c7df4SHugh Dickins 	if (flags & FAULT_FLAG_WRITE) {
3388106c992aSGerald Schaefer 		if (!huge_pte_write(entry)) {
338957303d80SAndy Whitcroft 			ret = hugetlb_cow(mm, vma, address, ptep, entry,
3390cb900f41SKirill A. Shutemov 					pagecache_page, ptl);
33910f792cf9SNaoya Horiguchi 			goto out_put_page;
3392b4d1d99fSDavid Gibson 		}
3393106c992aSGerald Schaefer 		entry = huge_pte_mkdirty(entry);
3394b4d1d99fSDavid Gibson 	}
3395b4d1d99fSDavid Gibson 	entry = pte_mkyoung(entry);
3396788c7df4SHugh Dickins 	if (huge_ptep_set_access_flags(vma, address, ptep, entry,
3397788c7df4SHugh Dickins 						flags & FAULT_FLAG_WRITE))
33984b3073e1SRussell King 		update_mmu_cache(vma, address, ptep);
33990f792cf9SNaoya Horiguchi out_put_page:
34000f792cf9SNaoya Horiguchi 	if (page != pagecache_page)
34010f792cf9SNaoya Horiguchi 		unlock_page(page);
34020f792cf9SNaoya Horiguchi 	put_page(page);
3403cb900f41SKirill A. Shutemov out_ptl:
3404cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
340557303d80SAndy Whitcroft 
340657303d80SAndy Whitcroft 	if (pagecache_page) {
340757303d80SAndy Whitcroft 		unlock_page(pagecache_page);
340857303d80SAndy Whitcroft 		put_page(pagecache_page);
340957303d80SAndy Whitcroft 	}
3410b4d1d99fSDavid Gibson out_mutex:
34118382d914SDavidlohr Bueso 	mutex_unlock(&htlb_fault_mutex_table[hash]);
34120f792cf9SNaoya Horiguchi 	/*
34130f792cf9SNaoya Horiguchi 	 * Generally it's safe to hold a refcount while waiting for the page
34140f792cf9SNaoya Horiguchi 	 * lock. But here we only wait to defer the next page fault and avoid a
34150f792cf9SNaoya Horiguchi 	 * busy loop, and the page is not used after being unlocked before we
34160f792cf9SNaoya Horiguchi 	 * return from the current page fault. So we are safe from accessing a
34170f792cf9SNaoya Horiguchi 	 * freed page, even if we wait here without taking a refcount.
34180f792cf9SNaoya Horiguchi 	 */
34190f792cf9SNaoya Horiguchi 	if (need_wait_lock)
34200f792cf9SNaoya Horiguchi 		wait_on_page_locked(page);
34211e8f889bSDavid Gibson 	return ret;
342286e5216fSAdam Litke }
342386e5216fSAdam Litke 
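/*
 * hugetlb_fault() above is the entry point for every fault on a hugetlb VMA.
 * A minimal userspace trigger, assuming a 2MB default hugepage size and at
 * least one free page in the pool (MAP_HUGETLB is the anonymous-hugepage
 * mmap flag):
 *
 *	#include <sys/mman.h>
 *	#include <string.h>
 *
 *	size_t len = 2UL << 20;
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
 *	if (p != MAP_FAILED)
 *		memset(p, 0, len);	(first touch ends up in hugetlb_no_page())
 */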
342428a35716SMichel Lespinasse long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
342563551ae0SDavid Gibson 			 struct page **pages, struct vm_area_struct **vmas,
342628a35716SMichel Lespinasse 			 unsigned long *position, unsigned long *nr_pages,
342728a35716SMichel Lespinasse 			 long i, unsigned int flags)
342863551ae0SDavid Gibson {
3429d5d4b0aaSChen, Kenneth W 	unsigned long pfn_offset;
3430d5d4b0aaSChen, Kenneth W 	unsigned long vaddr = *position;
343128a35716SMichel Lespinasse 	unsigned long remainder = *nr_pages;
3432a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
343363551ae0SDavid Gibson 
343463551ae0SDavid Gibson 	while (vaddr < vma->vm_end && remainder) {
343563551ae0SDavid Gibson 		pte_t *pte;
3436cb900f41SKirill A. Shutemov 		spinlock_t *ptl = NULL;
34372a15efc9SHugh Dickins 		int absent;
343863551ae0SDavid Gibson 		struct page *page;
343963551ae0SDavid Gibson 
34404c887265SAdam Litke 		/*
344102057967SDavid Rientjes 		 * If we have a pending SIGKILL, don't keep faulting pages and
344202057967SDavid Rientjes 		 * potentially allocating memory.
344302057967SDavid Rientjes 		 */
344402057967SDavid Rientjes 		if (unlikely(fatal_signal_pending(current))) {
344502057967SDavid Rientjes 			remainder = 0;
344602057967SDavid Rientjes 			break;
344702057967SDavid Rientjes 		}
344802057967SDavid Rientjes 
344902057967SDavid Rientjes 		/*
34504c887265SAdam Litke 		 * Some archs (sparc64, sh*) have multiple pte_t entries
34512a15efc9SHugh Dickins 		 * for each hugepage.  We have to make sure we get the
34524c887265SAdam Litke 		 * first, for the page indexing below to work.
3453cb900f41SKirill A. Shutemov 		 *
3454cb900f41SKirill A. Shutemov 		 * Note that page table lock is not held when pte is null.
34554c887265SAdam Litke 		 */
3456a5516438SAndi Kleen 		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
3457cb900f41SKirill A. Shutemov 		if (pte)
3458cb900f41SKirill A. Shutemov 			ptl = huge_pte_lock(h, mm, pte);
34592a15efc9SHugh Dickins 		absent = !pte || huge_pte_none(huge_ptep_get(pte));
346063551ae0SDavid Gibson 
34612a15efc9SHugh Dickins 		/*
34622a15efc9SHugh Dickins 		 * When coredumping, it suits get_dump_page if we just return
34633ae77f43SHugh Dickins 		 * an error where there's an empty slot with no huge pagecache
34643ae77f43SHugh Dickins 		 * to back it.  This way, we avoid allocating a hugepage, and
34653ae77f43SHugh Dickins 		 * the sparse dumpfile avoids allocating disk blocks, but its
34663ae77f43SHugh Dickins 		 * huge holes still show up with zeroes where they need to be.
34672a15efc9SHugh Dickins 		 */
34683ae77f43SHugh Dickins 		if (absent && (flags & FOLL_DUMP) &&
34693ae77f43SHugh Dickins 		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
3470cb900f41SKirill A. Shutemov 			if (pte)
3471cb900f41SKirill A. Shutemov 				spin_unlock(ptl);
34722a15efc9SHugh Dickins 			remainder = 0;
34732a15efc9SHugh Dickins 			break;
34742a15efc9SHugh Dickins 		}
34752a15efc9SHugh Dickins 
34769cc3a5bdSNaoya Horiguchi 		/*
34779cc3a5bdSNaoya Horiguchi 		 * We need to call hugetlb_fault for both hugepages under migration
34789cc3a5bdSNaoya Horiguchi 		 * (in which case hugetlb_fault waits for the migration) and
34799cc3a5bdSNaoya Horiguchi 		 * hwpoisoned hugepages (in which case we need to prevent the
34809cc3a5bdSNaoya Horiguchi 		 * caller from accessing them). To do this, we use is_swap_pte
34819cc3a5bdSNaoya Horiguchi 		 * here instead of is_hugetlb_entry_migration and
34829cc3a5bdSNaoya Horiguchi 		 * is_hugetlb_entry_hwpoisoned, because it simply covers both
34839cc3a5bdSNaoya Horiguchi 		 * cases, and because we can't follow correct pages directly
34849cc3a5bdSNaoya Horiguchi 		 * from any kind of swap entry.
34859cc3a5bdSNaoya Horiguchi 		 */
34869cc3a5bdSNaoya Horiguchi 		if (absent || is_swap_pte(huge_ptep_get(pte)) ||
3487106c992aSGerald Schaefer 		    ((flags & FOLL_WRITE) &&
3488106c992aSGerald Schaefer 		      !huge_pte_write(huge_ptep_get(pte)))) {
34894c887265SAdam Litke 			int ret;
34904c887265SAdam Litke 
3491cb900f41SKirill A. Shutemov 			if (pte)
3492cb900f41SKirill A. Shutemov 				spin_unlock(ptl);
34932a15efc9SHugh Dickins 			ret = hugetlb_fault(mm, vma, vaddr,
34942a15efc9SHugh Dickins 				(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
3495a89182c7SAdam Litke 			if (!(ret & VM_FAULT_ERROR))
34964c887265SAdam Litke 				continue;
34974c887265SAdam Litke 
34981c59827dSHugh Dickins 			remainder = 0;
34991c59827dSHugh Dickins 			break;
35001c59827dSHugh Dickins 		}
350163551ae0SDavid Gibson 
3502a5516438SAndi Kleen 		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
35037f2e9525SGerald Schaefer 		page = pte_page(huge_ptep_get(pte));
3504d5d4b0aaSChen, Kenneth W same_page:
3505d6692183SChen, Kenneth W 		if (pages) {
350669d177c2SAndy Whitcroft 			pages[i] = mem_map_offset(page, pfn_offset);
3507a0368d4eSAndrea Arcangeli 			get_page_foll(pages[i]);
3508d6692183SChen, Kenneth W 		}
350963551ae0SDavid Gibson 
351063551ae0SDavid Gibson 		if (vmas)
351163551ae0SDavid Gibson 			vmas[i] = vma;
351263551ae0SDavid Gibson 
351363551ae0SDavid Gibson 		vaddr += PAGE_SIZE;
3514d5d4b0aaSChen, Kenneth W 		++pfn_offset;
351563551ae0SDavid Gibson 		--remainder;
351663551ae0SDavid Gibson 		++i;
3517d5d4b0aaSChen, Kenneth W 		if (vaddr < vma->vm_end && remainder &&
3518a5516438SAndi Kleen 				pfn_offset < pages_per_huge_page(h)) {
3519d5d4b0aaSChen, Kenneth W 			/*
3520d5d4b0aaSChen, Kenneth W 			 * We use pfn_offset to avoid touching the pageframes
3521d5d4b0aaSChen, Kenneth W 			 * of this compound page.
3522d5d4b0aaSChen, Kenneth W 			 */
3523d5d4b0aaSChen, Kenneth W 			goto same_page;
3524d5d4b0aaSChen, Kenneth W 		}
3525cb900f41SKirill A. Shutemov 		spin_unlock(ptl);
352663551ae0SDavid Gibson 	}
352728a35716SMichel Lespinasse 	*nr_pages = remainder;
352863551ae0SDavid Gibson 	*position = vaddr;
352963551ae0SDavid Gibson 
35302a15efc9SHugh Dickins 	return i ? i : -EFAULT;
353163551ae0SDavid Gibson }
35328f860591SZhang, Yanmin 
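/*
 * follow_hugetlb_page() above returns the number of pages it covered (i), or
 * -EFAULT if it covered none; *position and *nr_pages are advanced so the
 * get_user_pages() path that calls it can resume from where it stopped.
 */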
35337da4d641SPeter Zijlstra unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
35348f860591SZhang, Yanmin 		unsigned long address, unsigned long end, pgprot_t newprot)
35358f860591SZhang, Yanmin {
35368f860591SZhang, Yanmin 	struct mm_struct *mm = vma->vm_mm;
35378f860591SZhang, Yanmin 	unsigned long start = address;
35388f860591SZhang, Yanmin 	pte_t *ptep;
35398f860591SZhang, Yanmin 	pte_t pte;
3540a5516438SAndi Kleen 	struct hstate *h = hstate_vma(vma);
35417da4d641SPeter Zijlstra 	unsigned long pages = 0;
35428f860591SZhang, Yanmin 
35438f860591SZhang, Yanmin 	BUG_ON(address >= end);
35448f860591SZhang, Yanmin 	flush_cache_range(vma, address, end);
35458f860591SZhang, Yanmin 
3546a5338093SRik van Riel 	mmu_notifier_invalidate_range_start(mm, start, end);
354783cde9e8SDavidlohr Bueso 	i_mmap_lock_write(vma->vm_file->f_mapping);
3548a5516438SAndi Kleen 	for (; address < end; address += huge_page_size(h)) {
3549cb900f41SKirill A. Shutemov 		spinlock_t *ptl;
35508f860591SZhang, Yanmin 		ptep = huge_pte_offset(mm, address);
35518f860591SZhang, Yanmin 		if (!ptep)
35528f860591SZhang, Yanmin 			continue;
3553cb900f41SKirill A. Shutemov 		ptl = huge_pte_lock(h, mm, ptep);
35547da4d641SPeter Zijlstra 		if (huge_pmd_unshare(mm, &address, ptep)) {
35557da4d641SPeter Zijlstra 			pages++;
3556cb900f41SKirill A. Shutemov 			spin_unlock(ptl);
355739dde65cSChen, Kenneth W 			continue;
35587da4d641SPeter Zijlstra 		}
3559a8bda28dSNaoya Horiguchi 		pte = huge_ptep_get(ptep);
3560a8bda28dSNaoya Horiguchi 		if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
3561a8bda28dSNaoya Horiguchi 			spin_unlock(ptl);
3562a8bda28dSNaoya Horiguchi 			continue;
3563a8bda28dSNaoya Horiguchi 		}
3564a8bda28dSNaoya Horiguchi 		if (unlikely(is_hugetlb_entry_migration(pte))) {
3565a8bda28dSNaoya Horiguchi 			swp_entry_t entry = pte_to_swp_entry(pte);
3566a8bda28dSNaoya Horiguchi 
3567a8bda28dSNaoya Horiguchi 			if (is_write_migration_entry(entry)) {
3568a8bda28dSNaoya Horiguchi 				pte_t newpte;
3569a8bda28dSNaoya Horiguchi 
3570a8bda28dSNaoya Horiguchi 				make_migration_entry_read(&entry);
3571a8bda28dSNaoya Horiguchi 				newpte = swp_entry_to_pte(entry);
3572a8bda28dSNaoya Horiguchi 				set_huge_pte_at(mm, address, ptep, newpte);
3573a8bda28dSNaoya Horiguchi 				pages++;
3574a8bda28dSNaoya Horiguchi 			}
3575a8bda28dSNaoya Horiguchi 			spin_unlock(ptl);
3576a8bda28dSNaoya Horiguchi 			continue;
3577a8bda28dSNaoya Horiguchi 		}
3578a8bda28dSNaoya Horiguchi 		if (!huge_pte_none(pte)) {
35798f860591SZhang, Yanmin 			pte = huge_ptep_get_and_clear(mm, address, ptep);
3580106c992aSGerald Schaefer 			pte = pte_mkhuge(huge_pte_modify(pte, newprot));
3581be7517d6STony Lu 			pte = arch_make_huge_pte(pte, vma, NULL, 0);
35828f860591SZhang, Yanmin 			set_huge_pte_at(mm, address, ptep, pte);
35837da4d641SPeter Zijlstra 			pages++;
35848f860591SZhang, Yanmin 		}
3585cb900f41SKirill A. Shutemov 		spin_unlock(ptl);
35868f860591SZhang, Yanmin 	}
3587d833352aSMel Gorman 	/*
3588c8c06efaSDavidlohr Bueso 	 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
3589d833352aSMel Gorman 	 * may have cleared our pud entry and done put_page on the page table:
3590c8c06efaSDavidlohr Bueso 	 * once we release i_mmap_rwsem, another task can do the final put_page
3591d833352aSMel Gorman 	 * and that page table be reused and filled with junk.
3592d833352aSMel Gorman 	 */
35938f860591SZhang, Yanmin 	flush_tlb_range(vma, start, end);
359434ee645eSJoerg Roedel 	mmu_notifier_invalidate_range(mm, start, end);
359583cde9e8SDavidlohr Bueso 	i_mmap_unlock_write(vma->vm_file->f_mapping);
3596a5338093SRik van Riel 	mmu_notifier_invalidate_range_end(mm, start, end);
35977da4d641SPeter Zijlstra 
35987da4d641SPeter Zijlstra 	return pages << h->order;
35998f860591SZhang, Yanmin }
36008f860591SZhang, Yanmin 
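/*
 * Note that hugetlb_change_protection() returns its count in base pages
 * (PAGE_SIZE units): "pages" counts changed or unshared huge entries and is
 * shifted by h->order so the caller sees the same units as for normal page
 * tables.
 */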
3601a1e78772SMel Gorman int hugetlb_reserve_pages(struct inode *inode,
3602a1e78772SMel Gorman 					long from, long to,
36035a6fe125SMel Gorman 					struct vm_area_struct *vma,
3604ca16d140SKOSAKI Motohiro 					vm_flags_t vm_flags)
3605e4e574b7SAdam Litke {
360617c9d12eSMel Gorman 	long ret, chg;
3607a5516438SAndi Kleen 	struct hstate *h = hstate_inode(inode);
360890481622SDavid Gibson 	struct hugepage_subpool *spool = subpool_inode(inode);
36099119a41eSJoonsoo Kim 	struct resv_map *resv_map;
36101c5ecae3SMike Kravetz 	long gbl_reserve;
3611e4e574b7SAdam Litke 
3612a1e78772SMel Gorman 	/*
361317c9d12eSMel Gorman 	 * Only apply hugepage reservation if asked. At fault time, an
361417c9d12eSMel Gorman 	 * attempt will be made for VM_NORESERVE mappings to allocate a page
361590481622SDavid Gibson 	 * without using reserves.
361617c9d12eSMel Gorman 	 */
3617ca16d140SKOSAKI Motohiro 	if (vm_flags & VM_NORESERVE)
361817c9d12eSMel Gorman 		return 0;
361917c9d12eSMel Gorman 
362017c9d12eSMel Gorman 	/*
3621a1e78772SMel Gorman 	 * Shared mappings base their reservation on the number of pages that
3622a1e78772SMel Gorman 	 * are already allocated on behalf of the file. Private mappings need
3623a1e78772SMel Gorman 	 * to reserve the full area even if read-only as mprotect() may be
3624a1e78772SMel Gorman 	 * called to make the mapping read-write. Assume !vma is a shm mapping
3625a1e78772SMel Gorman 	 */
36269119a41eSJoonsoo Kim 	if (!vma || vma->vm_flags & VM_MAYSHARE) {
36274e35f483SJoonsoo Kim 		resv_map = inode_resv_map(inode);
36289119a41eSJoonsoo Kim 
36291406ec9bSJoonsoo Kim 		chg = region_chg(resv_map, from, to);
36309119a41eSJoonsoo Kim 
36319119a41eSJoonsoo Kim 	} else {
36329119a41eSJoonsoo Kim 		resv_map = resv_map_alloc();
36335a6fe125SMel Gorman 		if (!resv_map)
36345a6fe125SMel Gorman 			return -ENOMEM;
36355a6fe125SMel Gorman 
363617c9d12eSMel Gorman 		chg = to - from;
363717c9d12eSMel Gorman 
36385a6fe125SMel Gorman 		set_vma_resv_map(vma, resv_map);
36395a6fe125SMel Gorman 		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
36405a6fe125SMel Gorman 	}
36415a6fe125SMel Gorman 
3642c50ac050SDave Hansen 	if (chg < 0) {
3643c50ac050SDave Hansen 		ret = chg;
3644c50ac050SDave Hansen 		goto out_err;
3645c50ac050SDave Hansen 	}
364617c9d12eSMel Gorman 
36471c5ecae3SMike Kravetz 	/*
36481c5ecae3SMike Kravetz 	 * There must be enough pages in the subpool for the mapping. If
36491c5ecae3SMike Kravetz 	 * the subpool has a minimum size, there may be some global
36501c5ecae3SMike Kravetz 	 * reservations already in place (gbl_reserve).
36511c5ecae3SMike Kravetz 	 */
36521c5ecae3SMike Kravetz 	gbl_reserve = hugepage_subpool_get_pages(spool, chg);
36531c5ecae3SMike Kravetz 	if (gbl_reserve < 0) {
3654c50ac050SDave Hansen 		ret = -ENOSPC;
3655c50ac050SDave Hansen 		goto out_err;
3656c50ac050SDave Hansen 	}
365717c9d12eSMel Gorman 
365817c9d12eSMel Gorman 	/*
365917c9d12eSMel Gorman 	 * Check that enough hugepages are available for the reservation.
366090481622SDavid Gibson 	 * Hand the pages back to the subpool if there are not.
366117c9d12eSMel Gorman 	 */
36621c5ecae3SMike Kravetz 	ret = hugetlb_acct_memory(h, gbl_reserve);
366317c9d12eSMel Gorman 	if (ret < 0) {
36641c5ecae3SMike Kravetz 		/* put back original number of pages, chg */
36651c5ecae3SMike Kravetz 		(void)hugepage_subpool_put_pages(spool, chg);
3666c50ac050SDave Hansen 		goto out_err;
366717c9d12eSMel Gorman 	}
366817c9d12eSMel Gorman 
366917c9d12eSMel Gorman 	/*
367017c9d12eSMel Gorman 	 * Account for the reservations made. Shared mappings record regions
367117c9d12eSMel Gorman 	 * that have reservations as they are shared by multiple VMAs.
367217c9d12eSMel Gorman 	 * When the last VMA disappears, the region map says how much
367317c9d12eSMel Gorman 	 * the reservation was and the page cache tells how much of
367417c9d12eSMel Gorman 	 * the reservation was consumed. Private mappings are per-VMA and
367517c9d12eSMel Gorman 	 * only the consumed reservations are tracked. When the VMA
367617c9d12eSMel Gorman 	 * disappears, the original reservation is the VMA size and the
367717c9d12eSMel Gorman 	 * consumed reservations are stored in the map. Hence, nothing
367817c9d12eSMel Gorman 	 * else has to be done for private mappings here
367917c9d12eSMel Gorman 	 */
3680f83a275dSMel Gorman 	if (!vma || vma->vm_flags & VM_MAYSHARE)
36811406ec9bSJoonsoo Kim 		region_add(resv_map, from, to);
3682a43a8c39SChen, Kenneth W 	return 0;
3683c50ac050SDave Hansen out_err:
3684f031dd27SJoonsoo Kim 	if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
3685f031dd27SJoonsoo Kim 		kref_put(&resv_map->refs, resv_map_release);
3686c50ac050SDave Hansen 	return ret;
3687a43a8c39SChen, Kenneth W }
3688a43a8c39SChen, Kenneth W 
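/*
 * hugetlb_reserve_pages() above works in hugepage units: "from"/"to" are
 * hugepage indices and "chg" is the number of hugepages being reserved
 * (e.g. by an mmap() of a hugetlbfs file).  When the subpool carries a
 * minimum reservation, gbl_reserve can be smaller than chg, and only that
 * smaller amount is charged to the global pool via hugetlb_acct_memory().
 */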
3689a43a8c39SChen, Kenneth W void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
3690a43a8c39SChen, Kenneth W {
3691a5516438SAndi Kleen 	struct hstate *h = hstate_inode(inode);
36924e35f483SJoonsoo Kim 	struct resv_map *resv_map = inode_resv_map(inode);
36939119a41eSJoonsoo Kim 	long chg = 0;
369490481622SDavid Gibson 	struct hugepage_subpool *spool = subpool_inode(inode);
36951c5ecae3SMike Kravetz 	long gbl_reserve;
369645c682a6SKen Chen 
36979119a41eSJoonsoo Kim 	if (resv_map)
36981406ec9bSJoonsoo Kim 		chg = region_truncate(resv_map, offset);
369945c682a6SKen Chen 	spin_lock(&inode->i_lock);
3700e4c6f8beSEric Sandeen 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
370145c682a6SKen Chen 	spin_unlock(&inode->i_lock);
370245c682a6SKen Chen 
37031c5ecae3SMike Kravetz 	/*
37041c5ecae3SMike Kravetz 	 * If the subpool has a minimum size, the number of global
37051c5ecae3SMike Kravetz 	 * reservations to be released may be adjusted.
37061c5ecae3SMike Kravetz 	 */
37071c5ecae3SMike Kravetz 	gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
37081c5ecae3SMike Kravetz 	hugetlb_acct_memory(h, -gbl_reserve);
3709a43a8c39SChen, Kenneth W }
371093f70f90SNaoya Horiguchi 
37113212b535SSteve Capper #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
37123212b535SSteve Capper static unsigned long page_table_shareable(struct vm_area_struct *svma,
37133212b535SSteve Capper 				struct vm_area_struct *vma,
37143212b535SSteve Capper 				unsigned long addr, pgoff_t idx)
37153212b535SSteve Capper {
37163212b535SSteve Capper 	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
37173212b535SSteve Capper 				svma->vm_start;
37183212b535SSteve Capper 	unsigned long sbase = saddr & PUD_MASK;
37193212b535SSteve Capper 	unsigned long s_end = sbase + PUD_SIZE;
37203212b535SSteve Capper 
37213212b535SSteve Capper 	/* Allow segments to share if only one is marked locked */
37223212b535SSteve Capper 	unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
37233212b535SSteve Capper 	unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
37243212b535SSteve Capper 
37253212b535SSteve Capper 	/*
37263212b535SSteve Capper 	 * match the virtual addresses, permission and the alignment of the
37273212b535SSteve Capper 	 * page table page.
37283212b535SSteve Capper 	 */
37293212b535SSteve Capper 	if (pmd_index(addr) != pmd_index(saddr) ||
37303212b535SSteve Capper 	    vm_flags != svm_flags ||
37313212b535SSteve Capper 	    sbase < svma->vm_start || svma->vm_end < s_end)
37323212b535SSteve Capper 		return 0;
37333212b535SSteve Capper 
37343212b535SSteve Capper 	return saddr;
37353212b535SSteve Capper }
37363212b535SSteve Capper 
37373212b535SSteve Capper static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
37383212b535SSteve Capper {
37393212b535SSteve Capper 	unsigned long base = addr & PUD_MASK;
37403212b535SSteve Capper 	unsigned long end = base + PUD_SIZE;
37413212b535SSteve Capper 
37423212b535SSteve Capper 	/*
37433212b535SSteve Capper 	 * check on proper vm_flags and page table alignment
37443212b535SSteve Capper 	 */
37453212b535SSteve Capper 	if (vma->vm_flags & VM_MAYSHARE &&
37463212b535SSteve Capper 	    vma->vm_start <= base && end <= vma->vm_end)
37473212b535SSteve Capper 		return 1;
37483212b535SSteve Capper 	return 0;
37493212b535SSteve Capper }
37503212b535SSteve Capper 
37513212b535SSteve Capper /*
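/*
 * Taken together, the two helpers above only allow pmd sharing when both
 * VMAs are VM_MAYSHARE mappings of the same file range, their flags match
 * (ignoring VM_LOCKED), and a full PUD_SIZE-aligned window fits inside each
 * VMA, so the shared page table page covers identical mappings in both.
 */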
37523212b535SSteve Capper  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
37533212b535SSteve Capper  * Search for a shareable pmd page for hugetlb. In any case, this calls
37543212b535SSteve Capper  * pmd_alloc() and returns the corresponding pte. While this is not necessary
37553212b535SSteve Capper  * for the !shared pmd case because we can allocate the pmd later as well, it
37563212b535SSteve Capper  * makes the code much cleaner. pmd allocation is essential for the shared
3757c8c06efaSDavidlohr Bueso  * case because the pud has to be populated inside the same i_mmap_rwsem
37583212b535SSteve Capper  * section - otherwise racing tasks could either miss the sharing (see
37593212b535SSteve Capper  * huge_pte_offset) or select a bad pmd for sharing.
37603212b535SSteve Capper pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
37613212b535SSteve Capper {
37623212b535SSteve Capper 	struct vm_area_struct *vma = find_vma(mm, addr);
37633212b535SSteve Capper 	struct address_space *mapping = vma->vm_file->f_mapping;
37643212b535SSteve Capper 	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
37653212b535SSteve Capper 			vma->vm_pgoff;
37663212b535SSteve Capper 	struct vm_area_struct *svma;
37673212b535SSteve Capper 	unsigned long saddr;
37683212b535SSteve Capper 	pte_t *spte = NULL;
37693212b535SSteve Capper 	pte_t *pte;
3770cb900f41SKirill A. Shutemov 	spinlock_t *ptl;
37713212b535SSteve Capper 
37723212b535SSteve Capper 	if (!vma_shareable(vma, addr))
37733212b535SSteve Capper 		return (pte_t *)pmd_alloc(mm, pud, addr);
37743212b535SSteve Capper 
377583cde9e8SDavidlohr Bueso 	i_mmap_lock_write(mapping);
37763212b535SSteve Capper 	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
37773212b535SSteve Capper 		if (svma == vma)
37783212b535SSteve Capper 			continue;
37793212b535SSteve Capper 
37803212b535SSteve Capper 		saddr = page_table_shareable(svma, vma, addr, idx);
37813212b535SSteve Capper 		if (saddr) {
37823212b535SSteve Capper 			spte = huge_pte_offset(svma->vm_mm, saddr);
37833212b535SSteve Capper 			if (spte) {
3784dc6c9a35SKirill A. Shutemov 				mm_inc_nr_pmds(mm);
37853212b535SSteve Capper 				get_page(virt_to_page(spte));
37863212b535SSteve Capper 				break;
37873212b535SSteve Capper 			}
37883212b535SSteve Capper 		}
37893212b535SSteve Capper 	}
37903212b535SSteve Capper 
37913212b535SSteve Capper 	if (!spte)
37923212b535SSteve Capper 		goto out;
37933212b535SSteve Capper 
3794cb900f41SKirill A. Shutemov 	ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
3795cb900f41SKirill A. Shutemov 	spin_lock(ptl);
3796dc6c9a35SKirill A. Shutemov 	if (pud_none(*pud)) {
37973212b535SSteve Capper 		pud_populate(mm, pud,
37983212b535SSteve Capper 				(pmd_t *)((unsigned long)spte & PAGE_MASK));
3799dc6c9a35SKirill A. Shutemov 	} else {
38003212b535SSteve Capper 		put_page(virt_to_page(spte));
3801dc6c9a35SKirill A. Shutemov 		mm_inc_nr_pmds(mm);
3802dc6c9a35SKirill A. Shutemov 	}
3803cb900f41SKirill A. Shutemov 	spin_unlock(ptl);
38043212b535SSteve Capper out:
38053212b535SSteve Capper 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
380683cde9e8SDavidlohr Bueso 	i_mmap_unlock_write(mapping);
38073212b535SSteve Capper 	return pte;
38083212b535SSteve Capper }
38093212b535SSteve Capper 
38103212b535SSteve Capper /*
38113212b535SSteve Capper  * unmap huge page backed by shared pte.
38123212b535SSteve Capper  * Unmap a huge page backed by a shared pte.
38133212b535SSteve Capper  *
38143212b535SSteve Capper  * The hugetlb pte page is ref counted at the time of mapping.  If the pte is
38153212b535SSteve Capper  * shared (page_count > 1), unmapping is achieved by clearing the pud and
38163212b535SSteve Capper  * decrementing the ref count. If count == 1, the pte page is not shared.
38173212b535SSteve Capper  *
3818cb900f41SKirill A. Shutemov  * Called with the page table lock held.
38193212b535SSteve Capper  *
38203212b535SSteve Capper  * Returns: 1 if we successfully unmapped a shared pte page
38213212b535SSteve Capper  *	    0 if the underlying pte page is not shared, or it is the last user
38223212b535SSteve Capper int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
38233212b535SSteve Capper {
38243212b535SSteve Capper 	pgd_t *pgd = pgd_offset(mm, *addr);
38253212b535SSteve Capper 	pud_t *pud = pud_offset(pgd, *addr);
38263212b535SSteve Capper 
38273212b535SSteve Capper 	BUG_ON(page_count(virt_to_page(ptep)) == 0);
38283212b535SSteve Capper 	if (page_count(virt_to_page(ptep)) == 1)
38293212b535SSteve Capper 		return 0;
38303212b535SSteve Capper 
38313212b535SSteve Capper 	pud_clear(pud);
38323212b535SSteve Capper 	put_page(virt_to_page(ptep));
3833dc6c9a35SKirill A. Shutemov 	mm_dec_nr_pmds(mm);
38343212b535SSteve Capper 	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
38353212b535SSteve Capper 	return 1;
38363212b535SSteve Capper }
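/*
 * Note the *addr adjustment above: on success the entire PUD-sized range was
 * unmapped at once by pud_clear(), so *addr is set to the last hugepage of
 * that range and the caller's "address += huge_page_size" loop skips ahead
 * to the next PUD boundary instead of revisiting already-unmapped entries.
 */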
38379e5fc74cSSteve Capper #define want_pmd_share()	(1)
38389e5fc74cSSteve Capper #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
38399e5fc74cSSteve Capper pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
38409e5fc74cSSteve Capper {
38419e5fc74cSSteve Capper 	return NULL;
38429e5fc74cSSteve Capper }
3843e81f2d22SZhang Zhen 
3844e81f2d22SZhang Zhen int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
3845e81f2d22SZhang Zhen {
3846e81f2d22SZhang Zhen 	return 0;
3847e81f2d22SZhang Zhen }
38489e5fc74cSSteve Capper #define want_pmd_share()	(0)
38493212b535SSteve Capper #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
38503212b535SSteve Capper 
38519e5fc74cSSteve Capper #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
38529e5fc74cSSteve Capper pte_t *huge_pte_alloc(struct mm_struct *mm,
38539e5fc74cSSteve Capper 			unsigned long addr, unsigned long sz)
38549e5fc74cSSteve Capper {
38559e5fc74cSSteve Capper 	pgd_t *pgd;
38569e5fc74cSSteve Capper 	pud_t *pud;
38579e5fc74cSSteve Capper 	pte_t *pte = NULL;
38589e5fc74cSSteve Capper 
38599e5fc74cSSteve Capper 	pgd = pgd_offset(mm, addr);
38609e5fc74cSSteve Capper 	pud = pud_alloc(mm, pgd, addr);
38619e5fc74cSSteve Capper 	if (pud) {
38629e5fc74cSSteve Capper 		if (sz == PUD_SIZE) {
38639e5fc74cSSteve Capper 			pte = (pte_t *)pud;
38649e5fc74cSSteve Capper 		} else {
38659e5fc74cSSteve Capper 			BUG_ON(sz != PMD_SIZE);
38669e5fc74cSSteve Capper 			if (want_pmd_share() && pud_none(*pud))
38679e5fc74cSSteve Capper 				pte = huge_pmd_share(mm, addr, pud);
38689e5fc74cSSteve Capper 			else
38699e5fc74cSSteve Capper 				pte = (pte_t *)pmd_alloc(mm, pud, addr);
38709e5fc74cSSteve Capper 		}
38719e5fc74cSSteve Capper 	}
38729e5fc74cSSteve Capper 	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
38739e5fc74cSSteve Capper 
38749e5fc74cSSteve Capper 	return pte;
38759e5fc74cSSteve Capper }
38769e5fc74cSSteve Capper 
38779e5fc74cSSteve Capper pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
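/*
 * In the generic huge_pte_alloc() above, sz selects the page-table level:
 * PUD_SIZE hugepages use the pud entry itself as the "pte", while PMD_SIZE
 * hugepages get a pmd, shared with other mappings via huge_pmd_share() when
 * want_pmd_share() allows it and the pud is still empty.
 */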
38789e5fc74cSSteve Capper {
38799e5fc74cSSteve Capper 	pgd_t *pgd;
38809e5fc74cSSteve Capper 	pud_t *pud;
38819e5fc74cSSteve Capper 	pmd_t *pmd = NULL;
38829e5fc74cSSteve Capper 
38839e5fc74cSSteve Capper 	pgd = pgd_offset(mm, addr);
38849e5fc74cSSteve Capper 	if (pgd_present(*pgd)) {
38859e5fc74cSSteve Capper 		pud = pud_offset(pgd, addr);
38869e5fc74cSSteve Capper 		if (pud_present(*pud)) {
38879e5fc74cSSteve Capper 			if (pud_huge(*pud))
38889e5fc74cSSteve Capper 				return (pte_t *)pud;
38899e5fc74cSSteve Capper 			pmd = pmd_offset(pud, addr);
38909e5fc74cSSteve Capper 		}
38919e5fc74cSSteve Capper 	}
38929e5fc74cSSteve Capper 	return (pte_t *) pmd;
38939e5fc74cSSteve Capper }
38949e5fc74cSSteve Capper 
389561f77edaSNaoya Horiguchi #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
389661f77edaSNaoya Horiguchi 
389761f77edaSNaoya Horiguchi /*
389861f77edaSNaoya Horiguchi  * These functions are overridable if your architecture needs its own
389961f77edaSNaoya Horiguchi  * behavior.
390061f77edaSNaoya Horiguchi  */
390161f77edaSNaoya Horiguchi struct page * __weak
390261f77edaSNaoya Horiguchi follow_huge_addr(struct mm_struct *mm, unsigned long address,
390361f77edaSNaoya Horiguchi 			      int write)
390461f77edaSNaoya Horiguchi {
390561f77edaSNaoya Horiguchi 	return ERR_PTR(-EINVAL);
390661f77edaSNaoya Horiguchi }
390761f77edaSNaoya Horiguchi 
390861f77edaSNaoya Horiguchi struct page * __weak
39099e5fc74cSSteve Capper follow_huge_pmd(struct mm_struct *mm, unsigned long address,
3910e66f17ffSNaoya Horiguchi 		pmd_t *pmd, int flags)
39119e5fc74cSSteve Capper {
3912e66f17ffSNaoya Horiguchi 	struct page *page = NULL;
3913e66f17ffSNaoya Horiguchi 	spinlock_t *ptl;
3914e66f17ffSNaoya Horiguchi retry:
3915e66f17ffSNaoya Horiguchi 	ptl = pmd_lockptr(mm, pmd);
3916e66f17ffSNaoya Horiguchi 	spin_lock(ptl);
3917e66f17ffSNaoya Horiguchi 	/*
3918e66f17ffSNaoya Horiguchi 	 * make sure that the address range covered by this pmd is not
3919e66f17ffSNaoya Horiguchi 	 * unmapped by other threads.
3920e66f17ffSNaoya Horiguchi 	 */
3921e66f17ffSNaoya Horiguchi 	if (!pmd_huge(*pmd))
3922e66f17ffSNaoya Horiguchi 		goto out;
3923e66f17ffSNaoya Horiguchi 	if (pmd_present(*pmd)) {
392497534127SGerald Schaefer 		page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
3925e66f17ffSNaoya Horiguchi 		if (flags & FOLL_GET)
3926e66f17ffSNaoya Horiguchi 			get_page(page);
3927e66f17ffSNaoya Horiguchi 	} else {
3928e66f17ffSNaoya Horiguchi 		if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) {
3929e66f17ffSNaoya Horiguchi 			spin_unlock(ptl);
3930e66f17ffSNaoya Horiguchi 			__migration_entry_wait(mm, (pte_t *)pmd, ptl);
3931e66f17ffSNaoya Horiguchi 			goto retry;
3932e66f17ffSNaoya Horiguchi 		}
3933e66f17ffSNaoya Horiguchi 		/*
3934e66f17ffSNaoya Horiguchi 		 * hwpoisoned entry is treated as no_page_table in
3935e66f17ffSNaoya Horiguchi 		 * follow_page_mask().
3936e66f17ffSNaoya Horiguchi 		 */
3937e66f17ffSNaoya Horiguchi 	}
3938e66f17ffSNaoya Horiguchi out:
3939e66f17ffSNaoya Horiguchi 	spin_unlock(ptl);
39409e5fc74cSSteve Capper 	return page;
39419e5fc74cSSteve Capper }
39429e5fc74cSSteve Capper 
394361f77edaSNaoya Horiguchi struct page * __weak
39449e5fc74cSSteve Capper follow_huge_pud(struct mm_struct *mm, unsigned long address,
3945e66f17ffSNaoya Horiguchi 		pud_t *pud, int flags)
39469e5fc74cSSteve Capper {
3947e66f17ffSNaoya Horiguchi 	if (flags & FOLL_GET)
3948e66f17ffSNaoya Horiguchi 		return NULL;
39499e5fc74cSSteve Capper 
3950e66f17ffSNaoya Horiguchi 	return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
39519e5fc74cSSteve Capper }
39529e5fc74cSSteve Capper 
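/*
 * Unlike the pmd variant above, follow_huge_pud() takes no page-table lock,
 * so it cannot safely take a reference on the page and simply refuses
 * FOLL_GET requests by returning NULL.
 */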
3953d5bd9106SAndi Kleen #ifdef CONFIG_MEMORY_FAILURE
3954d5bd9106SAndi Kleen 
395593f70f90SNaoya Horiguchi /*
395693f70f90SNaoya Horiguchi  * This function is called from the memory failure code.
395793f70f90SNaoya Horiguchi  * It assumes the caller holds the page lock of the head page.
395893f70f90SNaoya Horiguchi  */
39596de2b1aaSNaoya Horiguchi int dequeue_hwpoisoned_huge_page(struct page *hpage)
396093f70f90SNaoya Horiguchi {
396193f70f90SNaoya Horiguchi 	struct hstate *h = page_hstate(hpage);
396293f70f90SNaoya Horiguchi 	int nid = page_to_nid(hpage);
39636de2b1aaSNaoya Horiguchi 	int ret = -EBUSY;
396493f70f90SNaoya Horiguchi 
396593f70f90SNaoya Horiguchi 	spin_lock(&hugetlb_lock);
39667e1f049eSNaoya Horiguchi 	/*
39677e1f049eSNaoya Horiguchi 	 * Just checking !page_huge_active is not enough, because that could be
39687e1f049eSNaoya Horiguchi 	 * an isolated/hwpoisoned hugepage (which has a refcount > 0).
39697e1f049eSNaoya Horiguchi 	 */
39707e1f049eSNaoya Horiguchi 	if (!page_huge_active(hpage) && !page_count(hpage)) {
397156f2fb14SNaoya Horiguchi 		/*
397256f2fb14SNaoya Horiguchi 		 * Hwpoisoned hugepage isn't linked to activelist or freelist,
397356f2fb14SNaoya Horiguchi 		 * but dangling hpage->lru can trigger list-debug warnings
397456f2fb14SNaoya Horiguchi 		 * (this happens when we call unpoison_memory() on it),
397556f2fb14SNaoya Horiguchi 		 * so let it point to itself with list_del_init().
397656f2fb14SNaoya Horiguchi 		 */
397756f2fb14SNaoya Horiguchi 		list_del_init(&hpage->lru);
39788c6c2ecbSNaoya Horiguchi 		set_page_refcounted(hpage);
397993f70f90SNaoya Horiguchi 		h->free_huge_pages--;
398093f70f90SNaoya Horiguchi 		h->free_huge_pages_node[nid]--;
39816de2b1aaSNaoya Horiguchi 		ret = 0;
398293f70f90SNaoya Horiguchi 	}
39836de2b1aaSNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
39846de2b1aaSNaoya Horiguchi 	return ret;
39856de2b1aaSNaoya Horiguchi }
39866de2b1aaSNaoya Horiguchi #endif
398731caf665SNaoya Horiguchi 
398831caf665SNaoya Horiguchi bool isolate_huge_page(struct page *page, struct list_head *list)
398931caf665SNaoya Horiguchi {
3990bcc54222SNaoya Horiguchi 	bool ret = true;
3991bcc54222SNaoya Horiguchi 
3992309381feSSasha Levin 	VM_BUG_ON_PAGE(!PageHead(page), page);
399331caf665SNaoya Horiguchi 	spin_lock(&hugetlb_lock);
3994bcc54222SNaoya Horiguchi 	if (!page_huge_active(page) || !get_page_unless_zero(page)) {
3995bcc54222SNaoya Horiguchi 		ret = false;
3996bcc54222SNaoya Horiguchi 		goto unlock;
3997bcc54222SNaoya Horiguchi 	}
3998bcc54222SNaoya Horiguchi 	clear_page_huge_active(page);
399931caf665SNaoya Horiguchi 	list_move_tail(&page->lru, list);
4000bcc54222SNaoya Horiguchi unlock:
400131caf665SNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
4002bcc54222SNaoya Horiguchi 	return ret;
400331caf665SNaoya Horiguchi }
400431caf665SNaoya Horiguchi 
400531caf665SNaoya Horiguchi void putback_active_hugepage(struct page *page)
400631caf665SNaoya Horiguchi {
4007309381feSSasha Levin 	VM_BUG_ON_PAGE(!PageHead(page), page);
400831caf665SNaoya Horiguchi 	spin_lock(&hugetlb_lock);
4009bcc54222SNaoya Horiguchi 	set_page_huge_active(page);
401031caf665SNaoya Horiguchi 	list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
401131caf665SNaoya Horiguchi 	spin_unlock(&hugetlb_lock);
401231caf665SNaoya Horiguchi 	put_page(page);
401331caf665SNaoya Horiguchi }
4014